Diffstat (limited to 'fs')
641 files changed, 12122 insertions, 124030 deletions
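The largest visible change below teaches kAFS to accept the RENAME_NOREPLACE and RENAME_EXCHANGE flags in afs_rename() and to issue the new YFSRENAME_NOREPLACE / YFSRENAME_EXCHANGE RPCs, mapping -ENOTSUPP from an older fileserver to -EINVAL. What follows is a minimal userspace sketch (not part of this commit) of the semantics these flags are expected to provide, using the standard renameat2(2) interface (glibc >= 2.28); the file names are hypothetical:

#define _GNU_SOURCE
#include <fcntl.h>		/* RENAME_* constants, AT_FDCWD */
#include <stdio.h>		/* renameat2() */
#include <errno.h>
#include <string.h>

int main(void)
{
	/* Fail with EEXIST instead of displacing an existing target.
	 * On a kAFS mount backed by a fileserver without the new RPC,
	 * this is expected to fail with EINVAL, per the
	 * -ENOTSUPP -> -EINVAL mapping in afs_rename() below.
	 */
	if (renameat2(AT_FDCWD, "old", AT_FDCWD, "target",
		      RENAME_NOREPLACE) == -1)
		fprintf(stderr, "noreplace: %s\n", strerror(errno));

	/* Atomically swap two directory entries; both must exist,
	 * otherwise the call fails with ENOENT.
	 */
	if (renameat2(AT_FDCWD, "old", AT_FDCWD, "other",
		      RENAME_EXCHANGE) == -1)
		fprintf(stderr, "exchange: %s\n", strerror(errno));

	return 0;
}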
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 795c6388744c..1581ebac5bb4 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -252,7 +252,7 @@ static int v9fs_drop_inode(struct inode *inode) v9ses = v9fs_inode2v9ses(inode); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) - return generic_drop_inode(inode); + return inode_generic_drop(inode); /* * in case of non cached mode always drop the * inode because we want the inode attribute diff --git a/fs/Kconfig b/fs/Kconfig index c654a3642897..7815379032da 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -51,7 +51,6 @@ source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" source "fs/f2fs/Kconfig" -source "fs/bcachefs/Kconfig" source "fs/zonefs/Kconfig" endif # BLOCK diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index bd2f530e5740..1949e25c7741 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -184,4 +184,13 @@ config EXEC_KUNIT_TEST This builds the exec KUnit tests, which tests boundary conditions of various aspects of the exec internals. +config ARCH_HAS_ELF_CORE_EFLAGS + bool + depends on BINFMT_ELF && ELF_CORE + default n + help + Select this option if the architecture makes use of the e_flags + field in the ELF header to store ABI or other architecture-specific + information that should be preserved in core dumps. + endmenu diff --git a/fs/Makefile b/fs/Makefile index 334654f9584b..e3523ab2e587 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -121,7 +121,6 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ obj-$(CONFIG_F2FS_FS) += f2fs/ -obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 69e1dd55b160..894d2bad6b6c 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -42,7 +42,7 @@ static void afs_volume_init_callback(struct afs_volume *volume) list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) { if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) { afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb); - queue_work(system_unbound_wq, &vnode->cb_work); + queue_work(system_dfl_wq, &vnode->cb_work); } } @@ -90,7 +90,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas if (reason != afs_cb_break_for_deleted && vnode->status.type == AFS_FTYPE_FILE && atomic_read(&vnode->cb_nr_mmap)) - queue_work(system_unbound_wq, &vnode->cb_work); + queue_work(system_dfl_wq, &vnode->cb_work); trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true); } else { diff --git a/fs/afs/dir.c b/fs/afs/dir.c index bfb69e066672..89d36e3e5c79 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1823,7 +1823,8 @@ error: static void afs_rename_success(struct afs_operation *op) { - struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry)); + struct afs_vnode *vnode = op->more_files[0].vnode; + struct afs_vnode *new_vnode = op->more_files[1].vnode; _enter("op=%08x", op->debug_id); @@ -1834,22 +1835,40 @@ static void afs_rename_success(struct afs_operation *op) op->ctime = op->file[1].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[1]); } + if (op->more_files[0].scb.have_status) + afs_vnode_commit_status(op, &op->more_files[0]); + if (op->more_files[1].scb.have_status) + afs_vnode_commit_status(op, &op->more_files[1]); /* If we're moving a subdir between dirs, we need to update * its DV counter too as the ".." will be altered. 
*/ - if (S_ISDIR(vnode->netfs.inode.i_mode) && - op->file[0].vnode != op->file[1].vnode) { - u64 new_dv; + if (op->file[0].vnode != op->file[1].vnode) { + if (S_ISDIR(vnode->netfs.inode.i_mode)) { + u64 new_dv; - write_seqlock(&vnode->cb_lock); + write_seqlock(&vnode->cb_lock); - new_dv = vnode->status.data_version + 1; - trace_afs_set_dv(vnode, new_dv); - vnode->status.data_version = new_dv; - inode_set_iversion_raw(&vnode->netfs.inode, new_dv); + new_dv = vnode->status.data_version + 1; + trace_afs_set_dv(vnode, new_dv); + vnode->status.data_version = new_dv; + inode_set_iversion_raw(&vnode->netfs.inode, new_dv); - write_sequnlock(&vnode->cb_lock); + write_sequnlock(&vnode->cb_lock); + } + + if ((op->rename.rename_flags & RENAME_EXCHANGE) && + S_ISDIR(new_vnode->netfs.inode.i_mode)) { + u64 new_dv; + + write_seqlock(&new_vnode->cb_lock); + + new_dv = new_vnode->status.data_version + 1; + new_vnode->status.data_version = new_dv; + inode_set_iversion_raw(&new_vnode->netfs.inode, new_dv); + + write_sequnlock(&new_vnode->cb_lock); + } } } @@ -1900,8 +1919,8 @@ static void afs_rename_edit_dir(struct afs_operation *op) if (S_ISDIR(vnode->netfs.inode.i_mode) && new_dvnode != orig_dvnode && test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - afs_edit_dir_update_dotdot(vnode, new_dvnode, - afs_edit_dir_for_rename_sub); + afs_edit_dir_update(vnode, &dotdot_name, new_dvnode, + afs_edit_dir_for_rename_sub); new_inode = d_inode(new_dentry); if (new_inode) { @@ -1915,9 +1934,6 @@ static void afs_rename_edit_dir(struct afs_operation *op) /* Now we can update d_fsdata on the dentries to reflect their * new parent's data_version. - * - * Note that if we ever implement RENAME_EXCHANGE, we'll have - * to update both dentries with opposing dir versions. */ afs_update_dentry_version(op, new_dvp, op->dentry); afs_update_dentry_version(op, new_dvp, op->dentry_2); @@ -1930,6 +1946,67 @@ static void afs_rename_edit_dir(struct afs_operation *op) fscache_end_operation(&new_cres); } +static void afs_rename_exchange_edit_dir(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode *orig_dvnode = orig_dvp->vnode; + struct afs_vnode *new_dvnode = new_dvp->vnode; + struct afs_vnode *old_vnode = op->more_files[0].vnode; + struct afs_vnode *new_vnode = op->more_files[1].vnode; + struct dentry *old_dentry = op->dentry; + struct dentry *new_dentry = op->dentry_2; + + _enter("op=%08x", op->debug_id); + + if (new_dvnode == orig_dvnode) { + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) { + afs_edit_dir_update(orig_dvnode, &old_dentry->d_name, + new_vnode, afs_edit_dir_for_rename_0); + afs_edit_dir_update(orig_dvnode, &new_dentry->d_name, + old_vnode, afs_edit_dir_for_rename_1); + } + + d_exchange(old_dentry, new_dentry); + up_write(&orig_dvnode->validate_lock); + } else { + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) + afs_edit_dir_update(orig_dvnode, &old_dentry->d_name, + new_vnode, afs_edit_dir_for_rename_0); + + up_write(&orig_dvnode->validate_lock); + down_write(&new_dvnode->validate_lock); + + if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) && + new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta) + afs_edit_dir_update(new_dvnode, 
&new_dentry->d_name, + old_vnode, afs_edit_dir_for_rename_1); + + if (S_ISDIR(old_vnode->netfs.inode.i_mode) && + test_bit(AFS_VNODE_DIR_VALID, &old_vnode->flags)) + afs_edit_dir_update(old_vnode, &dotdot_name, new_dvnode, + afs_edit_dir_for_rename_sub); + + if (S_ISDIR(new_vnode->netfs.inode.i_mode) && + test_bit(AFS_VNODE_DIR_VALID, &new_vnode->flags)) + afs_edit_dir_update(new_vnode, &dotdot_name, orig_dvnode, + afs_edit_dir_for_rename_sub); + + /* Now we can update d_fsdata on the dentries to reflect their + * new parents' data_version. + */ + afs_update_dentry_version(op, new_dvp, old_dentry); + afs_update_dentry_version(op, orig_dvp, new_dentry); + + d_exchange(old_dentry, new_dentry); + up_write(&new_dvnode->validate_lock); + } +} + static void afs_rename_put(struct afs_operation *op) { _enter("op=%08x", op->debug_id); @@ -1948,6 +2025,32 @@ static const struct afs_operation_ops afs_rename_operation = { .put = afs_rename_put, }; +#if 0 /* Autoswitched in yfs_fs_rename_replace(). */ +static const struct afs_operation_ops afs_rename_replace_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_replace, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; +#endif + +static const struct afs_operation_ops afs_rename_noreplace_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_noreplace, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; + +static const struct afs_operation_ops afs_rename_exchange_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_exchange, + .success = afs_rename_success, + .edit_dir = afs_rename_exchange_edit_dir, + .put = afs_rename_put, +}; + /* * rename a file in an AFS filesystem and/or move it between directories */ @@ -1956,10 +2059,10 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *new_dentry, unsigned int flags) { struct afs_operation *op; - struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; + struct afs_vnode *orig_dvnode, *new_dvnode, *vnode, *new_vnode = NULL; int ret; - if (flags) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; /* Don't allow silly-rename files be moved around. 
*/ @@ -1969,6 +2072,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, vnode = AFS_FS_I(d_inode(old_dentry)); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); + if (d_is_positive(new_dentry)) + new_vnode = AFS_FS_I(d_inode(new_dentry)); _enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}", orig_dvnode->fid.vid, orig_dvnode->fid.vnode, @@ -1989,6 +2094,11 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (ret < 0) goto error; + ret = -ENOMEM; + op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL); + if (!op->more_files) + goto error; + afs_op_set_vnode(op, 0, orig_dvnode); afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */ op->file[0].dv_delta = 1; @@ -1997,46 +2107,63 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; + op->more_files[0].vnode = vnode; + op->more_files[0].speculative = true; + op->more_files[1].vnode = new_vnode; + op->more_files[1].speculative = true; + op->nr_files = 4; op->dentry = old_dentry; op->dentry_2 = new_dentry; + op->rename.rename_flags = flags; op->rename.new_negative = d_is_negative(new_dentry); - op->ops = &afs_rename_operation; - /* For non-directories, check whether the target is busy and if so, - * make a copy of the dentry and then do a silly-rename. If the - * silly-rename succeeds, the copied dentry is hashed and becomes the - * new target. - */ - if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) { - /* To prevent any new references to the target during the - * rename, we unhash the dentry in advance. + if (flags & RENAME_NOREPLACE) { + op->ops = &afs_rename_noreplace_operation; + } else if (flags & RENAME_EXCHANGE) { + op->ops = &afs_rename_exchange_operation; + d_drop(new_dentry); + } else { + /* If we might displace the target, we might need to do silly + * rename. */ - if (!d_unhashed(new_dentry)) { - d_drop(new_dentry); - op->rename.rehash = new_dentry; - } + op->ops = &afs_rename_operation; - if (d_count(new_dentry) > 2) { - /* copy the target dentry's name */ - op->rename.tmp = d_alloc(new_dentry->d_parent, - &new_dentry->d_name); - if (!op->rename.tmp) { - afs_op_nomem(op); - goto error; + /* For non-directories, check whether the target is busy and if + * so, make a copy of the dentry and then do a silly-rename. + * If the silly-rename succeeds, the copied dentry is hashed + * and becomes the new target. + */ + if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) { + /* To prevent any new references to the target during + * the rename, we unhash the dentry in advance. 
+ */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + op->rename.rehash = new_dentry; } - ret = afs_sillyrename(new_dvnode, - AFS_FS_I(d_inode(new_dentry)), - new_dentry, op->key); - if (ret) { - afs_op_set_error(op, ret); - goto error; + if (d_count(new_dentry) > 2) { + /* copy the target dentry's name */ + op->rename.tmp = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!op->rename.tmp) { + afs_op_nomem(op); + goto error; + } + + ret = afs_sillyrename(new_dvnode, + AFS_FS_I(d_inode(new_dentry)), + new_dentry, op->key); + if (ret) { + afs_op_set_error(op, ret); + goto error; + } + + op->dentry_2 = op->rename.tmp; + op->rename.rehash = NULL; + op->rename.new_negative = true; } - - op->dentry_2 = op->rename.tmp; - op->rename.rehash = NULL; - op->rename.new_negative = true; } } @@ -2052,6 +2179,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, d_drop(old_dentry); ret = afs_do_sync_operation(op); + if (ret == -ENOTSUPP) + ret = -EINVAL; out: afs_dir_unuse_cookie(orig_dvnode, ret); if (new_dvnode != orig_dvnode) diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 60a549f1d9c5..4b1342c72089 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -522,11 +522,11 @@ error: } /* - * Edit a subdirectory that has been moved between directories to update the - * ".." entry. + * Edit an entry in a directory to update the vnode it refers to. This is also + * used to update the ".." entry in a directory. */ -void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, - enum afs_edit_dir_reason why) +void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why) { union afs_xdr_dir_block *block; union afs_xdr_dirent *de; @@ -557,7 +557,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) goto already_invalidated; - slot = afs_dir_scan_block(block, &dotdot_name, b); + slot = afs_dir_scan_block(block, name, b); if (slot >= 0) goto found_dirent; @@ -566,7 +566,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d /* Didn't find the dirent to clobber. Download the directory again. 
*/ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd); goto out; @@ -576,7 +576,7 @@ found_dirent: de->u.unique = htonl(new_dvnode->fid.unique); trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot, - ntohl(de->u.vnode), ntohl(de->u.unique), ".."); + ntohl(de->u.vnode), ntohl(de->u.unique), name->name); kunmap_local(block); netfs_single_mark_inode_dirty(&vnode->netfs.inode); @@ -589,12 +589,12 @@ out: already_invalidated: kunmap_local(block); trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); goto out; error: trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); goto out; } diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 0b80eb93fa40..014495d4b868 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -69,6 +69,12 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode if (IS_ERR(op)) return PTR_ERR(op); + op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL); + if (!op->more_files) { + afs_put_operation(op); + return -ENOMEM; + } + afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, dvnode); op->file[0].dv_delta = 1; @@ -77,6 +83,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; + op->more_files[0].vnode = AFS_FS_I(d_inode(old)); + op->more_files[0].speculative = true; + op->more_files[1].vnode = AFS_FS_I(d_inode(new)); + op->more_files[1].speculative = true; + op->nr_files = 4; op->dentry = old; op->dentry_2 = new; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e9538e91f848..e1cb17b85791 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -723,9 +723,9 @@ int afs_drop_inode(struct inode *inode) _enter(""); if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags)) - return generic_delete_inode(inode); + return inode_just_drop(inode); else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /* diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 1124ea4000cb..444a3ea4fdf6 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -562,6 +562,7 @@ struct afs_server { #define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ #define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */ #define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */ +#define AFS_SERVER_FL_NO_RENAME2 20 /* YFS Fileserver doesn't support enhanced rename */ refcount_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ @@ -891,9 +892,10 @@ struct afs_operation { bool need_rehash; } unlink; struct { - struct dentry *rehash; - struct dentry *tmp; - bool new_negative; + struct dentry *rehash; + struct dentry *tmp; + unsigned int rename_flags; + bool new_negative; } rename; struct { struct netfs_io_subrequest *subreq; @@ -1100,8 +1102,8 @@ int afs_single_writepages(struct address_space *mapping, extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); -void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, - enum afs_edit_dir_reason why); +void 
afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why); void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); /* @@ -1693,6 +1695,9 @@ extern void yfs_fs_remove_dir(struct afs_operation *); extern void yfs_fs_link(struct afs_operation *); extern void yfs_fs_symlink(struct afs_operation *); extern void yfs_fs_rename(struct afs_operation *); +void yfs_fs_rename_replace(struct afs_operation *op); +void yfs_fs_rename_noreplace(struct afs_operation *op); +void yfs_fs_rename_exchange(struct afs_operation *op); extern void yfs_fs_store_data(struct afs_operation *); extern void yfs_fs_setattr(struct afs_operation *); extern void yfs_fs_get_volume_status(struct afs_operation *); diff --git a/fs/afs/main.c b/fs/afs/main.c index 02475d415d88..e6bb8237db98 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -169,13 +169,13 @@ static int __init afs_init(void) printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); - afs_wq = alloc_workqueue("afs", 0, 0); + afs_wq = alloc_workqueue("afs", WQ_PERCPU, 0); if (!afs_wq) goto error_afs_wq; afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!afs_async_calls) goto error_async; - afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0); + afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!afs_lock_manager) goto error_lockmgr; diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 8f2b3a177690..c8a7f266080d 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -131,6 +131,7 @@ int afs_abort_to_error(u32 abort_code) case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG; case RXGEN_OPCODE: return -ENOTSUPP; + case RX_INVALID_OPERATION: return -ENOTSUPP; default: return -EREMOTEIO; } diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h index e4cd89c44c46..b2f06c1917c2 100644 --- a/fs/afs/protocol_yfs.h +++ b/fs/afs/protocol_yfs.h @@ -50,6 +50,9 @@ enum YFS_FS_Operations { YFSREMOVEACL = 64171, YFSREMOVEFILE2 = 64173, YFSSTOREOPAQUEACL2 = 64174, + YFSRENAME_REPLACE = 64176, + YFSRENAME_NOREPLACE = 64177, + YFSRENAME_EXCHANGE = 64187, YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */ YFSFETCHDATA64 = 64537, /* YFS Fetch file data */ YFSSTOREDATA64 = 64538, /* YFS Store file data */ diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index a1c24f589d9e..6a4e7da10fc4 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -432,6 +432,16 @@ bool afs_select_fileserver(struct afs_operation *op) afs_op_set_error(op, -EDQUOT); goto failed_but_online; + case RX_INVALID_OPERATION: + case RXGEN_OPCODE: + /* Handle downgrading to an older operation. 
*/ + afs_op_set_error(op, -ENOTSUPP); + if (op->flags & AFS_OPERATION_DOWNGRADE) { + op->flags &= ~AFS_OPERATION_DOWNGRADE; + goto go_again; + } + goto failed_but_online; + default: afs_op_accumulate_error(op, error, abort_code); failed_but_online: @@ -620,12 +630,13 @@ iterate_address: op->addr_index = addr_index; set_bit(addr_index, &op->addr_tried); - op->volsync.creation = TIME64_MIN; - op->volsync.update = TIME64_MIN; - op->call_responded = false; _debug("address [%u] %u/%u %pISp", op->server_index, addr_index, alist->nr_addrs, rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer)); +go_again: + op->volsync.creation = TIME64_MIN; + op->volsync.update = TIME64_MIN; + op->call_responded = false; _leave(" = t"); return true; diff --git a/fs/afs/server.c b/fs/afs/server.c index a97562f831eb..c4428ebddb1d 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -331,13 +331,14 @@ struct afs_server *afs_use_server(struct afs_server *server, bool activate, void afs_put_server(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason) { - unsigned int a, debug_id = server->debug_id; + unsigned int a, debug_id; bool zero; int r; if (!server) return; + debug_id = server->debug_id; a = atomic_read(&server->active); zero = __refcount_dec_and_test(&server->ref, &r); trace_afs_server(debug_id, r - 1, a, reason); diff --git a/fs/afs/write.c b/fs/afs/write.c index 2e7526ea883a..93ad86ff3345 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -172,7 +172,7 @@ static void afs_issue_write_worker(struct work_struct *work) void afs_issue_write(struct netfs_io_subrequest *subreq) { subreq->work.func = afs_issue_write_worker; - if (!queue_work(system_unbound_wq, &subreq->work)) + if (!queue_work(system_dfl_wq, &subreq->work)) WARN_ON_ONCE(1); } diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 257af259c04a..febf13a49f0b 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -1042,6 +1042,9 @@ void yfs_fs_rename(struct afs_operation *op) _enter(""); + if (!test_bit(AFS_SERVER_FL_NO_RENAME2, &op->server->flags)) + return yfs_fs_rename_replace(op); + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename, sizeof(__be32) + sizeof(struct yfs_xdr_RPCFlags) + @@ -1071,6 +1074,252 @@ void yfs_fs_rename(struct afs_operation *op) } /* + * Deliver reply data to a YFS.Rename_NoReplace operation. This does not + * return the status of a displaced target inode as there cannot be one. + */ +static int yfs_deliver_fs_rename_1(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +/* + * Deliver reply data to a YFS.Rename_Replace or a YFS.Rename_Exchange + * operation. These return the status of the displaced target inode if there + * was one. 
+ */ +static int yfs_deliver_fs_rename_2(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + struct afs_vnode_param *new_vp = &op->more_files[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSFid(&bp, &new_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &new_vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +static void yfs_done_fs_rename_replace(struct afs_call *call) +{ + if (call->error == -ECONNABORTED && + (call->abort_code == RX_INVALID_OPERATION || + call->abort_code == RXGEN_OPCODE)) { + set_bit(AFS_SERVER_FL_NO_RENAME2, &call->op->server->flags); + call->op->flags |= AFS_OPERATION_DOWNGRADE; + } +} + +/* + * YFS.Rename_Replace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Replace = { + .name = "FS.Rename_Replace", + .op = yfs_FS_Rename_Replace, + .deliver = yfs_deliver_fs_rename_2, + .done = yfs_done_fs_rename_replace, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_NoReplace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_NoReplace = { + .name = "FS.Rename_NoReplace", + .op = yfs_FS_Rename_NoReplace, + .deliver = yfs_deliver_fs_rename_1, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_Exchange operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Exchange = { + .name = "FS.Rename_Exchange", + .op = yfs_FS_Rename_Exchange, + .deliver = yfs_deliver_fs_rename_2, + .destructor = afs_flat_call_destructor, +}; + +/* + * Rename a file or directory, replacing the target if it exists. The status + * of a displaced target is returned. + */ +void yfs_fs_rename_replace(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Replace, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. 
*/ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_REPLACE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Rename a file or directory, failing if the target dirent exists. + */ +void yfs_fs_rename_noreplace(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_NoReplace, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_NOREPLACE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Exchange a pair of files directories. + */ +void yfs_fs_rename_exchange(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Exchange, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_EXCHANGE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* * YFS.StoreData64 operation type. 
*/ static const struct afs_call_type yfs_RXYFSStoreData64 = { @@ -636,7 +636,7 @@ static void free_ioctx_reqs(struct percpu_ref *ref) /* Synchronize against RCU protected table->table[] dereferences */ INIT_RCU_WORK(&ctx->free_rwork, free_ioctx); - queue_rcu_work(system_wq, &ctx->free_rwork); + queue_rcu_work(system_percpu_wq, &ctx->free_rwork); } /* diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig deleted file mode 100644 index 8cb2b9d5da96..000000000000 --- a/fs/bcachefs/Kconfig +++ /dev/null @@ -1,121 +0,0 @@ - -config BCACHEFS_FS - tristate "bcachefs filesystem support (EXPERIMENTAL)" - depends on BLOCK - select EXPORTFS - select CLOSURES - select CRC32 - select CRC64 - select FS_POSIX_ACL - select LZ4_COMPRESS - select LZ4_DECOMPRESS - select LZ4HC_COMPRESS - select LZ4HC_DECOMPRESS - select ZLIB_DEFLATE - select ZLIB_INFLATE - select ZSTD_COMPRESS - select ZSTD_DECOMPRESS - select CRYPTO_LIB_SHA256 - select CRYPTO_LIB_CHACHA - select CRYPTO_LIB_POLY1305 - select KEYS - select RAID6_PQ - select XOR_BLOCKS - select XXHASH - select SRCU - select SYMBOLIC_ERRNAME - select MIN_HEAP - select XARRAY_MULTI - help - The bcachefs filesystem - a modern, copy on write filesystem, with - support for multiple devices, compression, checksumming, etc. - -config BCACHEFS_QUOTA - bool "bcachefs quota support" - depends on BCACHEFS_FS - select QUOTACTL - -config BCACHEFS_ERASURE_CODING - bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)" - depends on BCACHEFS_FS - select QUOTACTL - help - This enables the "erasure_code" filesysystem and inode option, which - organizes data into reed-solomon stripes instead of ordinary - replication. - - WARNING: this feature is still undergoing on disk format changes, and - should only be enabled for testing purposes. - -config BCACHEFS_POSIX_ACL - bool "bcachefs POSIX ACL support" - depends on BCACHEFS_FS - select FS_POSIX_ACL - -config BCACHEFS_DEBUG - bool "bcachefs debugging" - depends on BCACHEFS_FS - help - Enables many extra debugging checks and assertions. - - The resulting code will be significantly slower than normal; you - probably shouldn't select this option unless you're a developer. - -config BCACHEFS_INJECT_TRANSACTION_RESTARTS - bool "Randomly inject transaction restarts" - depends on BCACHEFS_DEBUG - help - Randomly inject transaction restarts in a few core paths - may have a - significant performance penalty - -config BCACHEFS_TESTS - bool "bcachefs unit and performance tests" - depends on BCACHEFS_FS - help - Include some unit and performance tests for the core btree code - -config BCACHEFS_LOCK_TIME_STATS - bool "bcachefs lock time statistics" - depends on BCACHEFS_FS - help - Expose statistics for how long we held a lock in debugfs - -config BCACHEFS_NO_LATENCY_ACCT - bool "disable latency accounting and time stats" - depends on BCACHEFS_FS - help - This disables device latency tracking and time stats, only for performance testing - -config BCACHEFS_SIX_OPTIMISTIC_SPIN - bool "Optimistic spinning for six locks" - depends on BCACHEFS_FS - depends on SMP - default y - help - Instead of immediately sleeping when attempting to take a six lock that - is held by another thread, spin for a short while, as long as the - thread owning the lock is running. - -config BCACHEFS_PATH_TRACEPOINTS - bool "Extra btree_path tracepoints" - depends on BCACHEFS_FS && TRACING - help - Enable extra tracepoints for debugging btree_path operations; we don't - normally want these enabled because they happen at very high rates. 
- -config BCACHEFS_TRANS_KMALLOC_TRACE - bool "Trace bch2_trans_kmalloc() calls" - depends on BCACHEFS_FS - -config BCACHEFS_ASYNC_OBJECT_LISTS - bool "Keep async objects on fast_lists for debugfs visibility" - depends on BCACHEFS_FS && DEBUG_FS - -config MEAN_AND_VARIANCE_UNIT_TEST - tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS - depends on KUNIT - depends on BCACHEFS_FS - default KUNIT_ALL_TESTS - help - This option enables the kunit tests for mean_and_variance module. - If unsure, say N. diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile deleted file mode 100644 index 93c8ee5425c8..000000000000 --- a/fs/bcachefs/Makefile +++ /dev/null @@ -1,107 +0,0 @@ - -obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o - -bcachefs-y := \ - acl.o \ - alloc_background.o \ - alloc_foreground.o \ - backpointers.o \ - bkey.o \ - bkey_methods.o \ - bkey_sort.o \ - bset.o \ - btree_cache.o \ - btree_gc.o \ - btree_io.o \ - btree_iter.o \ - btree_journal_iter.o \ - btree_key_cache.o \ - btree_locking.o \ - btree_node_scan.o \ - btree_trans_commit.o \ - btree_update.o \ - btree_update_interior.o \ - btree_write_buffer.o \ - buckets.o \ - buckets_waiting_for_journal.o \ - chardev.o \ - checksum.o \ - clock.o \ - compress.o \ - darray.o \ - data_update.o \ - debug.o \ - dirent.o \ - disk_accounting.o \ - disk_groups.o \ - ec.o \ - enumerated_ref.o \ - errcode.o \ - error.o \ - extents.o \ - extent_update.o \ - eytzinger.o \ - fast_list.o \ - fs.o \ - fs-ioctl.o \ - fs-io.o \ - fs-io-buffered.o \ - fs-io-direct.o \ - fs-io-pagecache.o \ - fsck.o \ - inode.o \ - io_read.o \ - io_misc.o \ - io_write.o \ - journal.o \ - journal_io.o \ - journal_reclaim.o \ - journal_sb.o \ - journal_seq_blacklist.o \ - keylist.o \ - logged_ops.o \ - lru.o \ - mean_and_variance.o \ - migrate.o \ - move.o \ - movinggc.o \ - namei.o \ - nocow_locking.o \ - opts.o \ - printbuf.o \ - progress.o \ - quota.o \ - rebalance.o \ - rcu_pending.o \ - recovery.o \ - recovery_passes.o \ - reflink.o \ - replicas.o \ - sb-clean.o \ - sb-counters.o \ - sb-downgrade.o \ - sb-errors.o \ - sb-members.o \ - siphash.o \ - six.o \ - snapshot.o \ - str_hash.o \ - subvolume.o \ - super.o \ - super-io.o \ - sysfs.o \ - tests.o \ - time_stats.o \ - thread_with_file.o \ - trace.o \ - two_state_shared_lock.o \ - util.o \ - varint.o \ - xattr.o - -bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o - -obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o - -# Silence "note: xyz changed in GCC X.X" messages -subdir-ccflags-y += $(call cc-disable-warning, psabi) diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c deleted file mode 100644 index d03adc36100e..000000000000 --- a/fs/bcachefs/acl.c +++ /dev/null @@ -1,445 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" - -#include "acl.h" -#include "xattr.h" - -#include <linux/posix_acl.h> - -static const char * const acl_types[] = { - [ACL_USER_OBJ] = "user_obj", - [ACL_USER] = "user", - [ACL_GROUP_OBJ] = "group_obj", - [ACL_GROUP] = "group", - [ACL_MASK] = "mask", - [ACL_OTHER] = "other", - NULL, -}; - -void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size) -{ - const void *p, *end = value + size; - - if (!value || - size < sizeof(bch_acl_header) || - ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION)) - return; - - p = value + sizeof(bch_acl_header); - while (p < end) { - const bch_acl_entry *in = p; - unsigned tag = le16_to_cpu(in->e_tag); - - prt_str(out, acl_types[tag]); - - switch (tag) { - case ACL_USER_OBJ: - case 
ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - p += sizeof(bch_acl_entry_short); - break; - case ACL_USER: - prt_printf(out, " uid %u", le32_to_cpu(in->e_id)); - p += sizeof(bch_acl_entry); - break; - case ACL_GROUP: - prt_printf(out, " gid %u", le32_to_cpu(in->e_id)); - p += sizeof(bch_acl_entry); - break; - } - - prt_printf(out, " %o", le16_to_cpu(in->e_perm)); - - if (p != end) - prt_char(out, ' '); - } -} - -#ifdef CONFIG_BCACHEFS_POSIX_ACL - -#include "fs.h" - -#include <linux/fs.h> -#include <linux/posix_acl_xattr.h> -#include <linux/sched.h> -#include <linux/slab.h> - -static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) -{ - return sizeof(bch_acl_header) + - sizeof(bch_acl_entry_short) * nr_short + - sizeof(bch_acl_entry) * nr_long; -} - -static inline int acl_to_xattr_type(int type) -{ - switch (type) { - case ACL_TYPE_ACCESS: - return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; - case ACL_TYPE_DEFAULT: - return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; - default: - BUG(); - } -} - -/* - * Convert from filesystem to in-memory representation. - */ -static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, - const void *value, size_t size) -{ - const void *p, *end = value + size; - struct posix_acl *acl; - struct posix_acl_entry *out; - unsigned count = 0; - int ret; - - if (!value) - return NULL; - if (size < sizeof(bch_acl_header)) - goto invalid; - if (((bch_acl_header *)value)->a_version != - cpu_to_le32(BCH_ACL_VERSION)) - goto invalid; - - p = value + sizeof(bch_acl_header); - while (p < end) { - const bch_acl_entry *entry = p; - - if (p + sizeof(bch_acl_entry_short) > end) - goto invalid; - - switch (le16_to_cpu(entry->e_tag)) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - p += sizeof(bch_acl_entry_short); - break; - case ACL_USER: - case ACL_GROUP: - p += sizeof(bch_acl_entry); - break; - default: - goto invalid; - } - - count++; - } - - if (p > end) - goto invalid; - - if (!count) - return NULL; - - acl = allocate_dropping_locks(trans, ret, - posix_acl_alloc(count, _gfp)); - if (!acl) - return ERR_PTR(-ENOMEM); - if (ret) { - kfree(acl); - return ERR_PTR(ret); - } - - out = acl->a_entries; - - p = value + sizeof(bch_acl_header); - while (p < end) { - const bch_acl_entry *in = p; - - out->e_tag = le16_to_cpu(in->e_tag); - out->e_perm = le16_to_cpu(in->e_perm); - - switch (out->e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - p += sizeof(bch_acl_entry_short); - break; - case ACL_USER: - out->e_uid = make_kuid(&init_user_ns, - le32_to_cpu(in->e_id)); - p += sizeof(bch_acl_entry); - break; - case ACL_GROUP: - out->e_gid = make_kgid(&init_user_ns, - le32_to_cpu(in->e_id)); - p += sizeof(bch_acl_entry); - break; - } - - out++; - } - - BUG_ON(out != acl->a_entries + acl->a_count); - - return acl; -invalid: - pr_err("invalid acl entry"); - return ERR_PTR(-EINVAL); -} - -/* - * Convert from in-memory to filesystem representation. 
- */ -static struct bkey_i_xattr * -bch2_acl_to_xattr(struct btree_trans *trans, - const struct posix_acl *acl, - int type) -{ - struct bkey_i_xattr *xattr; - bch_acl_header *acl_header; - const struct posix_acl_entry *acl_e, *pe; - void *outptr; - unsigned nr_short = 0, nr_long = 0, acl_len, u64s; - - FOREACH_ACL_ENTRY(acl_e, acl, pe) { - switch (acl_e->e_tag) { - case ACL_USER: - case ACL_GROUP: - nr_long++; - break; - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - nr_short++; - break; - default: - return ERR_PTR(-EINVAL); - } - } - - acl_len = bch2_acl_size(nr_short, nr_long); - u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); - - if (u64s > U8_MAX) - return ERR_PTR(-E2BIG); - - xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); - if (IS_ERR(xattr)) - return xattr; - - bkey_xattr_init(&xattr->k_i); - xattr->k.u64s = u64s; - xattr->v.x_type = acl_to_xattr_type(type); - xattr->v.x_name_len = 0; - xattr->v.x_val_len = cpu_to_le16(acl_len); - - acl_header = xattr_val(&xattr->v); - acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); - - outptr = (void *) acl_header + sizeof(*acl_header); - - FOREACH_ACL_ENTRY(acl_e, acl, pe) { - bch_acl_entry *entry = outptr; - - entry->e_tag = cpu_to_le16(acl_e->e_tag); - entry->e_perm = cpu_to_le16(acl_e->e_perm); - switch (acl_e->e_tag) { - case ACL_USER: - entry->e_id = cpu_to_le32( - from_kuid(&init_user_ns, acl_e->e_uid)); - outptr += sizeof(bch_acl_entry); - break; - case ACL_GROUP: - entry->e_id = cpu_to_le32( - from_kgid(&init_user_ns, acl_e->e_gid)); - outptr += sizeof(bch_acl_entry); - break; - - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - outptr += sizeof(bch_acl_entry_short); - break; - } - } - - BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); - - return xattr; -} - -struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_iter iter = {}; - struct posix_acl *acl = NULL; - - if (rcu) - return ERR_PTR(-ECHILD); - - struct btree_trans *trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash, inode_inum(inode), &search, 0); - int ret = bkey_err(k); - if (ret) - goto err; - - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); - ret = PTR_ERR_OR_ZERO(acl); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (ret) - acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL; - - if (!IS_ERR_OR_NULL(acl)) - set_cached_acl(&inode->v, type, acl); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return acl; -} - -int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode_u, - struct posix_acl *acl, int type) -{ - struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); - int ret; - - if (type == ACL_TYPE_DEFAULT && - !S_ISDIR(inode_u->bi_mode)) - return acl ? 
-EACCES : 0; - - if (acl) { - struct bkey_i_xattr *xattr = - bch2_acl_to_xattr(trans, acl, type); - if (IS_ERR(xattr)) - return PTR_ERR(xattr); - - ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, - inum, &xattr->k_i, 0); - } else { - struct xattr_search_key search = - X_SEARCH(acl_to_xattr_type(type), "", 0); - - ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, - inum, &search); - } - - return bch2_err_matches(ret, ENOENT) ? 0 : ret; -} - -int bch2_set_acl(struct mnt_idmap *idmap, - struct dentry *dentry, - struct posix_acl *_acl, int type) -{ - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_iter inode_iter = {}; - struct bch_inode_unpacked inode_u; - struct posix_acl *acl; - umode_t mode; - int ret; - - mutex_lock(&inode->ei_update_lock); - struct btree_trans *trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - acl = _acl; - - ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?: - bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_intent); - if (ret) - goto btree_err; - - mode = inode_u.bi_mode; - - if (type == ACL_TYPE_ACCESS) { - ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl); - if (ret) - goto btree_err; - } - - ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type); - if (ret) - goto btree_err; - - inode_u.bi_ctime = bch2_current_time(c); - inode_u.bi_mode = mode; - - ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, 0); -btree_err: - bch2_trans_iter_exit(trans, &inode_iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (unlikely(ret)) - goto err; - - bch2_inode_update_after_write(trans, inode, &inode_u, - ATTR_CTIME|ATTR_MODE); - - set_cached_acl(&inode->v, type, acl); -err: - bch2_trans_put(trans); - mutex_unlock(&inode->ei_update_lock); - - return ret; -} - -int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode, - umode_t mode, - struct posix_acl **new_acl) -{ - struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); - struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); - struct btree_iter iter; - struct posix_acl *acl = NULL; - - struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash_info, inum, &search, BTREE_ITER_intent); - int ret = bkey_err(k); - if (ret) - return bch2_err_matches(ret, ENOENT) ? 
0 : ret; - - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - - acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); - ret = PTR_ERR_OR_ZERO(acl); - if (ret) - goto err; - - ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode)); - if (ret) - goto err; - - struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - new->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, &new->k_i, 0); - *new_acl = acl; - acl = NULL; -err: - bch2_trans_iter_exit(trans, &iter); - if (!IS_ERR_OR_NULL(acl)) - kfree(acl); - return ret; -} - -#endif /* CONFIG_BCACHEFS_POSIX_ACL */ diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h deleted file mode 100644 index fe730a6bf0c1..000000000000 --- a/fs/bcachefs/acl.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ACL_H -#define _BCACHEFS_ACL_H - -struct bch_inode_unpacked; -struct bch_hash_info; -struct bch_inode_info; -struct posix_acl; - -#define BCH_ACL_VERSION 0x0001 - -typedef struct { - __le16 e_tag; - __le16 e_perm; - __le32 e_id; -} bch_acl_entry; - -typedef struct { - __le16 e_tag; - __le16 e_perm; -} bch_acl_entry_short; - -typedef struct { - __le32 a_version; -} bch_acl_header; - -void bch2_acl_to_text(struct printbuf *, const void *, size_t); - -#ifdef CONFIG_BCACHEFS_POSIX_ACL - -struct posix_acl *bch2_get_acl(struct inode *, int, bool); - -int bch2_set_acl_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, - struct posix_acl *, int); -int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); -int bch2_acl_chmod(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, - umode_t, struct posix_acl **); - -#else - -static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode_u, - struct posix_acl *acl, int type) -{ - return 0; -} - -static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode, - umode_t mode, - struct posix_acl **new_acl) -{ - return 0; -} - -#endif /* CONFIG_BCACHEFS_POSIX_ACL */ - -#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c deleted file mode 100644 index 66de46318620..000000000000 --- a/fs/bcachefs/alloc_background.c +++ /dev/null @@ -1,2680 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_gc.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "buckets_waiting_for_journal.h" -#include "clock.h" -#include "debug.h" -#include "disk_accounting.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "lru.h" -#include "recovery.h" -#include "varint.h" - -#include <linux/kthread.h> -#include <linux/math64.h> -#include <linux/random.h> -#include <linux/rculist.h> -#include <linux/rcupdate.h> -#include <linux/sched/task.h> -#include <linux/sort.h> -#include <linux/jiffies.h> - -static void bch2_discard_one_bucket_fast(struct bch_dev *, u64); - -/* Persistent alloc info: */ - -static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { -#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, - BCH_ALLOC_FIELDS_V1() -#undef x 
-}; - -struct bkey_alloc_unpacked { - u64 journal_seq; - u8 gen; - u8 oldest_gen; - u8 data_type; - bool need_discard:1; - bool need_inc_gen:1; -#define x(_name, _bits) u##_bits _name; - BCH_ALLOC_FIELDS_V2() -#undef x -}; - -static inline u64 alloc_field_v1_get(const struct bch_alloc *a, - const void **p, unsigned field) -{ - unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; - u64 v; - - if (!(a->fields & (1 << field))) - return 0; - - switch (bytes) { - case 1: - v = *((const u8 *) *p); - break; - case 2: - v = le16_to_cpup(*p); - break; - case 4: - v = le32_to_cpup(*p); - break; - case 8: - v = le64_to_cpup(*p); - break; - default: - BUG(); - } - - *p += bytes; - return v; -} - -static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, - struct bkey_s_c k) -{ - const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; - const void *d = in->data; - unsigned idx = 0; - - out->gen = in->gen; - -#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); - BCH_ALLOC_FIELDS_V1() -#undef x -} - -static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, - struct bkey_s_c k) -{ - struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); - const u8 *in = a.v->data; - const u8 *end = bkey_val_end(a); - unsigned fieldnr = 0; - int ret; - u64 v; - - out->gen = a.v->gen; - out->oldest_gen = a.v->oldest_gen; - out->data_type = a.v->data_type; - -#define x(_name, _bits) \ - if (fieldnr < a.v->nr_fields) { \ - ret = bch2_varint_decode_fast(in, end, &v); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - } else { \ - v = 0; \ - } \ - out->_name = v; \ - if (v != out->_name) \ - return -1; \ - fieldnr++; - - BCH_ALLOC_FIELDS_V2() -#undef x - return 0; -} - -static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, - struct bkey_s_c k) -{ - struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); - const u8 *in = a.v->data; - const u8 *end = bkey_val_end(a); - unsigned fieldnr = 0; - int ret; - u64 v; - - out->gen = a.v->gen; - out->oldest_gen = a.v->oldest_gen; - out->data_type = a.v->data_type; - out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); - out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); - out->journal_seq = le64_to_cpu(a.v->journal_seq); - -#define x(_name, _bits) \ - if (fieldnr < a.v->nr_fields) { \ - ret = bch2_varint_decode_fast(in, end, &v); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - } else { \ - v = 0; \ - } \ - out->_name = v; \ - if (v != out->_name) \ - return -1; \ - fieldnr++; - - BCH_ALLOC_FIELDS_V2() -#undef x - return 0; -} - -static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) -{ - struct bkey_alloc_unpacked ret = { .gen = 0 }; - - switch (k.k->type) { - case KEY_TYPE_alloc: - bch2_alloc_unpack_v1(&ret, k); - break; - case KEY_TYPE_alloc_v2: - bch2_alloc_unpack_v2(&ret, k); - break; - case KEY_TYPE_alloc_v3: - bch2_alloc_unpack_v3(&ret, k); - break; - } - - return ret; -} - -static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) -{ - unsigned i, bytes = offsetof(struct bch_alloc, data); - - for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) - if (a->fields & (1 << i)) - bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; - - return DIV_ROUND_UP(bytes, sizeof(u64)); -} - -int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - int ret = 0; - - /* allow for unknown fields */ - bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), - c, alloc_v1_val_size_bad, - "incorrect value size (%zu < %u)", - 
bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); -fsck_err: - return ret; -} - -int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_alloc_unpacked u; - int ret = 0; - - bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), - c, alloc_v2_unpack_error, - "unpack error"); -fsck_err: - return ret; -} - -int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_alloc_unpacked u; - int ret = 0; - - bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), - c, alloc_v3_unpack_error, - "unpack error"); -fsck_err: - return ret; -} - -int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bch_alloc_v4 a; - int ret = 0; - - bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k)); - - bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k), - c, alloc_v4_val_size_bad, - "bad val size (%u > %zu)", - alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k)); - - bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) && - BCH_ALLOC_V4_NR_BACKPOINTERS(&a), - c, alloc_v4_backpointers_start_bad, - "invalid backpointers_start"); - - bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type, - c, alloc_key_data_type_bad, - "invalid data type (got %u should be %u)", - a.data_type, alloc_data_type(a, a.data_type)); - - for (unsigned i = 0; i < 2; i++) - bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX, - c, alloc_key_io_time_bad, - "invalid io_time[%s]: %llu, max %llu", - i == READ ? "read" : "write", - a.io_time[i], LRU_TIME_MAX); - - unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) > - offsetof(struct bch_alloc_v4, stripe_sectors) - ? a.stripe_sectors - : 0; - - switch (a.data_type) { - case BCH_DATA_free: - case BCH_DATA_need_gc_gens: - case BCH_DATA_need_discard: - bkey_fsck_err_on(stripe_sectors || - a.dirty_sectors || - a.cached_sectors || - a.stripe, - c, alloc_key_empty_but_have_data, - "empty data type free but have data %u.%u.%u %u", - stripe_sectors, - a.dirty_sectors, - a.cached_sectors, - a.stripe); - break; - case BCH_DATA_sb: - case BCH_DATA_journal: - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - bkey_fsck_err_on(!a.dirty_sectors && - !stripe_sectors, - c, alloc_key_dirty_sectors_0, - "data_type %s but dirty_sectors==0", - bch2_data_type_str(a.data_type)); - break; - case BCH_DATA_cached: - bkey_fsck_err_on(!a.cached_sectors || - a.dirty_sectors || - stripe_sectors || - a.stripe, - c, alloc_key_cached_inconsistency, - "data type inconsistency"); - - bkey_fsck_err_on(!a.io_time[READ] && - !(c->recovery.passes_to_run & - BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)), - c, alloc_key_cached_but_read_time_zero, - "cached bucket with read_time == 0"); - break; - case BCH_DATA_stripe: - break; - } -fsck_err: - return ret; -} - -void bch2_alloc_v4_swab(struct bkey_s k) -{ - struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; - - a->journal_seq_nonempty = swab64(a->journal_seq_nonempty); - a->journal_seq_empty = swab64(a->journal_seq_empty); - a->flags = swab32(a->flags); - a->dirty_sectors = swab32(a->dirty_sectors); - a->cached_sectors = swab32(a->cached_sectors); - a->io_time[0] = swab64(a->io_time[0]); - a->io_time[1] = swab64(a->io_time[1]); - a->stripe = swab32(a->stripe); - a->nr_external_backpointers = swab32(a->nr_external_backpointers); - a->stripe_sectors = swab32(a->stripe_sectors); -} - -static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, - unsigned dev, const 
struct bch_alloc_v4 *a) -{ - struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, dev) : NULL; - - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); - bch2_prt_data_type(out, a->data_type); - prt_newline(out); - prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty); - prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); - prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); - prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); - prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); - prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); - prt_printf(out, "cached_sectors %u\n", a->cached_sectors); - prt_printf(out, "stripe %u\n", a->stripe); - prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); - prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); - prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); - - if (ca) - prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca)); - prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a)); - printbuf_indent_sub(out, 2); - - bch2_dev_put(ca); -} - -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - - __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a); -} - -void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v); -} - -void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) -{ - if (k.k->type == KEY_TYPE_alloc_v4) { - void *src, *dst; - - *out = *bkey_s_c_to_alloc_v4(k).v; - - src = alloc_v4_backpointers(out); - SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); - dst = alloc_v4_backpointers(out); - - if (src < dst) - memset(src, 0, dst - src); - - SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); - } else { - struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - - *out = (struct bch_alloc_v4) { - .journal_seq_nonempty = u.journal_seq, - .flags = u.need_discard, - .gen = u.gen, - .oldest_gen = u.oldest_gen, - .data_type = u.data_type, - .stripe_redundancy = u.stripe_redundancy, - .dirty_sectors = u.dirty_sectors, - .cached_sectors = u.cached_sectors, - .io_time[READ] = u.read_time, - .io_time[WRITE] = u.write_time, - .stripe = u.stripe, - }; - - SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); - } -} - -static noinline struct bkey_i_alloc_v4 * -__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bkey_i_alloc_v4 *ret; - - ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4))); - if (IS_ERR(ret)) - return ret; - - if (k.k->type == KEY_TYPE_alloc_v4) { - void *src, *dst; - - bkey_reassemble(&ret->k_i, k); - - src = alloc_v4_backpointers(&ret->v); - SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); - dst = alloc_v4_backpointers(&ret->v); - - if (src < dst) - memset(src, 0, dst - src); - - SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); - set_alloc_v4_u64s(ret); - } else { - bkey_alloc_v4_init(&ret->k_i); - ret->k.p = k.k->p; - bch2_alloc_to_v4(k, &ret->v); - } - return ret; -} - -static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bkey_s_c_alloc_v4 a; - - if (likely(k.k->type == KEY_TYPE_alloc_v4) && - ((a = bkey_s_c_to_alloc_v4(k), true) && - 
BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) - return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4); - - return __bch2_alloc_to_v4_mut(trans, k); -} - -struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) -{ - return bch2_alloc_to_v4_mut_inlined(trans, k); -} - -struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos) -{ - struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, - BTREE_ITER_with_updates| - BTREE_ITER_cached| - BTREE_ITER_intent); - int ret = bkey_err(k); - if (unlikely(ret)) - return ERR_PTR(ret); - - struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (unlikely(ret)) - goto err; - return a; -err: - bch2_trans_iter_exit(trans, iter); - return ERR_PTR(ret); -} - -__flatten -struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos, - enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos, - BTREE_ITER_with_updates| - BTREE_ITER_cached| - BTREE_ITER_intent); - int ret = bkey_err(k); - if (unlikely(ret)) - return ERR_PTR(ret); - - if ((void *) k.v >= trans->mem && - (void *) k.v < trans->mem + trans->mem_top) { - bch2_trans_iter_exit(trans, &iter); - return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v); - } - - struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); - if (IS_ERR(a)) { - bch2_trans_iter_exit(trans, &iter); - return a; - } - - ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_); - bch2_trans_iter_exit(trans, &iter); - return unlikely(ret) ? ERR_PTR(ret) : a; -} - -static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) -{ - *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; - - pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS; - return pos; -} - -static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset) -{ - pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS; - pos.offset += offset; - return pos; -} - -static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) -{ - return k.k->type == KEY_TYPE_bucket_gens - ? 
bkey_s_c_to_bucket_gens(k).v->gens[offset] - : 0; -} - -int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), - c, bucket_gens_val_size_bad, - "bad val size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); -fsck_err: - return ret; -} - -void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k); - unsigned i; - - for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) { - if (i) - prt_char(out, ' '); - prt_printf(out, "%u", g.v->gens[i]); - } -} - -int bch2_bucket_gens_init(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bkey_i_bucket_gens g; - bool have_bucket_gens_key = false; - int ret; - - ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_prefetch, k, ({ - /* - * Not a fsck error because this is checked/repaired by - * bch2_check_alloc_key() which runs later: - */ - if (!bch2_dev_bucket_exists(c, k.k->p)) - continue; - - struct bch_alloc_v4 a; - u8 gen = bch2_alloc_to_v4(k, &a)->gen; - unsigned offset; - struct bpos pos = alloc_gens_pos(iter.pos, &offset); - int ret2 = 0; - - if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) { - ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret2) - goto iter_err; - have_bucket_gens_key = false; - } - - if (!have_bucket_gens_key) { - bkey_bucket_gens_init(&g.k_i); - g.k.p = pos; - have_bucket_gens_key = true; - } - - g.v.gens[offset] = gen; -iter_err: - ret2; - })); - - if (have_bucket_gens_key && !ret) - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); - - bch2_trans_put(trans); - - bch_err_fn(c, ret); - return ret; -} - -int bch2_alloc_read(struct bch_fs *c) -{ - down_read(&c->state_lock); - - struct btree_trans *trans = bch2_trans_get(c); - struct bch_dev *ca = NULL; - int ret; - - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { - ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_prefetch, k, ({ - u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; - u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; - - if (k.k->type != KEY_TYPE_bucket_gens) - continue; - - ca = bch2_dev_iterate(c, ca, k.k->p.inode); - /* - * Not a fsck error because this is checked/repaired by - * bch2_check_alloc_key() which runs later: - */ - if (!ca) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - - const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; - - for (u64 b = max_t(u64, ca->mi.first_bucket, start); - b < min_t(u64, ca->mi.nbuckets, end); - b++) - *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; - 0; - })); - } else { - ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_prefetch, k, ({ - ca = bch2_dev_iterate(c, ca, k.k->p.inode); - /* - * Not a fsck error because this is checked/repaired by - * bch2_check_alloc_key() which runs later: - */ - if (!ca) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - - if (k.k->p.offset < ca->mi.first_bucket) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket)); - continue; - } - - if (k.k->p.offset >= 
ca->mi.nbuckets) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - - struct bch_alloc_v4 a; - *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; - 0; - })); - } - - bch2_dev_put(ca); - bch2_trans_put(trans); - - up_read(&c->state_lock); - bch_err_fn(c, ret); - return ret; -} - -/* Free space/discard btree: */ - -static int __need_discard_or_freespace_err(struct btree_trans *trans, - struct bkey_s_c alloc_k, - bool set, bool discard, bool repair) -{ - struct bch_fs *c = trans->c; - enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0); - enum bch_sb_error_id err_id = discard - ? BCH_FSCK_ERR_need_discard_key_wrong - : BCH_FSCK_ERR_freespace_key_wrong; - enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace; - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, alloc_k); - - int ret = __bch2_fsck_err(NULL, trans, flags, err_id, - "bucket incorrectly %sset in %s btree\n%s", - set ? "" : "un", - bch2_btree_id_str(btree), - buf.buf); - if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) || - bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed)) - ret = 0; - - printbuf_exit(&buf); - return ret; -} - -#define need_discard_or_freespace_err(...) \ - fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__)) - -#define need_discard_or_freespace_err_on(cond, ...) \ - (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false) - -static int bch2_bucket_do_index(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c alloc_k, - const struct bch_alloc_v4 *a, - bool set) -{ - enum btree_id btree; - struct bpos pos; - - if (a->data_type != BCH_DATA_free && - a->data_type != BCH_DATA_need_discard) - return 0; - - switch (a->data_type) { - case BCH_DATA_free: - btree = BTREE_ID_freespace; - pos = alloc_freespace_pos(alloc_k.k->p, *a); - break; - case BCH_DATA_need_discard: - btree = BTREE_ID_need_discard; - pos = alloc_k.k->p; - break; - default: - return 0; - } - - struct btree_iter iter; - struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent); - int ret = bkey_err(old); - if (ret) - return ret; - - need_discard_or_freespace_err_on(ca->mi.freespace_initialized && - !old.k->type != set, - trans, alloc_k, set, - btree == BTREE_ID_need_discard, false); - - ret = bch2_btree_bit_mod_iter(trans, &iter, set); -fsck_err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline int bch2_bucket_gen_update(struct btree_trans *trans, - struct bpos bucket, u8 gen) -{ - struct btree_iter iter; - unsigned offset; - struct bpos pos = alloc_gens_pos(bucket, &offset); - struct bkey_i_bucket_gens *g; - struct bkey_s_c k; - int ret; - - g = bch2_trans_kmalloc(trans, sizeof(*g)); - ret = PTR_ERR_OR_ZERO(g); - if (ret) - return ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, - BTREE_ITER_intent| - BTREE_ITER_with_updates); - ret = bkey_err(k); - if (ret) - return ret; - - if (k.k->type != KEY_TYPE_bucket_gens) { - bkey_bucket_gens_init(&g->k_i); - g->k.p = iter.pos; - } else { - bkey_reassemble(&g->k_i, k); - } - - g->v.gens[offset] = gen; - - ret = bch2_trans_update(trans, &iter, &g->k_i, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca, - enum bch_data_type data_type, - s64 delta_buckets, - s64 delta_sectors, - s64 delta_fragmented, unsigned flags) -{ - s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; - - return 
bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, - d, dev_data_type, - .dev = ca->dev_idx, - .data_type = data_type); -} - -int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, - const struct bch_alloc_v4 *old, - const struct bch_alloc_v4 *new, - unsigned flags) -{ - s64 old_sectors = bch2_bucket_sectors(*old); - s64 new_sectors = bch2_bucket_sectors(*new); - if (old->data_type != new->data_type) { - int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, - 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?: - bch2_dev_data_type_accounting_mod(trans, ca, old->data_type, - -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags); - if (ret) - return ret; - } else if (old_sectors != new_sectors) { - int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, - 0, - new_sectors - old_sectors, - bch2_bucket_sectors_fragmented(ca, *new) - - bch2_bucket_sectors_fragmented(ca, *old), flags); - if (ret) - return ret; - } - - s64 old_unstriped = bch2_bucket_sectors_unstriped(*old); - s64 new_unstriped = bch2_bucket_sectors_unstriped(*new); - if (old_unstriped != new_unstriped) { - int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped, - !!new_unstriped - !!old_unstriped, - new_unstriped - old_unstriped, - 0, - flags); - if (ret) - return ret; - } - - return 0; -} - -int bch2_trigger_alloc(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); - if (!ca) - return bch_err_throw(c, trigger_alloc); - - struct bch_alloc_v4 old_a_convert; - const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); - - struct bch_alloc_v4 *new_a; - if (likely(new.k->type == KEY_TYPE_alloc_v4)) { - new_a = bkey_s_to_alloc_v4(new).v; - } else { - BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair))); - - struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); - ret = PTR_ERR_OR_ZERO(new_ka); - if (unlikely(ret)) - goto err; - new_a = &new_ka->v; - } - - if (flags & BTREE_TRIGGER_transactional) { - alloc_data_type_set(new_a, new_a->data_type); - - int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - - (int) data_type_is_empty(old_a->data_type); - - if (is_empty_delta < 0) { - new_a->io_time[READ] = bch2_current_io_time(c, READ); - new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); - SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); - SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); - } - - if (data_type_is_empty(new_a->data_type) && - BCH_ALLOC_V4_NEED_INC_GEN(new_a) && - !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { - if (new_a->oldest_gen == new_a->gen && - !bch2_bucket_sectors_total(*new_a)) - new_a->oldest_gen++; - new_a->gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); - alloc_data_type_set(new_a, new_a->data_type); - } - - if (old_a->data_type != new_a->data_type || - (new_a->data_type == BCH_DATA_free && - alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { - ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?: - bch2_bucket_do_index(trans, ca, new.s_c, new_a, true); - if (ret) - goto err; - } - - if (new_a->data_type == BCH_DATA_cached && - !new_a->io_time[READ]) - new_a->io_time[READ] = bch2_current_io_time(c, READ); - - ret = bch2_lru_change(trans, 
new.k->p.inode, - bucket_to_u64(new.k->p), - alloc_lru_idx_read(*old_a), - alloc_lru_idx_read(*new_a)); - if (ret) - goto err; - - ret = bch2_lru_change(trans, - BCH_LRU_BUCKET_FRAGMENTATION, - bucket_to_u64(new.k->p), - alloc_lru_idx_fragmentation(*old_a, ca), - alloc_lru_idx_fragmentation(*new_a, ca)); - if (ret) - goto err; - - if (old_a->gen != new_a->gen) { - ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); - if (ret) - goto err; - } - - ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); - if (ret) - goto err; - } - - if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { - u64 transaction_seq = trans->journal_res.seq; - BUG_ON(!transaction_seq); - - if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq, - trans, alloc_key_journal_seq_in_future, - "bucket journal seq in future (currently at %llu)\n%s", - journal_cur_seq(&c->journal), - (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf))) - new_a->journal_seq_nonempty = transaction_seq; - - int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - - (int) data_type_is_empty(old_a->data_type); - - /* - * Record journal sequence number of empty -> nonempty transition: - * Note that there may be multiple empty -> nonempty - * transitions, data in a bucket may be overwritten while we're - * still writing to it - so be careful to only record the first: - * */ - if (is_empty_delta < 0 && - new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) { - new_a->journal_seq_nonempty = transaction_seq; - new_a->journal_seq_empty = 0; - } - - /* - * Bucket becomes empty: mark it as waiting for a journal flush, - * unless updates since empty -> nonempty transition were never - * flushed - we may need to ask the journal not to flush - * intermediate sequence numbers: - */ - if (is_empty_delta > 0) { - if (new_a->journal_seq_nonempty == transaction_seq || - bch2_journal_noflush_seq(&c->journal, - new_a->journal_seq_nonempty, - transaction_seq)) { - new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0; - } else { - new_a->journal_seq_empty = transaction_seq; - - ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - new.k->p.inode, new.k->p.offset, - transaction_seq); - if (bch2_fs_fatal_err_on(ret, c, - "setting bucket_needs_journal_commit: %s", - bch2_err_str(ret))) - goto err; - } - } - - if (new_a->gen != old_a->gen) { - guard(rcu)(); - u8 *gen = bucket_gen(ca, new.k->p.offset); - if (unlikely(!gen)) - goto invalid_bucket; - *gen = new_a->gen; - } - -#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) -#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) -#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk) - - if (statechange(a->data_type == BCH_DATA_free) && - bucket_flushed(new_a)) - closure_wake_up(&c->freelist_wait); - - if (statechange(a->data_type == BCH_DATA_need_discard) && - !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && - bucket_flushed(new_a)) - bch2_discard_one_bucket_fast(ca, new.k->p.offset); - - if (statechange(a->data_type == BCH_DATA_cached) && - !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && - should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) - bch2_dev_do_invalidates(ca); - - if (statechange(a->data_type == BCH_DATA_need_gc_gens)) - bch2_gc_gens_async(c); - } - - if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) { - guard(rcu)(); - struct bucket *g = 
gc_bucket(ca, new.k->p.offset); - if (unlikely(!g)) - goto invalid_bucket; - g->gen_valid = 1; - g->gen = new_a->gen; - } -err: -fsck_err: - printbuf_exit(&buf); - bch2_dev_put(ca); - return ret; -invalid_bucket: - bch2_fs_inconsistent(c, "reference to invalid bucket\n%s", - (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); - ret = bch_err_throw(c, trigger_alloc); - goto err; -} - -/* - * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for - * extents style btrees, but works on non-extents btrees: - */ -static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, struct bkey *hole) -{ - struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); - - if (bkey_err(k)) - return k; - - if (k.k->type) { - return k; - } else { - struct btree_iter iter2; - struct bpos next; - - bch2_trans_copy_iter(trans, &iter2, iter); - - struct btree_path *path = btree_iter_path(trans, iter); - if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) - end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); - - end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); - - /* - * btree node min/max is a closed interval, upto takes a half - * open interval: - */ - k = bch2_btree_iter_peek_max(trans, &iter2, end); - next = iter2.pos; - bch2_trans_iter_exit(trans, &iter2); - - BUG_ON(next.offset >= iter->pos.offset + U32_MAX); - - if (bkey_err(k)) - return k; - - bkey_init(hole); - hole->p = iter->pos; - - bch2_key_resize(hole, next.offset - iter->pos.offset); - return (struct bkey_s_c) { hole, NULL }; - } -} - -static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket) -{ - if (*ca) { - if (bucket->offset < (*ca)->mi.first_bucket) - bucket->offset = (*ca)->mi.first_bucket; - - if (bucket->offset < (*ca)->mi.nbuckets) - return true; - - bch2_dev_put(*ca); - *ca = NULL; - bucket->inode++; - bucket->offset = 0; - } - - guard(rcu)(); - *ca = __bch2_next_dev_idx(c, bucket->inode, NULL); - if (*ca) { - *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket); - bch2_dev_get(*ca); - } - - return *ca != NULL; -} - -static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_dev **ca, struct bkey *hole) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c k; -again: - k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole); - if (bkey_err(k)) - return k; - - *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode); - - if (!k.k->type) { - struct bpos hole_start = bkey_start_pos(k.k); - - if (!*ca || !bucket_valid(*ca, hole_start.offset)) { - if (!next_bucket(c, ca, &hole_start)) - return bkey_s_c_null; - - bch2_btree_iter_set_pos(trans, iter, hole_start); - goto again; - } - - if (k.k->p.offset > (*ca)->mi.nbuckets) - bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset); - } - - return k; -} - -static noinline_for_stack -int bch2_check_alloc_key(struct btree_trans *trans, - struct bkey_s_c alloc_k, - struct btree_iter *alloc_iter, - struct btree_iter *discard_iter, - struct btree_iter *freespace_iter, - struct btree_iter *bucket_gens_iter) -{ - struct bch_fs *c = trans->c; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - unsigned gens_offset; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p); - if (fsck_err_on(!ca, - trans, alloc_key_to_missing_dev_bucket, - "alloc key for invalid device:bucket %llu:%llu", - alloc_k.k->p.inode, 
alloc_k.k->p.offset)) - ret = bch2_btree_delete_at(trans, alloc_iter, 0); - if (!ca) - return ret; - - if (!ca->mi.freespace_initialized) - goto out; - - a = bch2_alloc_to_v4(alloc_k, &a_convert); - - bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p); - k = bch2_btree_iter_peek_slot(trans, discard_iter); - ret = bkey_err(k); - if (ret) - goto err; - - bool is_discarded = a->data_type == BCH_DATA_need_discard; - if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded, - trans, alloc_k, !is_discarded, true, true)) { - ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded); - if (ret) - goto err; - } - - bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); - k = bch2_btree_iter_peek_slot(trans, freespace_iter); - ret = bkey_err(k); - if (ret) - goto err; - - bool is_free = a->data_type == BCH_DATA_free; - if (need_discard_or_freespace_err_on(!!k.k->type != is_free, - trans, alloc_k, !is_free, false, true)) { - ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free); - if (ret) - goto err; - } - - bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); - k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), - trans, bucket_gens_key_wrong, - "incorrect gen in bucket_gens btree (got %u should be %u)\n%s", - alloc_gen(k, gens_offset), a->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i_bucket_gens *g = - bch2_trans_kmalloc(trans, sizeof(*g)); - - ret = PTR_ERR_OR_ZERO(g); - if (ret) - goto err; - - if (k.k->type == KEY_TYPE_bucket_gens) { - bkey_reassemble(&g->k_i, k); - } else { - bkey_bucket_gens_init(&g->k_i); - g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset); - } - - g->v.gens[gens_offset] = a->gen; - - ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); - if (ret) - goto err; - } -out: -err: -fsck_err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -static noinline_for_stack -int bch2_check_alloc_hole_freespace(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos start, - struct bpos *end, - struct btree_iter *freespace_iter) -{ - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - int ret; - - if (!ca->mi.freespace_initialized) - return 0; - - bch2_btree_iter_set_pos(trans, freespace_iter, start); - - k = bch2_btree_iter_peek_slot(trans, freespace_iter); - ret = bkey_err(k); - if (ret) - goto err; - - *end = bkey_min(k.k->p, *end); - - if (fsck_err_on(k.k->type != KEY_TYPE_set, - trans, freespace_hole_missing, - "hole in alloc btree missing in freespace btree\n" - "device %llu buckets %llu-%llu", - freespace_iter->pos.inode, - freespace_iter->pos.offset, - end->offset)) { - struct bkey_i *update = - bch2_trans_kmalloc(trans, sizeof(*update)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_init(&update->k); - update->k.type = KEY_TYPE_set; - update->k.p = freespace_iter->pos; - bch2_key_resize(&update->k, - min_t(u64, U32_MAX, end->offset - - freespace_iter->pos.offset)); - - ret = bch2_trans_update(trans, freespace_iter, update, 0); - if (ret) - goto err; - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static noinline_for_stack -int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, - struct bpos start, - struct bpos *end, - struct btree_iter *bucket_gens_iter) -{ - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - unsigned i, gens_offset, 
gens_end_offset; - int ret; - - bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); - - k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (bkey_cmp(alloc_gens_pos(start, &gens_offset), - alloc_gens_pos(*end, &gens_end_offset))) - gens_end_offset = KEY_TYPE_BUCKET_GENS_NR; - - if (k.k->type == KEY_TYPE_bucket_gens) { - struct bkey_i_bucket_gens g; - bool need_update = false; - - bkey_reassemble(&g.k_i, k); - - for (i = gens_offset; i < gens_end_offset; i++) { - if (fsck_err_on(g.v.gens[i], trans, - bucket_gens_hole_wrong, - "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", - bucket_gens_pos_to_alloc(k.k->p, i).inode, - bucket_gens_pos_to_alloc(k.k->p, i).offset, - g.v.gens[i])) { - g.v.gens[i] = 0; - need_update = true; - } - } - - if (need_update) { - struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); - - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - memcpy(u, &g, sizeof(g)); - - ret = bch2_trans_update(trans, bucket_gens_iter, u, 0); - if (ret) - goto err; - } - } - - *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -struct check_discard_freespace_key_async { - struct work_struct work; - struct bch_fs *c; - struct bbpos pos; -}; - -static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0); - int ret = bkey_err(k); - if (ret) - return ret; - - u8 gen; - ret = k.k->type != KEY_TYPE_set - ? bch2_check_discard_freespace_key(trans, &iter, &gen, false) - : 0; - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static void check_discard_freespace_key_work(struct work_struct *work) -{ - struct check_discard_freespace_key_async *w = - container_of(work, struct check_discard_freespace_key_async, work); - - bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); - enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key); - kfree(w); -} - -int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, - bool async_repair) -{ - struct bch_fs *c = trans->c; - enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard - ? BCH_DATA_need_discard - : BCH_DATA_free; - struct printbuf buf = PRINTBUF; - - unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)| - FSCK_CAN_FIX|FSCK_CAN_IGNORE; - - struct bpos bucket = iter->pos; - bucket.offset &= ~(~0ULL << 56); - u64 genbits = iter->pos.offset & (~0ULL << 56); - - struct btree_iter alloc_iter; - struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, - BTREE_ID_alloc, bucket, - async_repair ? 
BTREE_ITER_cached : 0); - int ret = bkey_err(alloc_k); - if (ret) - return ret; - - if (!bch2_dev_bucket_exists(c, bucket)) { - if (__fsck_err(trans, fsck_flags, - need_discard_freespace_key_to_invalid_dev_bucket, - "entry in %s btree for nonexistant dev:bucket %llu:%llu", - bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) - goto delete; - ret = 1; - goto out; - } - - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); - - if (a->data_type != state || - (state == BCH_DATA_free && - genbits != alloc_freespace_genbits(*a))) { - if (__fsck_err(trans, fsck_flags, - need_discard_freespace_key_bad, - "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), - bch2_btree_id_str(iter->btree_id), - iter->pos.inode, - iter->pos.offset, - a->data_type == state, - genbits >> 56, alloc_freespace_genbits(*a) >> 56)) - goto delete; - ret = 1; - goto out; - } - - *gen = a->gen; -out: -fsck_err: - bch2_set_btree_iter_dontneed(trans, &alloc_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - printbuf_exit(&buf); - return ret; -delete: - if (!async_repair) { - ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc) ?: - bch_err_throw(c, transaction_restart_commit); - goto out; - } else { - /* - * We can't repair here when called from the allocator path: the - * commit will recurse back into the allocator - */ - struct check_discard_freespace_key_async *w = - kzalloc(sizeof(*w), GFP_KERNEL); - if (!w) - goto out; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) { - kfree(w); - goto out; - } - - INIT_WORK(&w->work, check_discard_freespace_key_work); - w->c = c; - w->pos = BBPOS(iter->btree_id, iter->pos); - queue_work(c->write_ref_wq, &w->work); - - ret = 1; /* don't allocate from this bucket */ - goto out; - } -} - -static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter) -{ - u8 gen; - int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false); - return ret < 0 ? ret : 0; -} - -/* - * We've already checked that generation numbers in the bucket_gens btree are - * valid for buckets that exist; this just checks for keys for nonexistent - * buckets. 
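
A note on the encoding decoded by bch2_check_discard_freespace_key() above: a freespace key packs both a bucket number and generation bits into its offset (the low 56 bits are the bucket, per the ~(~0ULL << 56) mask in the code above; the high bits come from alloc_freespace_genbits(), whose definition is not in this hunk), so an entry left over from an earlier bucket generation can be detected as stale. A minimal standalone sketch of that packing, with illustrative names; only the 56-bit split is taken from the code above:

    #include <stdint.h>
    #include <stdio.h>

    #define FREESPACE_BUCKET_BITS 56
    #define FREESPACE_BUCKET_MASK (~(~0ULL << FREESPACE_BUCKET_BITS))

    static uint64_t freespace_key_offset(uint64_t bucket, uint64_t genbits)
    {
            /* high bits: generation bits; low 56 bits: bucket number */
            return (genbits << FREESPACE_BUCKET_BITS) |
                   (bucket & FREESPACE_BUCKET_MASK);
    }

    int main(void)
    {
            uint64_t off = freespace_key_offset(12345, 3);

            /* recover both fields, as the fsck helper above does */
            printf("bucket %llu genbits %llu\n",
                   (unsigned long long)(off & FREESPACE_BUCKET_MASK),
                   (unsigned long long)(off >> FREESPACE_BUCKET_BITS));
            return 0;
    }
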
- */ -static noinline_for_stack -int bch2_check_bucket_gens_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_i_bucket_gens g; - u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; - u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; - u64 b; - bool need_update = false; - struct printbuf buf = PRINTBUF; - int ret = 0; - - BUG_ON(k.k->type != KEY_TYPE_bucket_gens); - bkey_reassemble(&g.k_i, k); - - struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); - if (!ca) { - if (fsck_err(trans, bucket_gens_to_invalid_dev, - "bucket_gens key for invalid device:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, 0); - goto out; - } - - if (fsck_err_on(end <= ca->mi.first_bucket || - start >= ca->mi.nbuckets, - trans, bucket_gens_to_invalid_buckets, - "bucket_gens key for invalid buckets:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto out; - } - - for (b = start; b < ca->mi.first_bucket; b++) - if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], - trans, bucket_gens_nonzero_for_invalid_buckets, - "bucket_gens key has nonzero gen for invalid bucket")) { - g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; - need_update = true; - } - - for (b = ca->mi.nbuckets; b < end; b++) - if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], - trans, bucket_gens_nonzero_for_invalid_buckets, - "bucket_gens key has nonzero gen for invalid bucket")) { - g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; - need_update = true; - } - - if (need_update) { - struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); - - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto out; - - memcpy(u, &g, sizeof(g)); - ret = bch2_trans_update(trans, iter, u, 0); - } -out: -fsck_err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -int bch2_check_alloc_info(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; - struct bch_dev *ca = NULL; - struct bkey hole; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_prefetch); - bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_prefetch); - bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_prefetch); - bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_prefetch); - - while (1) { - struct bpos next; - - bch2_trans_begin(trans); - - k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole); - ret = bkey_err(k); - if (ret) - goto bkey_err; - - if (!k.k) - break; - - if (k.k->type) { - next = bpos_nosnap_successor(k.k->p); - - ret = bch2_check_alloc_key(trans, - k, &iter, - &discard_iter, - &freespace_iter, - &bucket_gens_iter); - if (ret) - goto bkey_err; - } else { - next = k.k->p; - - ret = bch2_check_alloc_hole_freespace(trans, ca, - bkey_start_pos(k.k), - &next, - &freespace_iter) ?: - bch2_check_alloc_hole_bucket_gens(trans, - bkey_start_pos(k.k), - &next, - &bucket_gens_iter); - if (ret) - goto bkey_err; - } - - ret = bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto bkey_err; - - bch2_btree_iter_set_pos(trans, &iter, next); -bkey_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - } - bch2_trans_iter_exit(trans, 
&bucket_gens_iter); - bch2_trans_iter_exit(trans, &freespace_iter); - bch2_trans_iter_exit(trans, &discard_iter); - bch2_trans_iter_exit(trans, &iter); - bch2_dev_put(ca); - ca = NULL; - - if (ret < 0) - goto err; - - ret = for_each_btree_key(trans, iter, - BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_prefetch, k, - bch2_check_discard_freespace_key_fsck(trans, &iter)); - if (ret) - goto err; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_prefetch); - while (1) { - bch2_trans_begin(trans); - k = bch2_btree_iter_peek(trans, &iter); - if (!k.k) - break; - - ret = bkey_err(k) ?: - bch2_check_discard_freespace_key_fsck(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; - } - if (ret) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - - bch_err(c, "while checking %s", buf.buf); - printbuf_exit(&buf); - break; - } - - bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos)); - } - bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; - - ret = for_each_btree_key_commit(trans, iter, - BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_bucket_gens_key(trans, &iter, k)); -err: - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, - struct btree_iter *alloc_iter, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - struct bkey_s_c alloc_k; - struct printbuf buf = PRINTBUF; - int ret; - - alloc_k = bch2_btree_iter_peek(trans, alloc_iter); - if (!alloc_k.k) - return 0; - - ret = bkey_err(alloc_k); - if (ret) - return ret; - - struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode); - if (!ca) - return 0; - - a = bch2_alloc_to_v4(alloc_k, &a_convert); - - u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); - if (lru_idx) { - ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION, - bucket_to_u64(alloc_k.k->p), - lru_idx, alloc_k, last_flushed); - if (ret) - goto err; - } - - if (a->data_type != BCH_DATA_cached) - goto err; - - if (fsck_err_on(!a->io_time[READ], - trans, alloc_key_cached_but_read_time_zero, - "cached bucket with read_time 0\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i_alloc_v4 *a_mut = - bch2_alloc_to_v4_mut(trans, alloc_k); - ret = PTR_ERR_OR_ZERO(a_mut); - if (ret) - goto err; - - a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); - ret = bch2_trans_update(trans, alloc_iter, - &a_mut->k_i, BTREE_TRIGGER_norun); - if (ret) - goto err; - - a = &a_mut->v; - } - - ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ], - alloc_k, last_flushed); - if (ret) - goto err; -err: -fsck_err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -int bch2_check_alloc_to_lru_refs(struct bch_fs *c) -{ - struct bkey_buf last_flushed; - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?: - bch2_check_stripe_to_lru_refs(c); - - bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); - return ret; -} - -static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) 
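
discard_in_flight_add(), whose body follows, registers a bucket in a small per-device table guarded by a mutex so the same bucket is never queued for discard twice; a second add of the same bucket is rejected with EEXIST. A standalone sketch of the same pattern, assuming pthreads and a fixed-size array in place of the kernel's mutex and darray (all names below are illustrative):

    #include <errno.h>
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define MAX_IN_FLIGHT 64

    struct discard_table {
            pthread_mutex_t lock;
            unsigned nr;
            struct { uint64_t bucket; bool in_progress; } d[MAX_IN_FLIGHT];
    };

    /* Add a bucket unless already present; cf. darray_find_p() +
     * darray_push() under discard_buckets_in_flight_lock below. */
    static int in_flight_add(struct discard_table *t, uint64_t bucket,
                             bool in_progress)
    {
            int ret = 0;

            pthread_mutex_lock(&t->lock);
            for (unsigned i = 0; i < t->nr; i++)
                    if (t->d[i].bucket == bucket) {
                            ret = -EEXIST;  /* already queued */
                            goto out;
                    }
            if (t->nr == MAX_IN_FLIGHT) {
                    ret = -ENOMEM;          /* fixed table; a darray would grow */
                    goto out;
            }
            t->d[t->nr].bucket = bucket;
            t->d[t->nr].in_progress = in_progress;
            t->nr++;
    out:
            pthread_mutex_unlock(&t->lock);
            return ret;
    }

    int main(void)
    {
            struct discard_table t = { .lock = PTHREAD_MUTEX_INITIALIZER };

            /* second add of the same bucket is rejected: */
            return in_flight_add(&t, 7, false) == 0 &&
                   in_flight_add(&t, 7, true) == -EEXIST ? 0 : 1;
    }
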
-{ - struct bch_fs *c = ca->fs; - int ret; - - mutex_lock(&ca->discard_buckets_in_flight_lock); - struct discard_in_flight *i = - darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); - if (i) { - ret = bch_err_throw(c, EEXIST_discard_in_flight_add); - goto out; - } - - ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { - .in_progress = in_progress, - .bucket = bucket, - })); -out: - mutex_unlock(&ca->discard_buckets_in_flight_lock); - return ret; -} - -static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) -{ - mutex_lock(&ca->discard_buckets_in_flight_lock); - struct discard_in_flight *i = - darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); - BUG_ON(!i || !i->in_progress); - - darray_remove_item(&ca->discard_buckets_in_flight, i); - mutex_unlock(&ca->discard_buckets_in_flight_lock); -} - -struct discard_buckets_state { - u64 seen; - u64 open; - u64 need_journal_commit; - u64 discarded; -}; - -static int bch2_discard_one_bucket(struct btree_trans *trans, - struct bch_dev *ca, - struct btree_iter *need_discard_iter, - struct bpos *discard_pos_done, - struct discard_buckets_state *s, - bool fastpath) -{ - struct bch_fs *c = trans->c; - struct bpos pos = need_discard_iter->pos; - struct btree_iter iter = {}; - struct bkey_s_c k; - struct bkey_i_alloc_v4 *a; - struct printbuf buf = PRINTBUF; - bool discard_locked = false; - int ret = 0; - - if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { - s->open++; - goto out; - } - - u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, - pos.inode, pos.offset); - if (seq_ready > c->journal.flushed_seq_ondisk) { - if (seq_ready > c->journal.flushing_seq) - s->need_journal_commit++; - goto out; - } - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - need_discard_iter->pos, - BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) - goto out; - - a = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto out; - - if (a->v.data_type != BCH_DATA_need_discard) { - if (need_discard_or_freespace_err(trans, k, true, true, true)) { - ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false); - if (ret) - goto out; - goto commit; - } - - goto out; - } - - if (!fastpath) { - if (discard_in_flight_add(ca, iter.pos.offset, true)) - goto out; - - discard_locked = true; - } - - if (!bkey_eq(*discard_pos_done, iter.pos)) { - s->discarded++; - *discard_pos_done = iter.pos; - - if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) { - /* - * This works without any other locks because this is the only - * thread that removes items from the need_discard tree - */ - bch2_trans_unlock_long(trans); - blkdev_issue_discard(ca->disk_sb.bdev, - k.k->p.offset * ca->mi.bucket_size, - ca->mi.bucket_size, - GFP_KERNEL); - ret = bch2_trans_relock_notrace(trans); - if (ret) - goto out; - } - } - - SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - alloc_data_type_set(&a->v, a->v.data_type); - - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - if (ret) - goto out; -commit: - ret = bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto out; - - if (!fastpath) - count_event(c, bucket_discard); - else - count_event(c, bucket_discard_fast); -out: -fsck_err: - if (discard_locked) - discard_in_flight_remove(ca, iter.pos.offset); - if (!ret) - s->seen++; - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -} - -static void bch2_do_discards_work(struct work_struct *work) -{ 
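
For context on bch2_discard_one_bucket() above: each discard is gated on journal state. An open bucket is skipped outright, and a bucket whose latest emptying has not yet been flushed to the journal must wait, since journal replay after a crash could otherwise write into an already-discarded bucket. A sketch of that gating reduced to a pure predicate (struct and field names are illustrative; the original additionally checks flushing_seq before counting toward need_journal_commit):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct journal_state {
            uint64_t flushed_seq_ondisk;    /* newest seq known flushed to disk */
            uint64_t flushing_seq;          /* seq currently being flushed */
    };

    enum verdict { DISCARD_OK, SKIP_BUCKET_OPEN, WAIT_FOR_JOURNAL };

    static enum verdict can_discard(bool bucket_open, uint64_t seq_ready,
                                    const struct journal_state *j)
    {
            if (bucket_open)
                    return SKIP_BUCKET_OPEN;        /* the s->open++ path */
            if (seq_ready > j->flushed_seq_ondisk)
                    return WAIT_FOR_JOURNAL;        /* s->need_journal_commit path */
            return DISCARD_OK;
    }

    int main(void)
    {
            struct journal_state j = { .flushed_seq_ondisk = 100,
                                       .flushing_seq = 101 };

            printf("%d %d %d\n",
                   can_discard(true, 90, &j),      /* open -> skip */
                   can_discard(false, 120, &j),    /* not yet flushed -> wait */
                   can_discard(false, 90, &j));    /* safe -> discard */
            return 0;
    }
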
- struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); - struct bch_fs *c = ca->fs; - struct discard_buckets_state s = {}; - struct bpos discard_pos_done = POS_MAX; - int ret; - - /* - * We're doing the commit in bch2_discard_one_bucket instead of using - * for_each_btree_key_commit() so that we can increment counters after - * successful commit: - */ - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, - BTREE_ID_need_discard, - POS(ca->dev_idx, 0), - POS(ca->dev_idx, U64_MAX), 0, k, - bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false))); - - if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal)) - bch2_journal_flush_async(&c->journal, NULL); - - trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, - bch2_err_str(ret)); - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); -} - -void bch2_dev_do_discards(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard)) - return; - - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards)) - goto put_write_ref; - - if (queue_work(c->write_ref_wq, &ca->discard_work)) - return; - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); -put_write_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); -} - -void bch2_do_discards(struct bch_fs *c) -{ - for_each_member_device(c, ca) - bch2_dev_do_discards(ca); -} - -static int bch2_do_discards_fast_one(struct btree_trans *trans, - struct bch_dev *ca, - u64 bucket, - struct bpos *discard_pos_done, - struct discard_buckets_state *s) -{ - struct btree_iter need_discard_iter; - struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter, - BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); - int ret = bkey_err(discard_k); - if (ret) - return ret; - - if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set, - trans, discarding_bucket_not_in_need_discard_btree, - "attempting to discard bucket %u:%llu not in need_discard btree", - ca->dev_idx, bucket)) - goto out; - - ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); -out: -fsck_err: - bch2_trans_iter_exit(trans, &need_discard_iter); - return ret; -} - -static void bch2_do_discards_fast_work(struct work_struct *work) -{ - struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); - struct bch_fs *c = ca->fs; - struct discard_buckets_state s = {}; - struct bpos discard_pos_done = POS_MAX; - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - - while (1) { - bool got_bucket = false; - u64 bucket; - - mutex_lock(&ca->discard_buckets_in_flight_lock); - darray_for_each(ca->discard_buckets_in_flight, i) { - if (i->in_progress) - continue; - - got_bucket = true; - bucket = i->bucket; - i->in_progress = true; - break; - } - mutex_unlock(&ca->discard_buckets_in_flight_lock); - - if (!got_bucket) - break; - - ret = lockrestart_do(trans, - bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s)); - bch_err_fn(c, ret); - - discard_in_flight_remove(ca, bucket); - - if (ret) - break; - } - - trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - - bch2_trans_put(trans); - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); -} - -static void 
bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) -{ - struct bch_fs *c = ca->fs; - - if (discard_in_flight_add(ca, bucket, false)) - return; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast)) - return; - - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast)) - goto put_ref; - - if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) - return; - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); -put_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); -} - -static int invalidate_one_bp(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c_backpointer bp, - struct bkey_buf *last_flushed) -{ - struct btree_iter extent_iter; - struct bkey_s_c extent_k = - bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed); - int ret = bkey_err(extent_k); - if (ret) - return ret; - - if (!extent_k.k) - return 0; - - struct bkey_i *n = - bch2_bkey_make_mut(trans, &extent_iter, &extent_k, - BTREE_UPDATE_internal_snapshot_node); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx); -err: - bch2_trans_iter_exit(trans, &extent_iter); - return ret; -} - -static int invalidate_one_bucket_by_bps(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, - u8 gen, - struct bkey_buf *last_flushed) -{ - struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket); - struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket); - - return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, - bp_start, bp_end, 0, k, - NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc, ({ - if (k.k->type != KEY_TYPE_backpointer) - continue; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - - if (bp.v->bucket_gen != gen) - continue; - - /* filter out bps with gens that don't match */ - - invalidate_one_bp(trans, ca, bp, last_flushed); - })); -} - -noinline_for_stack -static int invalidate_one_bucket(struct btree_trans *trans, - struct bch_dev *ca, - struct btree_iter *lru_iter, - struct bkey_s_c lru_k, - struct bkey_buf *last_flushed, - s64 *nr_to_invalidate) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); - struct btree_iter alloc_iter = {}; - int ret = 0; - - if (*nr_to_invalidate <= 0) - return 1; - - if (!bch2_dev_bucket_exists(c, bucket)) { - if (fsck_err(trans, lru_entry_to_invalid_bucket, - "lru key points to nonexistent device:bucket %llu:%llu", - bucket.inode, bucket.offset)) - return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); - goto out; - } - - if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) - return 0; - - struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, - BTREE_ID_alloc, bucket, - BTREE_ITER_cached); - ret = bkey_err(alloc_k); - if (ret) - return ret; - - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); - - /* We expect harmless races here due to the btree write buffer: */ - if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a)) - goto out; - - /* - * Impossible since alloc_lru_idx_read() only returns nonzero if the - * bucket is supposed to be on the cached bucket LRU (i.e. 
- * BCH_DATA_cached) - * - * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 - */ - BUG_ON(a->data_type != BCH_DATA_cached); - BUG_ON(a->dirty_sectors); - - if (!a->cached_sectors) { - bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset, - true, last_flushed); - goto out; - } - - unsigned cached_sectors = a->cached_sectors; - u8 gen = a->gen; - - ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed); - if (ret) - goto out; - - trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); - --*nr_to_invalidate; -out: -fsck_err: - bch2_trans_iter_exit(trans, &alloc_iter); - printbuf_exit(&buf); - return ret; -} - -static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, - struct bch_dev *ca, bool *wrapped) -{ - struct bkey_s_c k; -again: - k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); - if (!k.k && !*wrapped) { - bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0)); - *wrapped = true; - goto again; - } - - return k; -} - -static void bch2_do_invalidates_work(struct work_struct *work) -{ - struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work); - struct bch_fs *c = ca->fs; - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - ret = bch2_btree_write_buffer_tryflush(trans); - if (ret) - goto err; - - s64 nr_to_invalidate = - should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - struct btree_iter iter; - bool wrapped = false; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, - lru_pos(ca->dev_idx, 0, - ((bch2_current_io_time(c, READ) + U32_MAX) & - LRU_TIME_MAX)), 0); - - while (true) { - bch2_trans_begin(trans); - - struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); - ret = bkey_err(k); - if (ret) - goto restart_err; - if (!k.k) - break; - - ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate); -restart_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - bch2_btree_iter_advance(trans, &iter); - } - bch2_trans_iter_exit(trans, &iter); -err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&last_flushed, c); - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); -} - -void bch2_dev_do_invalidates(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate)) - return; - - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates)) - goto put_ref; - - if (queue_work(c->write_ref_wq, &ca->invalidate_work)) - return; - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); -put_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); -} - -void bch2_do_invalidates(struct bch_fs *c) -{ - for_each_member_device(c, ca) - bch2_dev_do_invalidates(ca); -} - -int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, - u64 bucket_start, u64 bucket_end) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bkey hole; - struct bpos end = POS(ca->dev_idx, bucket_end); - struct bch_member *m; - unsigned long last_updated = jiffies; - int ret; - - BUG_ON(bucket_start > bucket_end); - BUG_ON(bucket_end > ca->mi.nbuckets); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - 
POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), - BTREE_ITER_prefetch); - /* - * Scan the alloc btree for every bucket on @ca, and add buckets to the - * freespace/need_discard/need_gc_gens btrees as needed: - */ - while (1) { - if (time_after(jiffies, last_updated + HZ * 10)) { - bch_info(ca, "%s: currently at %llu/%llu", - __func__, iter.pos.offset, ca->mi.nbuckets); - last_updated = jiffies; - } - - bch2_trans_begin(trans); - - if (bkey_ge(iter.pos, end)) { - ret = 0; - break; - } - - k = bch2_get_key_or_hole(trans, &iter, end, &hole); - ret = bkey_err(k); - if (ret) - goto bkey_err; - - if (k.k->type) { - /* - * We process live keys in the alloc btree one at a - * time: - */ - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - - ret = bch2_bucket_do_index(trans, ca, k, a, true) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto bkey_err; - - bch2_btree_iter_advance(trans, &iter); - } else { - struct bkey_i *freespace; - - freespace = bch2_trans_kmalloc(trans, sizeof(*freespace)); - ret = PTR_ERR_OR_ZERO(freespace); - if (ret) - goto bkey_err; - - bkey_init(&freespace->k); - freespace->k.type = KEY_TYPE_set; - freespace->k.p = k.k->p; - freespace->k.size = k.k->size; - - ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto bkey_err; - - bch2_btree_iter_set_pos(trans, &iter, k.k->p); - } -bkey_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - - if (ret < 0) { - bch_err_msg(ca, ret, "initializing free space"); - return ret; - } - - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); - mutex_unlock(&c->sb_lock); - - return 0; -} - -int bch2_fs_freespace_init(struct bch_fs *c) -{ - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) - return 0; - - - /* - * We can crash during the device add path, so we need to check this on - * every mount: - */ - - bool doing_init = false; - for_each_member_device(c, ca) { - if (ca->mi.freespace_initialized) - continue; - - if (!doing_init) { - bch_info(c, "initializing freespace"); - doing_init = true; - } - - int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); - if (ret) { - bch2_dev_put(ca); - bch_err_fn(c, ret); - return ret; - } - } - - if (doing_init) { - mutex_lock(&c->sb_lock); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - bch_verbose(c, "done initializing freespace"); - } - - return 0; -} - -/* device removal */ - -int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) -{ - struct bpos start = POS(ca->dev_idx, 0); - struct bpos end = POS(ca->dev_idx, U64_MAX); - int ret; - - /* - * We clear the LRU and need_discard btrees first so that we don't race - * with bch2_do_invalidates() and bch2_do_discards() - */ - ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, 
BTREE_ID_alloc, start, end,
-				      BTREE_TRIGGER_norun, NULL) ?:
-		bch2_dev_usage_remove(c, ca->dev_idx);
-	bch_err_msg(ca, ret, "removing dev alloc info");
-	return ret;
-}
-
-/* Bucket IO clocks: */
-
-static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
-				       size_t bucket_nr, int rw)
-{
-	struct bch_fs *c = trans->c;
-
-	struct btree_iter iter;
-	struct bkey_i_alloc_v4 *a =
-		bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
-	int ret = PTR_ERR_OR_ZERO(a);
-	if (ret)
-		return ret;
-
-	u64 now = bch2_current_io_time(c, rw);
-	if (a->v.io_time[rw] == now)
-		goto out;
-
-	a->v.io_time[rw] = now;
-
-	ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
-	      bch2_trans_commit(trans, NULL, NULL, 0);
-out:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
-			      size_t bucket_nr, int rw)
-{
-	if (bch2_trans_relock(trans))
-		bch2_trans_begin(trans);
-
-	return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw));
-}
-
-/* Startup/shutdown (ro/rw): */
-
-void bch2_recalc_capacity(struct bch_fs *c)
-{
-	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
-	unsigned bucket_size_max = 0;
-	unsigned long ra_pages = 0;
-
-	lockdep_assert_held(&c->state_lock);
-
-	guard(rcu)();
-	for_each_member_device_rcu(c, ca, NULL) {
-		struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev);
-		if (bdev)
-			ra_pages += bdev->bd_disk->bdi->ra_pages;
-
-		if (ca->mi.state != BCH_MEMBER_STATE_rw)
-			continue;
-
-		u64 dev_reserve = 0;
-
-		/*
-		 * We need to reserve buckets (from the number
-		 * of currently available buckets) against
-		 * foreground writes so that mainly copygc can
-		 * make forward progress.
-		 *
-		 * We need enough to refill the various reserves
-		 * from scratch - copygc will use its entire
-		 * reserve all at once, then run again when
-		 * its reserve is refilled (from the formerly
-		 * available buckets).
-		 *
-		 * This reserve is just used when considering if
-		 * allocations for foreground writes must wait -
-		 * not -ENOSPC calculations.
-		 */
-
-		dev_reserve += ca->nr_btree_reserve * 2;
-		dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
-
-		dev_reserve += 1;	/* btree write point */
-		dev_reserve += 1;	/* copygc write point */
-		dev_reserve += 1;	/* rebalance write point */
-
-		dev_reserve *= ca->mi.bucket_size;
-
-		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
-					     ca->mi.first_bucket);
-
-		reserved_sectors += dev_reserve * 2;
-
-		bucket_size_max = max_t(unsigned, bucket_size_max,
-					ca->mi.bucket_size);
-	}
-
-	bch2_set_ra_pages(c, ra_pages);
-
-	gc_reserve = c->opts.gc_reserve_bytes
-		? c->opts.gc_reserve_bytes >> 9
-		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);
-
-	reserved_sectors = max(gc_reserve, reserved_sectors);
-
-	reserved_sectors = min(reserved_sectors, capacity);
-
-	c->reserved = reserved_sectors;
-	c->capacity = capacity - reserved_sectors;
-
-	c->bucket_size_max = bucket_size_max;
-
-	/* Wake up in case someone was waiting for buckets */
-	closure_wake_up(&c->freelist_wait);
-}
-
-u64 bch2_min_rw_member_capacity(struct bch_fs *c)
-{
-	u64 ret = U64_MAX;
-
-	guard(rcu)();
-	for_each_rw_member_rcu(c, ca)
-		ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
-	return ret;
-}
-
-static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
-{
-	struct open_bucket *ob;
-
-	for (ob = c->open_buckets;
-	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
-	     ob++) {
-		scoped_guard(spinlock, &ob->lock) {
-			if (ob->valid && !ob->on_partial_list &&
-			    ob->dev == ca->dev_idx)
-				return true;
-		}
-	}
-
-	return false;
-}
-
-void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw)
-{
-	/* BCH_DATA_free == all rw devs */
-
-	for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
-		if (rw &&
-		    (i == BCH_DATA_free ||
-		     (ca->mi.data_allowed & BIT(i))))
-			set_bit(ca->dev_idx, c->rw_devs[i].d);
-		else
-			clear_bit(ca->dev_idx, c->rw_devs[i].d);
-}
-
-/* device goes ro: */
-void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
-{
-	lockdep_assert_held(&c->state_lock);
-
-	/* First, remove device from allocation groups: */
-	bch2_dev_allocator_set_rw(c, ca, false);
-
-	c->rw_devs_change_count++;
-
-	/*
-	 * Capacity is calculated based off of devices in allocation groups:
-	 */
-	bch2_recalc_capacity(c);
-
-	bch2_open_buckets_stop(c, ca, false);
-
-	/*
-	 * Wake up threads that were blocked on allocation, so they can notice
-	 * the device can no longer be removed and the capacity has changed:
-	 */
-	closure_wake_up(&c->freelist_wait);
-
-	/*
-	 * journal_res_get() can block waiting for free space in the journal -
-	 * it needs to notice there may not be devices to allocate from anymore:
-	 */
-	wake_up(&c->journal.wait);
-
-	/* Now wait for any in flight writes: */
-
-	closure_wait_event(&c->open_buckets_wait,
-			   !bch2_dev_has_open_write_point(c, ca));
-}
-
-/* device goes rw: */
-void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
-{
-	lockdep_assert_held(&c->state_lock);
-
-	bch2_dev_allocator_set_rw(c, ca, true);
-	c->rw_devs_change_count++;
-}
-
-void bch2_dev_allocator_background_exit(struct bch_dev *ca)
-{
-	darray_exit(&ca->discard_buckets_in_flight);
-}
-
-void bch2_dev_allocator_background_init(struct bch_dev *ca)
-{
-	mutex_init(&ca->discard_buckets_in_flight_lock);
-	INIT_WORK(&ca->discard_work, bch2_do_discards_work);
-	INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
-	INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
-}
-
-void bch2_fs_allocator_background_init(struct bch_fs *c)
-{
-	spin_lock_init(&c->freelist_lock);
-}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
deleted file mode 100644
index 0cc5adc55b6f..000000000000
--- a/fs/bcachefs/alloc_background.h
+++ /dev/null
@@ -1,361 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_BACKGROUND_H
-#define _BCACHEFS_ALLOC_BACKGROUND_H
-
-#include "bcachefs.h"
-#include "alloc_types.h"
-#include "buckets.h"
-#include "debug.h"
-#include "super.h"
-
-/* How out of date a pointer gen is allowed to be: */
-#define BUCKET_GC_GEN_MAX	96U
-
-static inline bool
bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) -{ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); - return ca && bucket_valid(ca, pos.offset); -} - -static inline u64 bucket_to_u64(struct bpos bucket) -{ - return (bucket.inode << 48) | bucket.offset; -} - -static inline struct bpos u64_to_bucket(u64 bucket) -{ - return POS(bucket >> 48, bucket & ~(~0ULL << 48)); -} - -static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) -{ - return a.gen - a.oldest_gen; -} - -static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src) -{ - dst->gen = src.gen; - dst->data_type = src.data_type; - dst->stripe_sectors = src.stripe_sectors; - dst->dirty_sectors = src.dirty_sectors; - dst->cached_sectors = src.cached_sectors; - dst->stripe = src.stripe; -} - -static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src) -{ - dst->gen = src.gen; - dst->data_type = src.data_type; - dst->stripe_sectors = src.stripe_sectors; - dst->dirty_sectors = src.dirty_sectors; - dst->cached_sectors = src.cached_sectors; - dst->stripe = src.stripe; -} - -static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) -{ - struct bch_alloc_v4 ret = {}; - __bucket_m_to_alloc(&ret, b); - return ret; -} - -static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) -{ - switch (data_type) { - case BCH_DATA_cached: - case BCH_DATA_stripe: - return BCH_DATA_user; - default: - return data_type; - } -} - -static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, - enum bch_data_type ptr) -{ - return !data_type_is_empty(bucket) && - bucket_data_type(bucket) != bucket_data_type(ptr); -} - -/* - * It is my general preference to use unsigned types for unsigned quantities - - * however, these helpers are used in disk accounting calculations run by - * triggers where the output will be negated and added to an s64. unsigned is - * right out even though all these quantities will fit in 32 bits, since it - * won't be sign extended correctly; u64 will negate "correctly", but s64 is the - * simpler option here. - */ -static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a) -{ - return a.stripe_sectors + a.dirty_sectors + a.cached_sectors; -} - -static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) -{ - return a.stripe_sectors + a.dirty_sectors; -} - -static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a) -{ - return a.data_type == BCH_DATA_cached - ? a.cached_sectors - : bch2_bucket_sectors_dirty(a); -} - -static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca, - struct bch_alloc_v4 a) -{ - int d = bch2_bucket_sectors(a); - - return d ? max(0, ca->mi.bucket_size - d) : 0; -} - -static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a) -{ - int d = a.stripe_sectors + a.dirty_sectors; - - return d ? max(0, ca->mi.bucket_size - d) : 0; -} - -static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a) -{ - return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0; -} - -static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, - enum bch_data_type data_type) -{ - if (a.stripe) - return data_type == BCH_DATA_parity ? 
data_type : BCH_DATA_stripe; - if (bch2_bucket_sectors_dirty(a)) - return bucket_data_type(data_type); - if (a.cached_sectors) - return BCH_DATA_cached; - if (BCH_ALLOC_V4_NEED_DISCARD(&a)) - return BCH_DATA_need_discard; - if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) - return BCH_DATA_need_gc_gens; - return BCH_DATA_free; -} - -static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type) -{ - a->data_type = alloc_data_type(*a, data_type); -} - -static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) -{ - return a.data_type == BCH_DATA_cached - ? a.io_time[READ] & LRU_TIME_MAX - : 0; -} - -#define DATA_TYPES_MOVABLE \ - ((1U << BCH_DATA_btree)| \ - (1U << BCH_DATA_user)| \ - (1U << BCH_DATA_stripe)) - -static inline bool data_type_movable(enum bch_data_type type) -{ - return (1U << type) & DATA_TYPES_MOVABLE; -} - -static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, - struct bch_dev *ca) -{ - if (a.data_type >= BCH_DATA_NR) - return 0; - - if (!data_type_movable(a.data_type) || - !bch2_bucket_sectors_fragmented(ca, a)) - return 0; - - /* - * avoid overflowing LRU_TIME_BITS on a corrupted fs, when - * bucket_sectors_dirty is (much) bigger than bucket_size - */ - u64 d = min_t(s64, bch2_bucket_sectors_dirty(a), - ca->mi.bucket_size); - - return div_u64(d * (1ULL << 31), ca->mi.bucket_size); -} - -static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) -{ - return ((u64) alloc_gc_gen(a) >> 4) << 56; -} - -static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) -{ - pos.offset |= alloc_freespace_genbits(a); - return pos; -} - -static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a) -{ - return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: - BCH_ALLOC_V4_U64s_V0) + - BCH_ALLOC_V4_NR_BACKPOINTERS(a) * - (sizeof(struct bch_backpointer) / sizeof(u64)); -} - -static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) -{ - unsigned ret = alloc_v4_u64s_noerror(a); - BUG_ON(ret > U8_MAX - BKEY_U64s); - return ret; -} - -static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) -{ - set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v)); -} - -struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos); -struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *, struct bpos, - enum btree_iter_update_trigger_flags); - -void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); - -static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert) -{ - const struct bch_alloc_v4 *ret; - - if (unlikely(k.k->type != KEY_TYPE_alloc_v4)) - goto slowpath; - - ret = bkey_s_c_to_alloc_v4(k).v; - if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s) - goto slowpath; - - return ret; -slowpath: - __bch2_alloc_to_v4(k, convert); - return convert; -} - -struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); - -int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); - -int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_alloc_v4_swab(struct bkey_s); -void bch2_alloc_to_text(struct printbuf *, struct 
bch_fs *, struct bkey_s_c); -void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_alloc ((struct bkey_ops) { \ - .key_validate = bch2_alloc_v1_validate, \ - .val_to_text = bch2_alloc_to_text, \ - .trigger = bch2_trigger_alloc, \ - .min_val_size = 8, \ -}) - -#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ - .key_validate = bch2_alloc_v2_validate, \ - .val_to_text = bch2_alloc_to_text, \ - .trigger = bch2_trigger_alloc, \ - .min_val_size = 8, \ -}) - -#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ - .key_validate = bch2_alloc_v3_validate, \ - .val_to_text = bch2_alloc_to_text, \ - .trigger = bch2_trigger_alloc, \ - .min_val_size = 16, \ -}) - -#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ - .key_validate = bch2_alloc_v4_validate, \ - .val_to_text = bch2_alloc_v4_to_text, \ - .swab = bch2_alloc_v4_swab, \ - .trigger = bch2_trigger_alloc, \ - .min_val_size = 48, \ -}) - -int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ - .key_validate = bch2_bucket_gens_validate, \ - .val_to_text = bch2_bucket_gens_to_text, \ -}) - -int bch2_bucket_gens_init(struct bch_fs *); - -static inline bool bkey_is_alloc(const struct bkey *k) -{ - return k->type == KEY_TYPE_alloc || - k->type == KEY_TYPE_alloc_v2 || - k->type == KEY_TYPE_alloc_v3; -} - -int bch2_alloc_read(struct bch_fs *); - -int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *, - const struct bch_alloc_v4 *, - const struct bch_alloc_v4 *, unsigned); -int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool); -int bch2_check_alloc_info(struct bch_fs *); -int bch2_check_alloc_to_lru_refs(struct bch_fs *); -void bch2_dev_do_discards(struct bch_dev *); -void bch2_do_discards(struct bch_fs *); - -static inline u64 should_invalidate_buckets(struct bch_dev *ca, - struct bch_dev_usage u) -{ - u64 want_free = ca->mi.nbuckets >> 7; - u64 free = max_t(s64, 0, - u.buckets[BCH_DATA_free] - + u.buckets[BCH_DATA_need_discard] - - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe)); - - return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]); -} - -void bch2_dev_do_invalidates(struct bch_dev *); -void bch2_do_invalidates(struct bch_fs *); - -static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) -{ - return (void *) ((u64 *) &a->v + - (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: - BCH_ALLOC_V4_U64s_V0)); -} - -static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a) -{ - return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); -} - -int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64); -int bch2_fs_freespace_init(struct bch_fs *); -int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *); - -void bch2_recalc_capacity(struct bch_fs *); -u64 bch2_min_rw_member_capacity(struct bch_fs *); - -void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool); -void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); -void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); - -void bch2_dev_allocator_background_exit(struct bch_dev *); -void 
bch2_dev_allocator_background_init(struct bch_dev *); - -void bch2_fs_allocator_background_init(struct bch_fs *); - -#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h deleted file mode 100644 index 740238369a5a..000000000000 --- a/fs/bcachefs/alloc_background_format.h +++ /dev/null @@ -1,95 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H -#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H - -struct bch_alloc { - struct bch_val v; - __u8 fields; - __u8 gen; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V1() \ - x(read_time, 16) \ - x(write_time, 16) \ - x(data_type, 8) \ - x(dirty_sectors, 16) \ - x(cached_sectors, 16) \ - x(oldest_gen, 8) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -enum { -#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, - BCH_ALLOC_FIELDS_V1() -#undef x -}; - -struct bch_alloc_v2 { - struct bch_val v; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V2() \ - x(read_time, 64) \ - x(write_time, 64) \ - x(dirty_sectors, 32) \ - x(cached_sectors, 32) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -struct bch_alloc_v3 { - struct bch_val v; - __le64 journal_seq; - __le32 flags; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) -LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) - -struct bch_alloc_v4 { - struct bch_val v; - __u64 journal_seq_nonempty; - __u32 flags; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 stripe_redundancy; - __u32 dirty_sectors; - __u32 cached_sectors; - __u64 io_time[2]; - __u32 stripe; - __u32 nr_external_backpointers; - /* end of fields in original version of alloc_v4 */ - __u64 journal_seq_empty; - __u32 stripe_sectors; - __u32 pad; -} __packed __aligned(8); - -#define BCH_ALLOC_V4_U64s_V0 6 -#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) - -BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) -BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) -BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) -BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) - -#define KEY_TYPE_BUCKET_GENS_BITS 8 -#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) -#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) - -struct bch_bucket_gens { - struct bch_val v; - u8 gens[KEY_TYPE_BUCKET_GENS_NR]; -} __packed __aligned(8); - -#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c deleted file mode 100644 index b58525ec7b4d..000000000000 --- a/fs/bcachefs/alloc_foreground.c +++ /dev/null @@ -1,1683 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2012 Google, Inc. - * - * Foreground allocator code: allocate buckets from freelist, and allocate in - * sector granularity from writepoints. - * - * bch2_bucket_alloc() allocates a single bucket from a specific device. - * - * bch2_bucket_alloc_set() allocates one or more buckets from different devices - * in a given filesystem. 
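- *
- * A rough usage sketch (illustrative only, not part of the original
- * comment; error handling elided). A caller that wants a single bucket on
- * a specific device might do:
- *
- *	struct open_bucket *ob = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal,
- *						    BCH_DATA_user, cl);
- *	if (!IS_ERR(ob)) {
- *		... write to the bucket, do the index update ...
- *		bch2_open_bucket_put(c, ob);
- *	}
- *
- * The open_bucket reference keeps the bucket visible to mark and sweep GC,
- * and must only be put after the index update that makes the allocation
- * reachable.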
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "btree_gc.h" -#include "buckets.h" -#include "buckets_waiting_for_journal.h" -#include "clock.h" -#include "debug.h" -#include "disk_groups.h" -#include "ec.h" -#include "error.h" -#include "io_write.h" -#include "journal.h" -#include "movinggc.h" -#include "nocow_locking.h" -#include "trace.h" - -#include <linux/math64.h> -#include <linux/rculist.h> -#include <linux/rcupdate.h> - -static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans, - struct mutex *lock) -{ - if (!mutex_trylock(lock)) { - bch2_trans_unlock(trans); - mutex_lock(lock); - } -} - -const char * const bch2_watermarks[] = { -#define x(t) #t, - BCH_WATERMARKS() -#undef x - NULL -}; - -/* - * Open buckets represent a bucket that's currently being allocated from. They - * serve two purposes: - * - * - They track buckets that have been partially allocated, allowing for - * sub-bucket sized allocations - they're used by the sector allocator below - * - * - They provide a reference to the buckets they own that mark and sweep GC - * can find, until the new allocation has a pointer to it inserted into the - * btree - * - * When allocating some space with the sector allocator, the allocation comes - * with a reference to an open bucket - the caller is required to put that - * reference _after_ doing the index update that makes its allocation reachable. - */ - -void bch2_reset_alloc_cursors(struct bch_fs *c) -{ - guard(rcu)(); - for_each_member_device_rcu(c, ca, NULL) - memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor)); -} - -static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) -{ - open_bucket_idx_t idx = ob - c->open_buckets; - open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); - - ob->hash = *slot; - *slot = idx; -} - -static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) -{ - open_bucket_idx_t idx = ob - c->open_buckets; - open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); - - while (*slot != idx) { - BUG_ON(!*slot); - slot = &c->open_buckets[*slot].hash; - } - - *slot = ob->hash; - ob->hash = 0; -} - -void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - if (ob->ec) { - ec_stripe_new_put(c, ob->ec, STRIPE_REF_io); - return; - } - - spin_lock(&ob->lock); - ob->valid = false; - ob->data_type = 0; - spin_unlock(&ob->lock); - - spin_lock(&c->freelist_lock); - bch2_open_bucket_hash_remove(c, ob); - - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - - c->open_buckets_nr_free++; - ca->nr_open_buckets--; - spin_unlock(&c->freelist_lock); - - closure_wake_up(&c->open_buckets_wait); -} - -void bch2_open_bucket_write_error(struct bch_fs *c, - struct open_buckets *obs, - unsigned dev, int err) -{ - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, obs, ob, i) - if (ob->dev == dev && ob->ec) - bch2_ec_bucket_cancel(c, ob, err); -} - -static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) -{ - struct open_bucket *ob; - - BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); - - ob = c->open_buckets + c->open_buckets_freelist; - c->open_buckets_freelist = ob->freelist; - atomic_set(&ob->pin, 1); - ob->data_type = 0; - - c->open_buckets_nr_free--; - return ob; -} - -static inline bool is_superblock_bucket(struct 
bch_fs *c, struct bch_dev *ca, u64 b) -{ - if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs)) - return false; - - return bch2_is_superblock_bucket(ca, b); -} - -static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) -{ - BUG_ON(c->open_buckets_partial_nr >= - ARRAY_SIZE(c->open_buckets_partial)); - - spin_lock(&c->freelist_lock); - scoped_guard(rcu) - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; - - ob->on_partial_list = true; - c->open_buckets_partial[c->open_buckets_partial_nr++] = - ob - c->open_buckets; - spin_unlock(&c->freelist_lock); - - closure_wake_up(&c->open_buckets_wait); - closure_wake_up(&c->freelist_wait); -} - -static inline bool may_alloc_bucket(struct bch_fs *c, - struct alloc_request *req, - struct bpos bucket) -{ - if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { - req->counters.skipped_open++; - return false; - } - - u64 journal_seq_ready = - bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, - bucket.inode, bucket.offset); - if (journal_seq_ready > c->journal.flushed_seq_ondisk) { - if (journal_seq_ready > c->journal.flushing_seq) - req->counters.need_journal_commit++; - req->counters.skipped_need_journal_commit++; - return false; - } - - if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { - req->counters.skipped_nocow++; - return false; - } - - return true; -} - -static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, - struct alloc_request *req, - u64 bucket, u8 gen, - struct closure *cl) -{ - struct bch_dev *ca = req->ca; - - if (unlikely(is_superblock_bucket(c, ca, bucket))) - return NULL; - - if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { - req->counters.skipped_nouse++; - return NULL; - } - - spin_lock(&c->freelist_lock); - - if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) { - if (cl) - closure_wait(&c->open_buckets_wait, cl); - - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); - spin_unlock(&c->freelist_lock); - return ERR_PTR(bch_err_throw(c, open_buckets_empty)); - } - - /* Recheck under lock: */ - if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { - spin_unlock(&c->freelist_lock); - req->counters.skipped_open++; - return NULL; - } - - struct open_bucket *ob = bch2_open_bucket_alloc(c); - - spin_lock(&ob->lock); - ob->valid = true; - ob->sectors_free = ca->mi.bucket_size; - ob->dev = ca->dev_idx; - ob->gen = gen; - ob->bucket = bucket; - spin_unlock(&ob->lock); - - ca->nr_open_buckets++; - bch2_open_bucket_hash_add(c, ob); - - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false); - track_event_change(&c->times[BCH_TIME_blocked_allocate], false); - - spin_unlock(&c->freelist_lock); - return ob; -} - -static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, - struct alloc_request *req, - struct btree_iter *freespace_iter, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - - if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b))) - return NULL; - - u8 gen; - int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); - if (ret < 0) - return ERR_PTR(ret); - if (ret) - return NULL; - - return __try_alloc_bucket(c, req, b, gen, cl); -} - -/* - * This path is for before the freespace btree is initialized: - */ -static noinline struct open_bucket * -bch2_bucket_alloc_early(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl) -{ - struct bch_fs 
*c = trans->c; - struct bch_dev *ca = req->ca; - struct btree_iter iter, citer; - struct bkey_s_c k, ck; - struct open_bucket *ob = NULL; - u64 first_bucket = ca->mi.first_bucket; - u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; - u64 alloc_start = max(first_bucket, *dev_alloc_cursor); - u64 alloc_cursor = alloc_start; - int ret; - - /* - * Scan with an uncached iterator to avoid polluting the key cache. An - * uncached iter will return a cached key if one exists, but if not - * there is no other underlying protection for the associated key cache - * slot. To avoid racing bucket allocations, look up the cached key slot - * of any likely allocation candidate before attempting to proceed with - * the allocation. This provides proper exclusion on the associated - * bucket. - */ -again: - for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), - BTREE_ITER_slots, k, ret) { - u64 bucket = k.k->p.offset; - - if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) - break; - - if (req->btree_bitmap != BTREE_BITMAP_ANY && - req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, - bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (req->btree_bitmap == BTREE_BITMAP_YES && - bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) - break; - - bucket = sector_to_bucket(ca, - round_up(bucket_to_sector(ca, bucket) + 1, - 1ULL << ca->mi.btree_bitmap_shift)); - bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket)); - req->counters.buckets_seen++; - req->counters.skipped_mi_btree_bitmap++; - continue; - } - - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - if (a->data_type != BCH_DATA_free) - continue; - - /* now check the cached key to serialize concurrent allocs of the bucket */ - ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached); - ret = bkey_err(ck); - if (ret) - break; - - a = bch2_alloc_to_v4(ck, &a_convert); - if (a->data_type != BCH_DATA_free) - goto next; - - req->counters.buckets_seen++; - - ob = may_alloc_bucket(c, req, k.k->p) - ? 
__try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl)
-			: NULL;
-next:
-		bch2_set_btree_iter_dontneed(trans, &citer);
-		bch2_trans_iter_exit(trans, &citer);
-		if (ob)
-			break;
-	}
-	bch2_trans_iter_exit(trans, &iter);
-
-	alloc_cursor = iter.pos.offset;
-
-	if (!ob && ret)
-		ob = ERR_PTR(ret);
-
-	if (!ob && alloc_start > first_bucket) {
-		alloc_cursor = alloc_start = first_bucket;
-		goto again;
-	}
-
-	*dev_alloc_cursor = alloc_cursor;
-
-	return ob;
-}
-
-static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
-						      struct alloc_request *req,
-						      struct closure *cl)
-{
-	struct bch_dev *ca = req->ca;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct open_bucket *ob = NULL;
-	u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
-	u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
-	u64 alloc_cursor = alloc_start;
-	int ret;
-again:
-	for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace,
-					 POS(ca->dev_idx, alloc_cursor),
-					 POS(ca->dev_idx, U64_MAX),
-					 0, k, ret) {
-		/*
-		 * peek normally doesn't trim extents - they can span iter.pos,
-		 * which is not what we want here:
-		 */
-		iter.k.size = iter.k.p.offset - iter.pos.offset;
-
-		while (iter.k.size) {
-			req->counters.buckets_seen++;
-
-			u64 bucket = iter.pos.offset & ~(~0ULL << 56);
-			if (req->btree_bitmap != BTREE_BITMAP_ANY &&
-			    req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
-					bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
-				if (req->btree_bitmap == BTREE_BITMAP_YES &&
-				    bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
-					goto fail;
-
-				bucket = sector_to_bucket(ca,
-						round_up(bucket_to_sector(ca, bucket + 1),
-							 1ULL << ca->mi.btree_bitmap_shift));
-				alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
-
-				bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor));
-				req->counters.skipped_mi_btree_bitmap++;
-				goto next;
-			}
-
-			ob = try_alloc_bucket(trans, req, &iter, cl);
-			if (ob) {
-				if (!IS_ERR(ob))
-					*dev_alloc_cursor = iter.pos.offset;
-				bch2_set_btree_iter_dontneed(trans, &iter);
-				break;
-			}
-
-			iter.k.size--;
-			iter.pos.offset++;
-		}
-next:
-		if (ob || ret)
-			break;
-	}
-fail:
-	bch2_trans_iter_exit(trans, &iter);
-
-	BUG_ON(ob && ret);
-
-	if (ret)
-		ob = ERR_PTR(ret);
-
-	if (!ob && alloc_start > ca->mi.first_bucket) {
-		alloc_cursor = alloc_start = ca->mi.first_bucket;
-		goto again;
-	}
-
-	return ob;
-}
-
-static noinline void trace_bucket_alloc2(struct bch_fs *c,
-					 struct alloc_request *req,
-					 struct closure *cl,
-					 struct open_bucket *ob)
-{
-	struct printbuf buf = PRINTBUF;
-
-	printbuf_tabstop_push(&buf, 24);
-
-	prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx);
-	prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]);
-	prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]);
-	prt_printf(&buf, "blocking\t%u\n", cl != NULL);
-	prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]);
-	prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark));
-	prt_printf(&buf, "copygc_wait\t%llu/%lli\n",
-		   bch2_copygc_wait_amount(c),
-		   c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
-	prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen);
-	prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open);
-	prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit);
-	prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow);
-	prt_printf(&buf, "nouse\t%llu\n",
req->counters.skipped_nouse); - prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap); - - if (!IS_ERR(ob)) { - prt_printf(&buf, "allocated\t%llu\n", ob->bucket); - trace_bucket_alloc(c, buf.buf); - } else { - prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob))); - trace_bucket_alloc_fail(c, buf.buf); - } - - printbuf_exit(&buf); -} - -/** - * bch2_bucket_alloc_trans - allocate a single bucket from a specific device - * @trans: transaction object - * @req: state for the entire allocation - * @cl: if not NULL, closure to be used to wait if buckets not available - * @nowait: if true, do not wait for buckets to become available - * - * Returns: an open_bucket on success, or an ERR_PTR() on failure. - */ -static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl, - bool nowait) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca = req->ca; - struct open_bucket *ob = NULL; - bool freespace = READ_ONCE(ca->mi.freespace_initialized); - u64 avail; - bool waiting = nowait; - - req->btree_bitmap = req->data_type == BCH_DATA_btree; - memset(&req->counters, 0, sizeof(req->counters)); -again: - bch2_dev_usage_read_fast(ca, &req->usage); - avail = dev_buckets_free(ca, req->usage, req->watermark); - - if (req->usage.buckets[BCH_DATA_need_discard] > - min(avail, ca->mi.nbuckets >> 7)) - bch2_dev_do_discards(ca); - - if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail) - bch2_gc_gens_async(c); - - if (should_invalidate_buckets(ca, req->usage)) - bch2_dev_do_invalidates(ca); - - if (!avail) { - if (req->watermark > BCH_WATERMARK_normal && - c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) - goto alloc; - - if (cl && !waiting) { - closure_wait(&c->freelist_wait, cl); - waiting = true; - goto again; - } - - track_event_change(&c->times[BCH_TIME_blocked_allocate], true); - - ob = ERR_PTR(bch_err_throw(c, freelist_empty)); - goto err; - } - - if (waiting) - closure_wake_up(&c->freelist_wait); -alloc: - ob = likely(freespace) - ? bch2_bucket_alloc_freelist(trans, req, cl) - : bch2_bucket_alloc_early(trans, req, cl); - - if (req->counters.need_journal_commit * 2 > avail) - bch2_journal_flush_async(&c->journal, NULL); - - if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) { - req->btree_bitmap = BTREE_BITMAP_ANY; - goto alloc; - } - - if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) { - freespace = false; - goto alloc; - } -err: - if (!ob) - ob = ERR_PTR(bch_err_throw(c, no_buckets_found)); - - if (!IS_ERR(ob)) - ob->data_type = req->data_type; - - if (!IS_ERR(ob)) - count_event(c, bucket_alloc); - else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) - count_event(c, bucket_alloc_fail); - - if (!IS_ERR(ob) - ? 
trace_bucket_alloc_enabled()
-		: trace_bucket_alloc_fail_enabled())
-		trace_bucket_alloc2(c, req, cl, ob);
-
-	return ob;
-}
-
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
-				      enum bch_watermark watermark,
-				      enum bch_data_type data_type,
-				      struct closure *cl)
-{
-	struct open_bucket *ob;
-	struct alloc_request req = {
-		.watermark	= watermark,
-		.data_type	= data_type,
-		.ca		= ca,
-	};
-
-	bch2_trans_do(c,
-		PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false)));
-	return ob;
-}
-
-static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
-			    unsigned l, unsigned r)
-{
-	return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]);
-}
-
-#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
-
-void bch2_dev_alloc_list(struct bch_fs *c,
-			 struct dev_stripe_state *stripe,
-			 struct bch_devs_mask *devs,
-			 struct dev_alloc_list *ret)
-{
-	ret->nr = 0;
-
-	unsigned i;
-	for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
-		ret->data[ret->nr++] = i;
-
-	bubble_sort(ret->data, ret->nr, dev_stripe_cmp);
-}
-
-static const u64 stripe_clock_hand_rescale	= 1ULL << 62; /* trigger rescale at */
-static const u64 stripe_clock_hand_max		= 1ULL << 56; /* max after rescale */
-static const u64 stripe_clock_hand_inv		= 1ULL << 52; /* max increment, if a device is empty */
-
-static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe)
-{
-	/*
-	 * Avoid underflowing clock hands if at all possible; if clock hands go
-	 * to 0 then we lose information - clock hands can be in a wide range if
-	 * we have devices we rarely try to allocate from, if we generally
-	 * allocate from a specified target but only sometimes have to fall back
-	 * to the whole filesystem.
-	 */
-	u64 scale_max = U64_MAX;	/* maximum we can subtract without underflow */
-	u64 scale_min = 0;		/* minimum we must subtract to avoid overflow */
-
-	for (u64 *v = stripe->next_alloc;
-	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) {
-		if (*v)
-			scale_max = min(scale_max, *v);
-		if (*v > stripe_clock_hand_max)
-			scale_min = max(scale_min, *v - stripe_clock_hand_max);
-	}
-
-	u64 scale = max(scale_min, scale_max);
-
-	for (u64 *v = stripe->next_alloc;
-	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
-		*v = *v < scale ? 0 : *v - scale;
-}
-
-static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
-			       struct dev_stripe_state *stripe,
-			       struct bch_dev_usage *usage)
-{
-	/*
-	 * Stripe state has a per device clock hand: we allocate from the device
-	 * with the smallest clock hand.
-	 *
-	 * When we allocate, we don't do a simple increment; we add the inverse
-	 * of the device's free space. This results in round robin behavior that
-	 * biases in favor of the device(s) with more free space.
-	 */
-
-	u64 *v = stripe->next_alloc + ca->dev_idx;
-	u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
-	u64 free_space_inv = free_space
-		? div64_u64(stripe_clock_hand_inv, free_space)
-		: stripe_clock_hand_inv;
-
-	/* Saturating add, avoid overflow: */
-	u64 sum = *v + free_space_inv;
-	*v = sum >= *v ?
sum : U64_MAX; - - if (unlikely(*v > stripe_clock_hand_rescale)) - bch2_stripe_state_rescale(stripe); -} - -void bch2_dev_stripe_increment(struct bch_dev *ca, - struct dev_stripe_state *stripe) -{ - struct bch_dev_usage usage; - - bch2_dev_usage_read_fast(ca, &usage); - bch2_dev_stripe_increment_inlined(ca, stripe, &usage); -} - -static int add_new_bucket(struct bch_fs *c, - struct alloc_request *req, - struct open_bucket *ob) -{ - unsigned durability = ob_dev(c, ob)->mi.durability; - - BUG_ON(req->nr_effective >= req->nr_replicas); - - __clear_bit(ob->dev, req->devs_may_alloc.d); - req->nr_effective += durability; - req->have_cache |= !durability; - - ob_push(c, &req->ptrs, ob); - - if (req->nr_effective >= req->nr_replicas) - return 1; - if (ob->ec) - return 1; - return 0; -} - -inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - struct alloc_request *req, - struct dev_stripe_state *stripe, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - BUG_ON(req->nr_effective >= req->nr_replicas); - - bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted); - - darray_for_each(req->devs_sorted, i) { - req->ca = bch2_dev_tryget_noerror(c, *i); - if (!req->ca) - continue; - - if (!req->ca->mi.durability && req->have_cache) { - bch2_dev_put(req->ca); - continue; - } - - struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl, - req->flags & BCH_WRITE_alloc_nowait); - if (!IS_ERR(ob)) - bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage); - bch2_dev_put(req->ca); - - if (IS_ERR(ob)) { - ret = PTR_ERR(ob); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl) - break; - continue; - } - - ret = add_new_bucket(c, req, ob); - if (ret) - break; - } - - if (ret == 1) - return 0; - if (ret) - return ret; - return bch_err_throw(c, insufficient_devices); -} - -/* Allocate from stripes: */ - -/* - * if we can't allocate a new stripe because there are already too many - * partially filled stripes, force allocating from an existing stripe even when - * it's to a device we don't want: - */ - -static int bucket_alloc_from_stripe(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - if (req->nr_replicas < 2) - return 0; - - if (ec_open_bucket(c, &req->ptrs)) - return 0; - - struct ec_stripe_head *h = - bch2_ec_stripe_head_get(trans, req, 0, cl); - if (IS_ERR(h)) - return PTR_ERR(h); - if (!h) - return 0; - - bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted); - - darray_for_each(req->devs_sorted, i) - for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { - if (!h->s->blocks[ec_idx]) - continue; - - struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx]; - if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) { - ob->ec_idx = ec_idx; - ob->ec = h->s; - ec_stripe_new_get(h->s, STRIPE_REF_io); - - ret = add_new_bucket(c, req, ob); - goto out; - } - } -out: - bch2_ec_stripe_head_put(c, h); - return ret; -} - -/* Sector allocator */ - -static bool want_bucket(struct bch_fs *c, - struct alloc_request *req, - struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - if (!test_bit(ob->dev, req->devs_may_alloc.d)) - return false; - - if (ob->data_type != req->wp->data_type) - return false; - - if (!ca->mi.durability && - (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache)) - return false; - - if (req->ec != (ob->ec != NULL)) - return false; - - return true; -} - -static int 
bucket_alloc_set_writepoint(struct bch_fs *c, - struct alloc_request *req) -{ - struct open_bucket *ob; - unsigned i; - int ret = 0; - - req->scratch_ptrs.nr = 0; - - open_bucket_for_each(c, &req->wp->ptrs, ob, i) { - if (!ret && want_bucket(c, req, ob)) - ret = add_new_bucket(c, req, ob); - else - ob_push(c, &req->scratch_ptrs, ob); - } - req->wp->ptrs = req->scratch_ptrs; - - return ret; -} - -static int bucket_alloc_set_partial(struct bch_fs *c, - struct alloc_request *req) -{ - int i, ret = 0; - - if (!c->open_buckets_partial_nr) - return 0; - - spin_lock(&c->freelist_lock); - - if (!c->open_buckets_partial_nr) - goto unlock; - - for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { - struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; - - if (want_bucket(c, req, ob)) { - struct bch_dev *ca = ob_dev(c, ob); - u64 avail; - - bch2_dev_usage_read_fast(ca, &req->usage); - avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets; - if (!avail) - continue; - - array_remove_item(c->open_buckets_partial, - c->open_buckets_partial_nr, - i); - ob->on_partial_list = false; - - scoped_guard(rcu) - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - - ret = add_new_bucket(c, req, ob); - if (ret) - break; - } - } -unlock: - spin_unlock(&c->freelist_lock); - return ret; -} - -static int __open_bucket_add_buckets(struct btree_trans *trans, - struct alloc_request *req, - struct closure *_cl) -{ - struct bch_fs *c = trans->c; - struct open_bucket *ob; - struct closure *cl = NULL; - unsigned i; - int ret; - - req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target); - - /* Don't allocate from devices we already have pointers to: */ - darray_for_each(*req->devs_have, i) - __clear_bit(*i, req->devs_may_alloc.d); - - open_bucket_for_each(c, &req->ptrs, ob, i) - __clear_bit(ob->dev, req->devs_may_alloc.d); - - ret = bucket_alloc_set_writepoint(c, req); - if (ret) - return ret; - - ret = bucket_alloc_set_partial(c, req); - if (ret) - return ret; - - if (req->ec) { - ret = bucket_alloc_from_stripe(trans, req, _cl); - } else { -retry_blocking: - /* - * Try nonblocking first, so that if one device is full we'll try from - * other devices: - */ - ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl); - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && - !cl && _cl) { - cl = _cl; - goto retry_blocking; - } - } - - return ret; -} - -static int open_bucket_add_buckets(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl) -{ - int ret; - - if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) { - ret = __open_bucket_add_buckets(trans, req, cl); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, BCH_ERR_operation_blocked) || - bch2_err_matches(ret, BCH_ERR_freelist_empty) || - bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - return ret; - if (req->nr_effective >= req->nr_replicas) - return 0; - } - - bool ec = false; - swap(ec, req->ec); - ret = __open_bucket_add_buckets(trans, req, cl); - swap(ec, req->ec); - - return ret < 0 ? 
ret : 0;
-}
-
-/**
- * should_drop_bucket - check if this open_bucket should go away
- * @ob:	open_bucket to predicate on
- * @c:	filesystem handle
- * @ca:	if set, we're killing buckets for a particular device
- * @ec:	if true, we're shutting down erasure coding and killing all ec
- *	open_buckets; if neither @ca nor @ec is set, every open_bucket matches
- * Returns: true if we should kill this open_bucket
- *
- * We're killing open_buckets because we're shutting down a device, erasure
- * coding, or the entire filesystem - check if this open_bucket matches:
- */
-static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
-			       struct bch_dev *ca, bool ec)
-{
-	if (ec) {
-		return ob->ec != NULL;
-	} else if (ca) {
-		bool drop = ob->dev == ca->dev_idx;
-		struct open_bucket *ob2;
-		unsigned i;
-
-		if (!drop && ob->ec) {
-			unsigned nr_blocks;
-
-			mutex_lock(&ob->ec->lock);
-			nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
-
-			for (i = 0; i < nr_blocks; i++) {
-				if (!ob->ec->blocks[i])
-					continue;
-
-				ob2 = c->open_buckets + ob->ec->blocks[i];
-				drop |= ob2->dev == ca->dev_idx;
-			}
-			mutex_unlock(&ob->ec->lock);
-		}
-
-		return drop;
-	} else {
-		return true;
-	}
-}
-
-static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
-				 bool ec, struct write_point *wp)
-{
-	struct open_buckets ptrs = { .nr = 0 };
-	struct open_bucket *ob;
-	unsigned i;
-
-	mutex_lock(&wp->lock);
-	open_bucket_for_each(c, &wp->ptrs, ob, i)
-		if (should_drop_bucket(ob, c, ca, ec))
-			bch2_open_bucket_put(c, ob);
-		else
-			ob_push(c, &ptrs, ob);
-	wp->ptrs = ptrs;
-	mutex_unlock(&wp->lock);
-}
-
-void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
-			    bool ec)
-{
-	unsigned i;
-
-	/* Next, close write points that point to this device... */
-	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
-		bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
-
-	bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
-	bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
-	bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
-
-	mutex_lock(&c->btree_reserve_cache_lock);
-	while (c->btree_reserve_cache_nr) {
-		struct btree_alloc *a =
-			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
-		bch2_open_buckets_put(c, &a->ob);
-	}
-	mutex_unlock(&c->btree_reserve_cache_lock);
-
-	spin_lock(&c->freelist_lock);
-	i = 0;
-	while (i < c->open_buckets_partial_nr) {
-		struct open_bucket *ob =
-			c->open_buckets + c->open_buckets_partial[i];
-
-		if (should_drop_bucket(ob, c, ca, ec)) {
-			--c->open_buckets_partial_nr;
-			swap(c->open_buckets_partial[i],
-			     c->open_buckets_partial[c->open_buckets_partial_nr]);
-
-			ob->on_partial_list = false;
-
-			scoped_guard(rcu)
-				bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
-
-			spin_unlock(&c->freelist_lock);
-			bch2_open_bucket_put(c, ob);
-			spin_lock(&c->freelist_lock);
-		} else {
-			i++;
-		}
-	}
-	spin_unlock(&c->freelist_lock);
-
-	bch2_ec_stop_dev(c, ca);
-}
-
-static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
-						 unsigned long write_point)
-{
-	unsigned hash =
-		hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
-
-	return &c->write_points_hash[hash];
-}
-
-static struct write_point *__writepoint_find(struct hlist_head *head,
-					     unsigned long write_point)
-{
-	struct write_point *wp;
-
-	guard(rcu)();
-	hlist_for_each_entry_rcu(wp, head, node)
-		if (wp->write_point == write_point)
-			return wp;
-	return NULL;
-}
-
-static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
-{
-	u64 stranded = c->write_points_nr *
c->bucket_size_max; - u64 free = bch2_fs_usage_read_short(c).free; - - return stranded * factor > free; -} - -static noinline bool try_increase_writepoints(struct bch_fs *c) -{ - struct write_point *wp; - - if (c->write_points_nr == ARRAY_SIZE(c->write_points) || - too_many_writepoints(c, 32)) - return false; - - wp = c->write_points + c->write_points_nr++; - hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); - return true; -} - -static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) -{ - struct bch_fs *c = trans->c; - struct write_point *wp; - struct open_bucket *ob; - unsigned i; - - mutex_lock(&c->write_points_hash_lock); - if (c->write_points_nr < old_nr) { - mutex_unlock(&c->write_points_hash_lock); - return true; - } - - if (c->write_points_nr == 1 || - !too_many_writepoints(c, 8)) { - mutex_unlock(&c->write_points_hash_lock); - return false; - } - - wp = c->write_points + --c->write_points_nr; - - hlist_del_rcu(&wp->node); - mutex_unlock(&c->write_points_hash_lock); - - bch2_trans_mutex_lock_norelock(trans, &wp->lock); - open_bucket_for_each(c, &wp->ptrs, ob, i) - open_bucket_free_unused(c, ob); - wp->ptrs.nr = 0; - mutex_unlock(&wp->lock); - return true; -} - -static struct write_point *writepoint_find(struct btree_trans *trans, - unsigned long write_point) -{ - struct bch_fs *c = trans->c; - struct write_point *wp, *oldest; - struct hlist_head *head; - - if (!(write_point & 1UL)) { - wp = (struct write_point *) write_point; - bch2_trans_mutex_lock_norelock(trans, &wp->lock); - return wp; - } - - head = writepoint_hash(c, write_point); -restart_find: - wp = __writepoint_find(head, write_point); - if (wp) { -lock_wp: - bch2_trans_mutex_lock_norelock(trans, &wp->lock); - if (wp->write_point == write_point) - goto out; - mutex_unlock(&wp->lock); - goto restart_find; - } -restart_find_oldest: - oldest = NULL; - for (wp = c->write_points; - wp < c->write_points + c->write_points_nr; wp++) - if (!oldest || time_before64(wp->last_used, oldest->last_used)) - oldest = wp; - - bch2_trans_mutex_lock_norelock(trans, &oldest->lock); - bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock); - if (oldest >= c->write_points + c->write_points_nr || - try_increase_writepoints(c)) { - mutex_unlock(&c->write_points_hash_lock); - mutex_unlock(&oldest->lock); - goto restart_find_oldest; - } - - wp = __writepoint_find(head, write_point); - if (wp && wp != oldest) { - mutex_unlock(&c->write_points_hash_lock); - mutex_unlock(&oldest->lock); - goto lock_wp; - } - - wp = oldest; - hlist_del_rcu(&wp->node); - wp->write_point = write_point; - hlist_add_head_rcu(&wp->node, head); - mutex_unlock(&c->write_points_hash_lock); -out: - wp->last_used = local_clock(); - return wp; -} - -static noinline void -deallocate_extra_replicas(struct bch_fs *c, - struct alloc_request *req) -{ - struct open_bucket *ob; - unsigned extra_replicas = req->nr_effective - req->nr_replicas; - unsigned i; - - req->scratch_ptrs.nr = 0; - - open_bucket_for_each(c, &req->ptrs, ob, i) { - unsigned d = ob_dev(c, ob)->mi.durability; - - if (d && d <= extra_replicas) { - extra_replicas -= d; - ob_push(c, &req->wp->ptrs, ob); - } else { - ob_push(c, &req->scratch_ptrs, ob); - } - } - - req->ptrs = req->scratch_ptrs; -} - -/* - * Get us an open_bucket we can allocate from, return with it locked: - */ -int bch2_alloc_sectors_start_trans(struct btree_trans *trans, - unsigned target, - unsigned erasure_code, - struct write_point_specifier write_point, - struct bch_devs_list *devs_have, 
- unsigned nr_replicas, - unsigned nr_replicas_required, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl, - struct write_point **wp_ret) -{ - struct bch_fs *c = trans->c; - struct open_bucket *ob; - unsigned write_points_nr; - int i; - - struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req)); - int ret = PTR_ERR_OR_ZERO(req); - if (unlikely(ret)) - return ret; - - if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) - erasure_code = false; - - req->nr_replicas = nr_replicas; - req->target = target; - req->ec = erasure_code; - req->watermark = watermark; - req->flags = flags; - req->devs_have = devs_have; - - BUG_ON(!nr_replicas || !nr_replicas_required); -retry: - req->ptrs.nr = 0; - req->nr_effective = 0; - req->have_cache = false; - write_points_nr = c->write_points_nr; - - *wp_ret = req->wp = writepoint_find(trans, write_point.v); - - req->data_type = req->wp->data_type; - - ret = bch2_trans_relock(trans); - if (ret) - goto err; - - /* metadata may not allocate on cache devices: */ - if (req->data_type != BCH_DATA_user) - req->have_cache = true; - - if (target && !(flags & BCH_WRITE_only_specified_devs)) { - ret = open_bucket_add_buckets(trans, req, NULL); - if (!ret || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto alloc_done; - - /* Don't retry from all devices if we're out of open buckets: */ - if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { - int ret2 = open_bucket_add_buckets(trans, req, cl); - if (!ret2 || - bch2_err_matches(ret2, BCH_ERR_transaction_restart) || - bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { - ret = ret2; - goto alloc_done; - } - } - - /* - * Only try to allocate cache (durability = 0 devices) from the - * specified target: - */ - req->have_cache = true; - req->target = 0; - - ret = open_bucket_add_buckets(trans, req, cl); - } else { - ret = open_bucket_add_buckets(trans, req, cl); - } -alloc_done: - BUG_ON(!ret && req->nr_effective < req->nr_replicas); - - if (erasure_code && !ec_open_bucket(c, &req->ptrs)) - pr_debug("failed to get ec bucket: ret %u", ret); - - if (ret == -BCH_ERR_insufficient_devices && - req->nr_effective >= nr_replicas_required) - ret = 0; - - if (ret) - goto err; - - if (req->nr_effective > req->nr_replicas) - deallocate_extra_replicas(c, req); - - /* Free buckets we didn't use: */ - open_bucket_for_each(c, &req->wp->ptrs, ob, i) - open_bucket_free_unused(c, ob); - - req->wp->ptrs = req->ptrs; - - req->wp->sectors_free = UINT_MAX; - - open_bucket_for_each(c, &req->wp->ptrs, ob, i) { - /* - * Ensure proper write alignment - either due to misaligned - * bucket sizes (from buggy bcachefs-tools), or writes that mix - * logical/physical alignment: - */ - struct bch_dev *ca = ob_dev(c, ob); - u64 offset = bucket_to_sector(ca, ob->bucket) + - ca->mi.bucket_size - - ob->sectors_free; - unsigned align = round_up(offset, block_sectors(c)) - offset; - - ob->sectors_free = max_t(int, 0, ob->sectors_free - align); - - req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free); - } - - req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c)); - - /* Did alignment use up space in an open_bucket? 
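- *
- * Illustrative arithmetic (numbers invented for this example): if the next
- * write would start at device sector offset 1001 and block_sectors(c) is 8,
- * then round_up(1001, 8) = 1008, so align = 7 and that open_bucket's
- * sectors_free shrinks by 7; after rounding wp->sectors_free down to the
- * block size it may reach 0, in which case we release the write point and
- * retry: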
*/
-	if (unlikely(!req->wp->sectors_free)) {
-		bch2_alloc_sectors_done(c, req->wp);
-		goto retry;
-	}
-
-	BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX);
-
-	return 0;
-err:
-	open_bucket_for_each(c, &req->wp->ptrs, ob, i)
-		if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v))
-			ob_push(c, &req->ptrs, ob);
-		else
-			open_bucket_free_unused(c, ob);
-	req->wp->ptrs = req->ptrs;
-
-	mutex_unlock(&req->wp->lock);
-
-	if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
-	    try_decrease_writepoints(trans, write_points_nr))
-		goto retry;
-
-	if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
-		ret = bch_err_throw(c, bucket_alloc_blocked);
-
-	if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
-	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
-		ret = bch_err_throw(c, bucket_alloc_blocked);
-
-	return ret;
-}
-
-/*
- * Append pointers to the space we just allocated to @k, and mark @sectors space
- * as allocated out of @ob
- */
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
-				    struct bkey_i *k, unsigned sectors,
-				    bool cached)
-{
-	bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
-}
-
-/*
- * Finished with @wp: release any open buckets that are now full, and unlock it
- */
-void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
-{
-	bch2_alloc_sectors_done_inlined(c, wp);
-}
-
-static inline void writepoint_init(struct write_point *wp,
-				   enum bch_data_type type)
-{
-	mutex_init(&wp->lock);
-	wp->data_type = type;
-
-	INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
-	INIT_LIST_HEAD(&wp->writes);
-	spin_lock_init(&wp->writes_lock);
-}
-
-void bch2_fs_allocator_foreground_init(struct bch_fs *c)
-{
-	struct open_bucket *ob;
-	struct write_point *wp;
-
-	mutex_init(&c->write_points_hash_lock);
-	c->write_points_nr = ARRAY_SIZE(c->write_points);
-
-	/* open bucket 0 is a sentinel NULL: */
-	spin_lock_init(&c->open_buckets[0].lock);
-
-	for (ob = c->open_buckets + 1;
-	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
-		spin_lock_init(&ob->lock);
-		c->open_buckets_nr_free++;
-
-		ob->freelist = c->open_buckets_freelist;
-		c->open_buckets_freelist = ob - c->open_buckets;
-	}
-
-	writepoint_init(&c->btree_write_point, BCH_DATA_btree);
-	writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
-	writepoint_init(&c->copygc_write_point, BCH_DATA_user);
-
-	for (wp = c->write_points;
-	     wp < c->write_points + c->write_points_nr; wp++) {
-		writepoint_init(wp, BCH_DATA_user);
-
-		wp->last_used = local_clock();
-		wp->write_point = (unsigned long) wp;
-		hlist_add_head_rcu(&wp->node,
-				   writepoint_hash(c, wp->write_point));
-	}
-}
-
-void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
-{
-	struct bch_dev *ca = ob_dev(c, ob);
-	unsigned data_type = ob->data_type;
-	barrier();	/* READ_ONCE() doesn't work on bitfields */
-
-	prt_printf(out, "%zu ref %u ",
-		   ob - c->open_buckets,
-		   atomic_read(&ob->pin));
-	bch2_prt_data_type(out, data_type);
-	prt_printf(out, " %u:%llu gen %u allocated %u/%u",
-		   ob->dev, ob->bucket, ob->gen,
-		   ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
-	if (ob->ec)
-		prt_printf(out, " ec idx %llu", ob->ec->idx);
-	if (ob->on_partial_list)
-		prt_str(out, " partial");
-	prt_newline(out);
-}
-
-void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
-			       struct bch_dev *ca)
-{
-	struct open_bucket *ob;
-
-	out->atomic++;
-
-	for (ob = c->open_buckets;
-	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
-	     ob++) {
-		spin_lock(&ob->lock);
-		if (ob->valid && (!ca || ob->dev == ca->dev_idx))
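-			/* ob->lock is held, so the open_bucket can't be freed or reused under us */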
bch2_open_bucket_to_text(out, c, ob); - spin_unlock(&ob->lock); - } - - --out->atomic; -} - -void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c) -{ - unsigned i; - - out->atomic++; - spin_lock(&c->freelist_lock); - - for (i = 0; i < c->open_buckets_partial_nr; i++) - bch2_open_bucket_to_text(out, c, - c->open_buckets + c->open_buckets_partial[i]); - - spin_unlock(&c->freelist_lock); - --out->atomic; -} - -static const char * const bch2_write_point_states[] = { -#define x(n) #n, - WRITE_POINT_STATES() -#undef x - NULL -}; - -static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, - struct write_point *wp) -{ - struct open_bucket *ob; - unsigned i; - - mutex_lock(&wp->lock); - - prt_printf(out, "%lu: ", wp->write_point); - prt_human_readable_u64(out, wp->sectors_allocated << 9); - - prt_printf(out, " last wrote: "); - bch2_pr_time_units(out, sched_clock() - wp->last_used); - - for (i = 0; i < WRITE_POINT_STATE_NR; i++) { - prt_printf(out, " %s: ", bch2_write_point_states[i]); - bch2_pr_time_units(out, wp->time[i]); - } - - prt_newline(out); - - printbuf_indent_add(out, 2); - open_bucket_for_each(c, &wp->ptrs, ob, i) - bch2_open_bucket_to_text(out, c, ob); - printbuf_indent_sub(out, 2); - - mutex_unlock(&wp->lock); -} - -void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct write_point *wp; - - prt_str(out, "Foreground write points\n"); - for (wp = c->write_points; - wp < c->write_points + ARRAY_SIZE(c->write_points); - wp++) - bch2_write_point_to_text(out, c, wp); - - prt_str(out, "Copygc write point\n"); - bch2_write_point_to_text(out, c, &c->copygc_write_point); - - prt_str(out, "Rebalance write point\n"); - bch2_write_point_to_text(out, c, &c->rebalance_write_point); - - prt_str(out, "Btree write point\n"); - bch2_write_point_to_text(out, c, &c->btree_write_point); -} - -void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) -{ - unsigned nr[BCH_DATA_NR]; - - memset(nr, 0, sizeof(nr)); - - for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) - nr[c->open_buckets[i].data_type]++; - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 24); - - prt_printf(out, "capacity\t%llu\n", c->capacity); - prt_printf(out, "reserved\t%llu\n", c->reserved); - prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden)); - prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree)); - prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data)); - prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached)); - prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved)); - prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved)); - prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes)); - - prt_newline(out); - prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty"); - prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); - prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT); - prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? 
"waiting" : "empty"); - prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]); - prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]); - prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr); -} - -void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca); - unsigned nr[BCH_DATA_NR]; - - memset(nr, 0, sizeof(nr)); - - for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) - nr[c->open_buckets[i].data_type]++; - - bch2_dev_usage_to_text(out, ca, &stats); - - prt_newline(out); - - prt_printf(out, "reserves:\n"); - for (unsigned i = 0; i < BCH_WATERMARK_NR; i++) - prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i)); - - prt_newline(out); - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 12); - printbuf_tabstop_push(out, 16); - - prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets); - prt_printf(out, "buckets to invalidate\t%llu\r\n", - should_invalidate_buckets(ca, bch2_dev_usage_read(ca))); -} - -static noinline void bch2_print_allocator_stuck(struct bch_fs *c) -{ - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n", - c->opts.allocator_stuck_timeout); - - prt_printf(&buf, "Allocator debug:\n"); - printbuf_indent_add(&buf, 2); - bch2_fs_alloc_debug_to_text(&buf, c); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - - bch2_printbuf_make_room(&buf, 4096); - - buf.atomic++; - scoped_guard(rcu) - for_each_online_member_rcu(c, ca) { - prt_printf(&buf, "Dev %u:\n", ca->dev_idx); - printbuf_indent_add(&buf, 2); - bch2_dev_alloc_debug_to_text(&buf, ca); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - } - --buf.atomic; - - prt_printf(&buf, "Copygc debug:\n"); - printbuf_indent_add(&buf, 2); - bch2_copygc_wait_to_text(&buf, c); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - - prt_printf(&buf, "Journal debug:\n"); - printbuf_indent_add(&buf, 2); - bch2_journal_debug_to_text(&buf, &c->journal); - printbuf_indent_sub(&buf, 2); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); -} - -static inline unsigned allocator_wait_timeout(struct bch_fs *c) -{ - if (c->allocator_last_stuck && - time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies)) - return 0; - - return c->opts.allocator_stuck_timeout * HZ; -} - -void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) -{ - unsigned t = allocator_wait_timeout(c); - - if (t && closure_sync_timeout(cl, t)) { - c->allocator_last_stuck = jiffies; - bch2_print_allocator_stuck(c); - } - - closure_sync(cl); -} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h deleted file mode 100644 index 1b3fc8460096..000000000000 --- a/fs/bcachefs/alloc_foreground.h +++ /dev/null @@ -1,318 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ALLOC_FOREGROUND_H -#define _BCACHEFS_ALLOC_FOREGROUND_H - -#include "bcachefs.h" -#include "buckets.h" -#include "alloc_types.h" -#include "extents.h" -#include "io_write_types.h" -#include "sb-members.h" - -#include <linux/hash.h> - -struct bkey; -struct bch_dev; -struct bch_fs; -struct bch_devs_List; - -extern const char * const bch2_watermarks[]; - -void bch2_reset_alloc_cursors(struct bch_fs *); - -struct dev_alloc_list { - unsigned nr; - u8 data[BCH_SB_MEMBERS_MAX]; -}; - -struct alloc_request { - unsigned nr_replicas; - unsigned target; - bool ec; - enum bch_watermark watermark; - enum bch_write_flags 
flags; - enum bch_data_type data_type; - struct bch_devs_list *devs_have; - struct write_point *wp; - - /* These fields are used primarily by open_bucket_add_buckets */ - struct open_buckets ptrs; - unsigned nr_effective; /* sum of @ptrs durability */ - bool have_cache; /* have we allocated from a 0 durability dev */ - struct bch_devs_mask devs_may_alloc; - - /* bch2_bucket_alloc_set_trans(): */ - struct dev_alloc_list devs_sorted; - struct bch_dev_usage usage; - - /* bch2_bucket_alloc_trans(): */ - struct bch_dev *ca; - - enum { - BTREE_BITMAP_NO, - BTREE_BITMAP_YES, - BTREE_BITMAP_ANY, - } btree_bitmap; - - struct { - u64 buckets_seen; - u64 skipped_open; - u64 skipped_need_journal_commit; - u64 need_journal_commit; - u64 skipped_nocow; - u64 skipped_nouse; - u64 skipped_mi_btree_bitmap; - } counters; - - unsigned scratch_nr_replicas; - unsigned scratch_nr_effective; - bool scratch_have_cache; - enum bch_data_type scratch_data_type; - struct open_buckets scratch_ptrs; - struct bch_devs_mask scratch_devs_may_alloc; -}; - -void bch2_dev_alloc_list(struct bch_fs *, - struct dev_stripe_state *, - struct bch_devs_mask *, - struct dev_alloc_list *); -void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); - -static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) -{ - return bch2_dev_have_ref(c, ob->dev); -} - -static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark) -{ - switch (watermark) { - case BCH_WATERMARK_interior_updates: - return 0; - case BCH_WATERMARK_reclaim: - return OPEN_BUCKETS_COUNT / 6; - case BCH_WATERMARK_btree: - case BCH_WATERMARK_btree_copygc: - return OPEN_BUCKETS_COUNT / 4; - case BCH_WATERMARK_copygc: - return OPEN_BUCKETS_COUNT / 3; - default: - return OPEN_BUCKETS_COUNT / 2; - } -} - -struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, - enum bch_watermark, enum bch_data_type, - struct closure *); - -static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, - struct open_bucket *ob) -{ - BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); - - obs->v[obs->nr++] = ob - c->open_buckets; -} - -#define open_bucket_for_each(_c, _obs, _ob, _i) \ - for ((_i) = 0; \ - (_i) < (_obs)->nr && \ - ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ - (_i)++) - -static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, - struct open_buckets *obs) -{ - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, obs, ob, i) - if (ob->ec) - return ob; - - return NULL; -} - -void bch2_open_bucket_write_error(struct bch_fs *, - struct open_buckets *, unsigned, int); - -void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); - -static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -{ - if (atomic_dec_and_test(&ob->pin)) - __bch2_open_bucket_put(c, ob); -} - -static inline void bch2_open_buckets_put(struct bch_fs *c, - struct open_buckets *ptrs) -{ - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, ptrs, ob, i) - bch2_open_bucket_put(c, ob); - ptrs->nr = 0; -} - -static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp) -{ - struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, &wp->ptrs, ob, i) - ob_push(c, ob->sectors_free < block_sectors(c) - ? 
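
/*
 * bch2_open_buckets_reserved() above is a simple priority ladder: the more
 * critical the watermark, the fewer open buckets are held back from it.
 * With OPEN_BUCKETS_COUNT = 1024 (defined in alloc_types.h further down in
 * this diff), the thresholds work out as below.  A standalone re-statement
 * of the switch, not bcachefs code; the enum order here is illustrative:
 */
#include <stdio.h>

#define OPEN_BUCKETS_COUNT 1024

enum watermark { WM_interior_updates, WM_reclaim, WM_btree, WM_copygc, WM_normal };

static unsigned open_buckets_reserved(enum watermark w)
{
	switch (w) {
	case WM_interior_updates: return 0;			/* may drain the pool */
	case WM_reclaim:	  return OPEN_BUCKETS_COUNT / 6;	/* 170 */
	case WM_btree:		  return OPEN_BUCKETS_COUNT / 4;	/* 256 */
	case WM_copygc:		  return OPEN_BUCKETS_COUNT / 3;	/* 341 */
	default:		  return OPEN_BUCKETS_COUNT / 2;	/* 512 */
	}
}

int main(void)
{
	/* Normal writes can use at most 1024 - 512 = 512 open buckets: */
	for (int w = WM_interior_updates; w <= WM_normal; w++)
		printf("watermark %d: %u reserved\n", w, open_buckets_reserved(w));
	return 0;
}
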
&ptrs - : &keep, ob); - wp->ptrs = keep; - - mutex_unlock(&wp->lock); - - bch2_open_buckets_put(c, &ptrs); -} - -static inline void bch2_open_bucket_get(struct bch_fs *c, - struct write_point *wp, - struct open_buckets *ptrs) -{ - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, &wp->ptrs, ob, i) { - ob->data_type = wp->data_type; - atomic_inc(&ob->pin); - ob_push(c, ptrs, ob); - } -} - -static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, - unsigned dev, u64 bucket) -{ - return c->open_buckets_hash + - (jhash_3words(dev, bucket, bucket >> 32, 0) & - (OPEN_BUCKETS_COUNT - 1)); -} - -static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) -{ - open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); - - while (slot) { - struct open_bucket *ob = &c->open_buckets[slot]; - - if (ob->dev == dev && ob->bucket == bucket) - return true; - - slot = ob->hash; - } - - return false; -} - -static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) -{ - bool ret; - - if (bch2_bucket_is_open(c, dev, bucket)) - return true; - - spin_lock(&c->freelist_lock); - ret = bch2_bucket_is_open(c, dev, bucket); - spin_unlock(&c->freelist_lock); - - return ret; -} - -enum bch_write_flags; -int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *, - struct dev_stripe_state *, struct closure *); - -int bch2_alloc_sectors_start_trans(struct btree_trans *, - unsigned, unsigned, - struct write_point_specifier, - struct bch_devs_list *, - unsigned, unsigned, - enum bch_watermark, - enum bch_write_flags, - struct closure *, - struct write_point **); - -static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - return (struct bch_extent_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = ob->gen, - .dev = ob->dev, - .offset = bucket_to_sector(ca, ob->bucket) + - ca->mi.bucket_size - - ob->sectors_free, - }; -} - -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -static inline void -bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp, - struct bkey_i *k, unsigned sectors, - bool cached) -{ - struct open_bucket *ob; - unsigned i; - - BUG_ON(sectors > wp->sectors_free); - wp->sectors_free -= sectors; - wp->sectors_allocated += sectors; - - open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = ob_dev(c, ob); - struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); - - ptr.cached = cached || - (!ca->mi.durability && - wp->data_type == BCH_DATA_user); - - bch2_bkey_append_ptr(k, ptr); - - BUG_ON(sectors > ob->sectors_free); - ob->sectors_free -= sectors; - } -} - -void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, - struct bkey_i *, unsigned, bool); -void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); - -void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool); - -static inline struct write_point_specifier writepoint_hashed(unsigned long v) -{ - return (struct write_point_specifier) { .v = v | 1 }; -} - -static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) -{ - return (struct write_point_specifier) { .v = (unsigned long) wp }; -} - -void bch2_fs_allocator_foreground_init(struct bch_fs *); - -void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *); -void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct 
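
/*
 * open_bucket_hashslot() and bch2_bucket_is_open() above form a small open
 * hash table chained through u16 indices instead of pointers, with index 0
 * as the sentinel empty value; bch2_bucket_is_open_safe() then re-checks
 * under freelist_lock before trusting a negative answer.  A toy
 * single-threaded version of the index-chained lookup (the hash function
 * here is a stand-in, not jhash_3words()):
 */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_ENTRIES 16			/* must be a power of two */
#define TABLE_MASK (NR_ENTRIES - 1)

struct entry { uint8_t dev; uint64_t bucket; uint16_t hash; /* next index in chain */ };

static struct entry entries[NR_ENTRIES];	/* entries[0] unused: sentinel NULL */
static uint16_t table[NR_ENTRIES];

static unsigned hash_slot(unsigned dev, uint64_t bucket)
{
	return (dev * 0x9e3779b1u ^ (unsigned) bucket ^ (unsigned) (bucket >> 32)) & TABLE_MASK;
}

static void insert(uint16_t idx, uint8_t dev, uint64_t bucket)
{
	unsigned slot = hash_slot(dev, bucket);

	entries[idx] = (struct entry) { .dev = dev, .bucket = bucket, .hash = table[slot] };
	table[slot] = idx;
}

static bool is_open(uint8_t dev, uint64_t bucket)
{
	for (uint16_t i = table[hash_slot(dev, bucket)]; i; i = entries[i].hash)
		if (entries[i].dev == dev && entries[i].bucket == bucket)
			return true;
	return false;
}

int main(void)
{
	insert(1, 0, 42);
	insert(2, 0, 58);
	printf("%d %d\n", is_open(0, 42), is_open(0, 7));	/* 1 0 */
	return 0;
}
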
bch_dev *); -void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); - -void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); - -void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *); -void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *); - -void __bch2_wait_on_allocator(struct bch_fs *, struct closure *); -static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) -{ - if (cl->closure_get_happened) - __bch2_wait_on_allocator(c, cl); -} - -#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h deleted file mode 100644 index e7becdf22cba..000000000000 --- a/fs/bcachefs/alloc_types.h +++ /dev/null @@ -1,121 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ALLOC_TYPES_H -#define _BCACHEFS_ALLOC_TYPES_H - -#include <linux/mutex.h> -#include <linux/spinlock.h> - -#include "clock_types.h" -#include "fifo.h" - -#define BCH_WATERMARKS() \ - x(stripe) \ - x(normal) \ - x(copygc) \ - x(btree) \ - x(btree_copygc) \ - x(reclaim) \ - x(interior_updates) - -enum bch_watermark { -#define x(name) BCH_WATERMARK_##name, - BCH_WATERMARKS() -#undef x - BCH_WATERMARK_NR, -}; - -#define BCH_WATERMARK_BITS 3 -#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS) - -#define OPEN_BUCKETS_COUNT 1024 - -#define WRITE_POINT_HASH_NR 32 -#define WRITE_POINT_MAX 32 - -/* - * 0 is never a valid open_bucket_idx_t: - */ -typedef u16 open_bucket_idx_t; - -struct open_bucket { - spinlock_t lock; - atomic_t pin; - open_bucket_idx_t freelist; - open_bucket_idx_t hash; - - /* - * When an open bucket has an ec_stripe attached, this is the index of - * the block in the stripe this open_bucket corresponds to: - */ - u8 ec_idx; - enum bch_data_type data_type:6; - unsigned valid:1; - unsigned on_partial_list:1; - - u8 dev; - u8 gen; - u32 sectors_free; - u64 bucket; - struct ec_stripe_new *ec; -}; - -#define OPEN_BUCKET_LIST_MAX 15 - -struct open_buckets { - open_bucket_idx_t nr; - open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; -}; - -struct dev_stripe_state { - u64 next_alloc[BCH_SB_MEMBERS_MAX]; -}; - -#define WRITE_POINT_STATES() \ - x(stopped) \ - x(waiting_io) \ - x(waiting_work) \ - x(runnable) \ - x(running) - -enum write_point_state { -#define x(n) WRITE_POINT_##n, - WRITE_POINT_STATES() -#undef x - WRITE_POINT_STATE_NR -}; - -struct write_point { - struct { - struct hlist_node node; - struct mutex lock; - u64 last_used; - unsigned long write_point; - enum bch_data_type data_type; - - /* calculated based on how many pointers we're actually going to use: */ - unsigned sectors_free; - - struct open_buckets ptrs; - struct dev_stripe_state stripe; - - u64 sectors_allocated; - } __aligned(SMP_CACHE_BYTES); - - struct { - struct work_struct index_update_work; - - struct list_head writes; - spinlock_t writes_lock; - - enum write_point_state state; - u64 last_state_change; - u64 time[WRITE_POINT_STATE_NR]; - u64 last_runtime; - } __aligned(SMP_CACHE_BYTES); -}; - -struct write_point_specifier { - unsigned long v; -}; - -#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c deleted file mode 100644 index a7cd1f0f0964..000000000000 --- a/fs/bcachefs/async_objs.c +++ /dev/null @@ -1,132 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Async obj debugging: keep asynchronous objects on (very fast) lists, make - * them visible in debugfs: - */ - -#include "bcachefs.h" -#include "async_objs.h" -#include "btree_io.h" -#include 
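
/*
 * struct write_point above splits its fields into two anonymous structs,
 * each __aligned(SMP_CACHE_BYTES): fields touched on the allocation path in
 * one cacheline group, fields touched by the background index-update path in
 * another, so writers on one path don't keep invalidating the other path's
 * cacheline.  A minimal sketch of the layout idea, assuming a 64-byte
 * cacheline and GCC/Clang attribute syntax:
 */
#include <stddef.h>
#include <stdio.h>

struct write_point_like {
	struct {
		unsigned long last_used;
		unsigned sectors_free;
	} alloc_state __attribute__((aligned(64)));

	struct {
		unsigned long writes_pending;
		unsigned long last_state_change;
	} io_state __attribute__((aligned(64)));
};

int main(void)
{
	/* io_state starts on its own cacheline: offset 64, total size 128. */
	printf("sizeof = %zu, io_state offset = %zu\n",
	       sizeof(struct write_point_like),
	       offsetof(struct write_point_like, io_state));
	return 0;
}
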
"debug.h" -#include "io_read.h" -#include "io_write.h" - -#include <linux/debugfs.h> - -static void promote_obj_to_text(struct printbuf *out, void *obj) -{ - bch2_promote_op_to_text(out, obj); -} - -static void rbio_obj_to_text(struct printbuf *out, void *obj) -{ - bch2_read_bio_to_text(out, obj); -} - -static void write_op_obj_to_text(struct printbuf *out, void *obj) -{ - bch2_write_op_to_text(out, obj); -} - -static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj) -{ - struct btree_read_bio *rbio = obj; - bch2_btree_read_bio_to_text(out, rbio); -} - -static void btree_write_bio_obj_to_text(struct printbuf *out, void *obj) -{ - struct btree_write_bio *wbio = obj; - bch2_bio_to_text(out, &wbio->wbio.bio); -} - -static int bch2_async_obj_list_open(struct inode *inode, struct file *file) -{ - struct async_obj_list *list = inode->i_private; - struct dump_iter *i; - - i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) - return -ENOMEM; - - file->private_data = i; - i->from = POS_MIN; - i->iter = 0; - i->c = container_of(list, struct bch_fs, async_objs[list->idx]); - i->list = list; - i->buf = PRINTBUF; - return 0; -} - -static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct async_obj_list *list = i->list; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - struct genradix_iter iter; - void *obj; - fast_list_for_each_from(&list->list, iter, obj, i->iter) { - ret = bch2_debugfs_flush_buf(i); - if (ret) - return ret; - - if (!i->size) - break; - - list->obj_to_text(&i->buf, obj); - } - - if (i->buf.allocation_failure) - ret = -ENOMEM; - else - i->iter = iter.pos; - - if (!ret) - ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; -} - -static const struct file_operations async_obj_ops = { - .owner = THIS_MODULE, - .open = bch2_async_obj_list_open, - .release = bch2_dump_release, - .read = bch2_async_obj_list_read, -}; - -void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) -{ - c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir); - -#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \ - &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops); - BCH_ASYNC_OBJ_LISTS() -#undef x -} - -void bch2_fs_async_obj_exit(struct bch_fs *c) -{ - for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) - fast_list_exit(&c->async_objs[i].list); -} - -int bch2_fs_async_obj_init(struct bch_fs *c) -{ - for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) { - if (fast_list_init(&c->async_objs[i].list)) - return -BCH_ERR_ENOMEM_async_obj_init; - c->async_objs[i].idx = i; - } - -#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text; - BCH_ASYNC_OBJ_LISTS() -#undef x - - return 0; -} diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h deleted file mode 100644 index cd6489b8cf76..000000000000 --- a/fs/bcachefs/async_objs.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ASYNC_OBJS_H -#define _BCACHEFS_ASYNC_OBJS_H - -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS -static inline void __async_object_list_del(struct fast_list *head, unsigned idx) -{ - fast_list_remove(head, idx); -} - -static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx) -{ - int ret = fast_list_add(head, obj); - *idx = ret > 0 ? ret : 0; - return ret < 0 ? 
ret : 0; -} - -#define async_object_list_del(_c, _list, idx) \ - __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, idx) - -#define async_object_list_add(_c, _list, obj, idx) \ - __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx) - -void bch2_fs_async_obj_debugfs_init(struct bch_fs *); -void bch2_fs_async_obj_exit(struct bch_fs *); -int bch2_fs_async_obj_init(struct bch_fs *); - -#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ - -#define async_object_list_del(_c, _n, idx) do {} while (0) - -static inline int __async_object_list_add(void) -{ - return 0; -} -#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add() - -static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {} -static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {} -static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; } - -#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ - -#endif /* _BCACHEFS_ASYNC_OBJS_H */ diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h deleted file mode 100644 index 8d713c0f5841..000000000000 --- a/fs/bcachefs/async_objs_types.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H -#define _BCACHEFS_ASYNC_OBJS_TYPES_H - -#define BCH_ASYNC_OBJ_LISTS() \ - x(promote) \ - x(rbio) \ - x(write_op) \ - x(btree_read_bio) \ - x(btree_write_bio) - -enum bch_async_obj_lists { -#define x(n) BCH_ASYNC_OBJ_LIST_##n, - BCH_ASYNC_OBJ_LISTS() -#undef x - BCH_ASYNC_OBJ_NR -}; - -struct async_obj_list { - struct fast_list list; - void (*obj_to_text)(struct printbuf *, void *); - unsigned idx; -}; - -#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c deleted file mode 100644 index 77d93beb3c8f..000000000000 --- a/fs/bcachefs/backpointers.c +++ /dev/null @@ -1,1391 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "bbpos.h" -#include "alloc_background.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "checksum.h" -#include "disk_accounting.h" -#include "error.h" -#include "progress.h" -#include "recovery_passes.h" - -#include <linux/mm.h> - -static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64); - -static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) -{ - return (struct bbpos) { - .btree = bp.btree_id, - .pos = bp.pos, - }; -} - -int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - int ret = 0; - - bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH, - c, backpointer_level_bad, - "backpointer level bad: %u >= %u", - bp.v->level, BTREE_MAX_DEPTH); - - bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, - c, backpointer_dev_bad, - "backpointer for BCH_SB_MEMBER_INVALID"); -fsck_err: - return ret; -} - -void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - - struct bch_dev *ca; - u32 bucket_offset; - struct bpos bucket; - scoped_guard(rcu) { - ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); - if (ca) - bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); - } - - if (ca) - prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, 
bucket_offset); - else - prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); - - bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); - prt_str(out, " data_type="); - bch2_prt_data_type(out, bp.v->data_type); - prt_printf(out, " suboffset=%u len=%u gen=%u pos=", - (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), - bp.v->bucket_len, - bp.v->bucket_gen); - bch2_bpos_to_text(out, bp.v->pos); -} - -void bch2_backpointer_swab(struct bkey_s k) -{ - struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); - - bp.v->bucket_len = swab32(bp.v->bucket_len); - bch2_bpos_swab(&bp.v->pos); -} - -static bool extent_matches_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, - struct bkey_s_c_backpointer bp) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bkey_i_backpointer bp2; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2); - - if (bpos_eq(bp.k->p, bp2.k.p) && - !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) - return true; - } - - return false; -} - -static noinline int backpointer_mod_err(struct btree_trans *trans, - struct bkey_s_c orig_k, - struct bkey_i_backpointer *new_bp, - struct bkey_s_c found_bp, - bool insert) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - bool will_check = c->recovery.passes_to_run & - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); - int ret = 0; - - if (insert) { - prt_printf(&buf, "existing backpointer found when inserting "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); - prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - prt_printf(&buf, "found "); - bch2_bkey_val_to_text(&buf, c, found_bp); - prt_newline(&buf); - - prt_printf(&buf, "for "); - bch2_bkey_val_to_text(&buf, c, orig_k); - } else if (!will_check) { - prt_printf(&buf, "backpointer not found when deleting\n"); - printbuf_indent_add(&buf, 2); - - prt_printf(&buf, "searching for "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); - prt_newline(&buf); - - prt_printf(&buf, "got "); - bch2_bkey_val_to_text(&buf, c, found_bp); - prt_newline(&buf); - - prt_printf(&buf, "for "); - bch2_bkey_val_to_text(&buf, c, orig_k); - } - - if (!will_check && __bch2_inconsistent_error(c, &buf)) - ret = bch_err_throw(c, erofs_unfixed_errors); - - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - return ret; -} - -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bkey_s_c orig_k, - struct bkey_i_backpointer *bp, - bool insert) -{ - struct btree_iter bp_iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bp->k.p, - BTREE_ITER_intent| - BTREE_ITER_slots| - BTREE_ITER_with_updates); - int ret = bkey_err(k); - if (ret) - return ret; - - if (insert - ? k.k->type - : (k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { - ret = backpointer_mod_err(trans, orig_k, bp, k, insert); - if (ret) - goto err; - } - - if (!insert) { - bp->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp->k, 0); - } - - ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); -err: - bch2_trans_iter_exit(trans, &bp_iter); - return ret; -} - -static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) -{ - return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) - ? 
bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) - : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -} - -static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, - struct bkey_s_c visiting_k, - struct bkey_buf *last_flushed) -{ - return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) - ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) - : 0; -} - -static int backpointer_target_not_found(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct bkey_s_c target_k, - struct bkey_buf *last_flushed, - bool commit) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - /* - * If we're using the btree write buffer, the backpointer we were - * looking at may have already been deleted - failure to find what it - * pointed to is not an error: - */ - ret = last_flushed - ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed) - : 0; - if (ret) - return ret; - - prt_printf(&buf, "backpointer doesn't match %s it points to:\n", - bp.v->level ? "btree node" : "extent"); - bch2_bkey_val_to_text(&buf, c, bp.s_c); - - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, target_k); - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) - if (p.ptr.dev == bp.k->p.inode) { - prt_newline(&buf); - struct bkey_i_backpointer bp2; - bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); - } - - if (fsck_err(trans, backpointer_to_missing_ptr, - "%s", buf.buf)) { - ret = bch2_backpointer_del(trans, bp.k->p); - if (ret || !commit) - goto out; - - /* - * Normally, on transaction commit from inside a transaction, - * we'll return -BCH_ERR_transaction_restart_nested, since a - * transaction commit invalidates pointers given out by peek(). - * - * However, since we're updating a write buffer btree, if we - * return a transaction restart and loop we won't see that the - * backpointer has been deleted without an additional write - * buffer flush - and those are expensive. - * - * So we're relying on the caller immediately advancing to the - * next backpointer and starting a new transaction immediately - * after backpointer_get_key() returns NULL: - */ - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - } -out: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - struct bkey_buf *last_flushed, - bool commit) -{ - struct bch_fs *c = trans->c; - - BUG_ON(!bp.v->level); - - bch2_trans_node_iter_init(trans, iter, - bp.v->btree_id, - bp.v->pos, - 0, - bp.v->level - 1, - 0); - struct btree *b = bch2_btree_iter_peek_node(trans, iter); - if (IS_ERR_OR_NULL(b)) - goto err; - - BUG_ON(b->c.level != bp.v->level - 1); - - if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, - bkey_i_to_s_c(&b->key), bp)) - return b; - - if (btree_node_will_make_reachable(b)) { - b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node)); - } else { - int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), - last_flushed, commit); - b = ret ? 
ERR_PTR(ret) : NULL; - } -err: - bch2_trans_iter_exit(trans, iter); - return b; -} - -static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - unsigned iter_flags, - struct bkey_buf *last_flushed, - bool commit) -{ - struct bch_fs *c = trans->c; - - if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) - return bkey_s_c_null; - - bch2_trans_node_iter_init(trans, iter, - bp.v->btree_id, - bp.v->pos, - 0, - bp.v->level, - iter_flags); - struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); - if (bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); - return k; - } - - /* - * peek_slot() doesn't normally return NULL - except when we ask for a - * key at a btree level that doesn't exist. - * - * We may want to revisit this and change peek_slot(): - */ - if (!k.k) { - bkey_init(&iter->k); - iter->k.p = bp.v->pos; - k.k = &iter->k; - } - - if (k.k && - extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) - return k; - - bch2_trans_iter_exit(trans, iter); - - if (!bp.v->level) { - int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit); - return ret ? bkey_s_c_err(ret) : bkey_s_c_null; - } else { - struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit); - if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) - return bkey_s_c_null; - if (IS_ERR_OR_NULL(b)) - return ((struct bkey_s_c) { .k = ERR_CAST(b) }); - - return bkey_i_to_s_c(&b->key); - } -} - -struct btree *bch2_backpointer_get_node(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - struct bkey_buf *last_flushed) -{ - return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true); -} - -struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - unsigned iter_flags, - struct bkey_buf *last_flushed) -{ - return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true); -} - -static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, - struct bkey_buf *last_flushed) -{ - if (k.k->type != KEY_TYPE_backpointer) - return 0; - - struct bch_fs *c = trans->c; - struct btree_iter alloc_iter = {}; - struct bkey_s_c alloc_k; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bpos bucket; - if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { - ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); - if (ret) - goto out; - - if (fsck_err(trans, backpointer_to_missing_device, - "backpointer for missing device:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_backpointer_del(trans, k.k->p); - goto out; - } - - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0); - ret = bkey_err(alloc_k); - if (ret) - goto out; - - if (alloc_k.k->type != KEY_TYPE_alloc_v4) { - ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); - if (ret) - goto out; - - if (fsck_err(trans, backpointer_to_missing_alloc, - "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", - alloc_iter.pos.inode, alloc_iter.pos.offset, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_backpointer_del(trans, k.k->p); - } -out: -fsck_err: - bch2_trans_iter_exit(trans, &alloc_iter); - printbuf_exit(&buf); - return ret; -} - -/* verify that every backpointer has a corresponding alloc key */ -int bch2_check_btree_backpointers(struct bch_fs *c) -{ - struct bkey_buf last_flushed; - 
bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_backpointers, POS_MIN, 0, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); - - bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); - return ret; -} - -struct extents_to_bp_state { - struct bpos bp_start; - struct bpos bp_end; - struct bkey_buf last_flushed; -}; - -static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree, - struct bkey_s_c extent, unsigned dev) -{ - struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - bch2_bkey_drop_device(bkey_i_to_s(n), dev); - return bch2_btree_insert_trans(trans, btree, n, 0); -} - -static int check_extent_checksum(struct btree_trans *trans, - enum btree_id btree, struct bkey_s_c extent, - enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct printbuf buf = PRINTBUF; - void *data_buf = NULL; - struct bio *bio = NULL; - size_t bytes; - int ret = 0; - - if (bkey_is_btree_ptr(extent.k)) - return false; - - bkey_for_each_ptr_decode(extent.k, ptrs, p, entry) - if (p.ptr.dev == dev) - goto found; - BUG(); -found: - if (!p.crc.csum_type) - return false; - - bytes = p.crc.compressed_size << 9; - - struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ, - BCH_DEV_READ_REF_check_extent_checksums); - if (!ca) - return false; - - data_buf = kvmalloc(bytes, GFP_KERNEL); - if (!data_buf) { - ret = -ENOMEM; - goto err; - } - - bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL); - bio->bi_iter.bi_sector = p.ptr.offset; - bch2_bio_map(bio, data_buf, bytes); - ret = submit_bio_wait(bio); - if (ret) - goto err; - - prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n"); - bch2_btree_id_to_text(&buf, btree); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, extent); - prt_newline(&buf); - bch2_btree_id_to_text(&buf, o_btree); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, extent2); - - struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); - struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); - if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), - trans, dup_backpointer_to_bad_csum_extent, - "%s", buf.buf)) - ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; -fsck_err: -err: - if (bio) - bio_put(bio); - kvfree(data_buf); - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_check_extent_checksums); - printbuf_exit(&buf); - return ret; -} - -static int check_bp_exists(struct btree_trans *trans, - struct extents_to_bp_state *s, - struct bkey_i_backpointer *bp, - struct bkey_s_c orig_k) -{ - struct bch_fs *c = trans->c; - struct btree_iter other_extent_iter = {}; - struct printbuf buf = PRINTBUF; - - if (bpos_lt(bp->k.p, s->bp_start) || - bpos_gt(bp->k.p, s->bp_end)) - return 0; - - struct btree_iter bp_iter; - struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); - int ret = bkey_err(bp_k); - if (ret) - goto err; - - if (bp_k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { - ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); - if (ret) - goto err; - - 
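
/*
 * check_extent_checksum() above settles "two extents claim the same space"
 * by re-reading the data from the device and recomputing the stored
 * checksum: the extent that no longer verifies is the stale one to drop.
 * A toy illustration of verify-by-recompute, with FNV-1a standing in for
 * bch2_checksum() (bcachefs actually uses its crc32c/crc64/xxhash
 * checksum machinery):
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static uint64_t fnv1a(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint64_t h = 0xcbf29ce484222325ull;

	while (len--)
		h = (h ^ *p++) * 0x100000001b3ull;
	return h;
}

struct extent { uint64_t stored_csum; };

/* Does the extent's stored checksum match what is on disk now? */
static int extent_csum_good(const struct extent *e, const void *disk, size_t len)
{
	return fnv1a(disk, len) == e->stored_csum;
}

int main(void)
{
	char disk[512] = "current contents";
	struct extent live = { .stored_csum = fnv1a(disk, sizeof(disk)) };
	struct extent stale = { .stored_csum = 0xdeadbeef };

	printf("live: %d, stale: %d\n",
	       extent_csum_good(&live, disk, sizeof(disk)),
	       extent_csum_good(&stale, disk, sizeof(disk)));	/* 1, 0 */
	return 0;
}
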
goto check_existing_bp; - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &other_extent_iter); - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); - return ret; -check_existing_bp: - /* Do we have a backpointer for a different extent? */ - if (bp_k.k->type != KEY_TYPE_backpointer) - goto missing; - - struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); - - struct bkey_s_c other_extent = - __bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false); - ret = bkey_err(other_extent); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - ret = 0; - if (ret) - goto err; - - if (!other_extent.k) - goto missing; - - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode); - if (ca) { - struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent); - bkey_for_each_ptr(other_extent_ptrs, ptr) - if (ptr->dev == bp->k.p.inode && - dev_ptr_stale_rcu(ca, ptr)) { - rcu_read_unlock(); - ret = drop_dev_and_update(trans, other_bp.v->btree_id, - other_extent, bp->k.p.inode); - if (ret) - goto err; - goto out; - } - } - rcu_read_unlock(); - - if (bch2_extents_match(orig_k, other_extent)) { - printbuf_reset(&buf); - prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n"); - bch2_bkey_val_to_text(&buf, c, orig_k); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, other_extent); - bch_err(c, "%s", buf.buf); - - if (other_extent.k->size <= orig_k.k->size) { - ret = drop_dev_and_update(trans, other_bp.v->btree_id, - other_extent, bp->k.p.inode); - if (ret) - goto err; - goto out; - } else { - ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode); - if (ret) - goto err; - goto missing; - } - } - - ret = check_extent_checksum(trans, - other_bp.v->btree_id, other_extent, - bp->v.btree_id, orig_k, - bp->k.p.inode); - if (ret < 0) - goto err; - if (ret) { - ret = 0; - goto missing; - } - - ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, - other_bp.v->btree_id, other_extent, bp->k.p.inode); - if (ret < 0) - goto err; - if (ret) { - ret = 0; - goto out; - } - - printbuf_reset(&buf); - prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode); - bch2_bkey_val_to_text(&buf, c, orig_k); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, other_extent); - bch_err(c, "%s", buf.buf); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto err; -missing: - printbuf_reset(&buf); - prt_str(&buf, "missing backpointer\nfor: "); - bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\nwant: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); - prt_printf(&buf, "\ngot: "); - bch2_bkey_val_to_text(&buf, c, bp_k); - - if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) - ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true); - - goto out; -} - -static int check_extent_to_backpointers(struct btree_trans *trans, - struct extents_to_bp_state *s, - enum btree_id btree, unsigned level, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.ptr.dev == BCH_SB_MEMBER_INVALID) - continue; - - bool empty; - { - /* scoped_guard() is a loop, so it breaks continue */ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - if (!ca) - continue; - - if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) - continue; - - u64 b = 
PTR_BUCKET_NR(ca, &p.ptr); - if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) - continue; - - empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b); - } - - struct bkey_i_backpointer bp; - bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); - - int ret = !empty - ? check_bp_exists(trans, s, &bp, k) - : bch2_bucket_backpointer_mod(trans, k, &bp, true); - if (ret) - return ret; - } - - return 0; -} - -static int check_btree_root_to_backpointers(struct btree_trans *trans, - struct extents_to_bp_state *s, - enum btree_id btree_id, - int *level) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct btree *b; - struct bkey_s_c k; - int ret; -retry: - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, - 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); - b = bch2_btree_iter_peek_node(trans, &iter); - ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err; - - if (b != btree_node_root(c, b)) { - bch2_trans_iter_exit(trans, &iter); - goto retry; - } - - *level = b->c.level; - - k = bkey_i_to_s_c(&b->key); - ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static u64 mem_may_pin_bytes(struct bch_fs *c) -{ - struct sysinfo i; - si_meminfo(&i); - - u64 mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); -} - -static size_t btree_nodes_fit_in_ram(struct bch_fs *c) -{ - return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); -} - -static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - u64 btree_leaf_mask, - u64 btree_interior_mask, - struct bbpos start, struct bbpos *end) -{ - struct bch_fs *c = trans->c; - s64 mem_may_pin = mem_may_pin_bytes(c); - int ret = 0; - - bch2_btree_cache_unpin(c); - - btree_interior_mask |= btree_leaf_mask; - - c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask; - c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask; - c->btree_cache.pinned_nodes_start = start; - c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; - - for (enum btree_id btree = start.btree; - btree < BTREE_ID_NR && !ret; - btree++) { - unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1; - - if (!(BIT_ULL(btree) & btree_leaf_mask) && - !(BIT_ULL(btree) & btree_interior_mask)) - continue; - - ret = __for_each_btree_node(trans, iter, btree, - btree == start.btree ? start.pos : POS_MIN, - 0, depth, BTREE_ITER_prefetch, b, ({ - mem_may_pin -= btree_buf_bytes(b); - if (mem_may_pin <= 0) { - c->btree_cache.pinned_nodes_end = *end = - BBPOS(btree, b->key.k.p); - break; - } - bch2_node_pin(c, b); - 0; - })); - } - - return ret; -} - -static inline int bch2_fs_going_ro(struct bch_fs *c) -{ - return test_bit(BCH_FS_going_ro, &c->flags) - ? -EROFS - : 0; -} - -static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - struct extents_to_bp_state *s) -{ - struct bch_fs *c = trans->c; - struct progress_indicator_state progress; - int ret = 0; - - bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); - - for (enum btree_id btree_id = 0; - btree_id < btree_id_nr_alive(c); - btree_id++) { - int level, depth = btree_type_has_ptrs(btree_id) ? 
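
/*
 * mem_may_pin_bytes() and btree_nodes_fit_in_ram() above size the node
 * pinning budget as a straight percentage of RAM divided by the btree node
 * size.  Worked through with illustrative numbers; both the percentage and
 * the node size are options, not fixed constants:
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t totalram_bytes = 16ull << 30;	/* i.totalram * i.mem_unit: 16 GiB */
	unsigned percent = 50;			/* opts.fsck_memory_usage_percent */
	uint64_t node_size = 256 << 10;		/* opts.btree_node_size: 256 KiB */

	uint64_t may_pin = totalram_bytes * percent / 100;	/* 8 GiB */
	uint64_t nodes = may_pin / node_size;			/* 32768 */

	printf("may pin %llu bytes = %llu btree nodes\n",
	       (unsigned long long) may_pin, (unsigned long long) nodes);
	return 0;
}
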
0 : 1; - - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - check_btree_root_to_backpointers(trans, s, btree_id, &level)); - if (ret) - return ret; - - while (level >= depth) { - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, - BTREE_ITER_prefetch); - - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); - bch2_fs_going_ro(c) ?: - check_extent_to_backpointers(trans, s, btree_id, level, k) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - })); - if (ret) - return ret; - - --level; - } - } - - return 0; -} - -enum alloc_sector_counter { - ALLOC_dirty, - ALLOC_cached, - ALLOC_stripe, - ALLOC_SECTORS_NR -}; - -static int data_type_to_alloc_counter(enum bch_data_type t) -{ - switch (t) { - case BCH_DATA_btree: - case BCH_DATA_user: - return ALLOC_dirty; - case BCH_DATA_cached: - return ALLOC_cached; - case BCH_DATA_stripe: - case BCH_DATA_parity: - return ALLOC_stripe; - default: - return -1; - } -} - -static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); - -static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, - bool *had_mismatch, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); - bool need_commit = false; - - *had_mismatch = false; - - if (a->data_type == BCH_DATA_sb || - a->data_type == BCH_DATA_journal || - a->data_type == BCH_DATA_parity) - return 0; - - u32 sectors[ALLOC_SECTORS_NR]; - memset(sectors, 0, sizeof(sectors)); - - struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); - if (!ca) - return 0; - - struct btree_iter iter; - struct bkey_s_c bp_k; - int ret = 0; - for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, - bucket_pos_to_bp_start(ca, alloc_k.k->p), - bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { - if (bp_k.k->type != KEY_TYPE_backpointer) - continue; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); - - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && - (bp.v->bucket_gen != a->gen || - bp.v->pad)) { - ret = bch2_backpointer_del(trans, bp_k.k->p); - if (ret) - break; - - need_commit = true; - continue; - } - - if (bp.v->bucket_gen != a->gen) - continue; - - int alloc_counter = data_type_to_alloc_counter(bp.v->data_type); - if (alloc_counter < 0) - continue; - - sectors[alloc_counter] += bp.v->bucket_len; - }; - bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; - - if (need_commit) { - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - } - - if (sectors[ALLOC_dirty] != a->dirty_sectors || - sectors[ALLOC_cached] != a->cached_sectors || - sectors[ALLOC_stripe] != a->stripe_sectors) { - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { - ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); - if (ret) - goto err; - } - - if (sectors[ALLOC_dirty] > a->dirty_sectors || - sectors[ALLOC_cached] > a->cached_sectors || - sectors[ALLOC_stripe] > a->stripe_sectors) { - ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: - bch_err_throw(c, transaction_restart_nested); - goto err; - } - - bool empty = (sectors[ALLOC_dirty] + - sectors[ALLOC_stripe] + - sectors[ALLOC_cached]) 
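
/*
 * The helpers here cross-check a bucket's alloc key against its
 * backpointers: backpointer lengths are binned by data type into
 * dirty/cached/stripe totals, which must match the sector counts carried by
 * the alloc key.  A simplified standalone version of the tally-and-compare;
 * the types and values are stand-ins for the bcachefs structs:
 */
#include <stdint.h>
#include <stdio.h>

enum data_type { DT_btree, DT_user, DT_cached, DT_stripe, DT_parity };
enum counter { ALLOC_DIRTY, ALLOC_CACHED, ALLOC_STRIPE, ALLOC_NR };

static int type_to_counter(enum data_type t)
{
	switch (t) {
	case DT_btree:
	case DT_user:	return ALLOC_DIRTY;
	case DT_cached:	return ALLOC_CACHED;
	case DT_stripe:
	case DT_parity:	return ALLOC_STRIPE;
	default:	return -1;	/* sb/journal etc: not counted */
	}
}

struct bp { enum data_type type; uint32_t len; };
struct alloc { uint32_t dirty, cached, stripe; };

int main(void)
{
	struct bp bps[] = { { DT_user, 64 }, { DT_btree, 32 }, { DT_cached, 16 } };
	struct alloc a = { .dirty = 96, .cached = 16, .stripe = 0 };
	uint32_t sectors[ALLOC_NR] = { 0 };

	for (unsigned i = 0; i < sizeof(bps) / sizeof(bps[0]); i++) {
		int c = type_to_counter(bps[i].type);

		if (c >= 0)
			sectors[c] += bps[i].len;
	}

	printf("mismatch: %d\n",
	       sectors[ALLOC_DIRTY] != a.dirty ||
	       sectors[ALLOC_CACHED] != a.cached ||
	       sectors[ALLOC_STRIPE] != a.stripe);	/* 0 */
	return 0;
}
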
== 0; - - ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch, - alloc_k.k->p.offset) ?: - (empty - ? bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty, - alloc_k.k->p.offset) - : 0); - - *had_mismatch = true; - } -err: - bch2_dev_put(ca); - return ret; -} - -static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr_v2: { - bool ret = false; - - guard(rcu)(); - struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; - while (pos.inode <= k.k->p.inode) { - if (pos.inode >= c->sb.nr_devices) - break; - - struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); - if (!ca) - goto next; - - struct bpos bucket = bp_pos_to_bucket(ca, pos); - u64 next = ca->mi.nbuckets; - - unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets); - if (bitmap) - next = min_t(u64, next, - find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset)); - - bucket.offset = next; - if (bucket.offset == ca->mi.nbuckets) - goto next; - - ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p); - if (ret) - break; -next: - pos = SPOS(pos.inode + 1, 0, 0); - } - - return ret; - } - case KEY_TYPE_btree_ptr: - return true; - default: - return false; - } -} - -static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, - enum btree_id btree, unsigned level) -{ - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err; - - if (b) - bch2_node_pin(trans->c, b); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, - struct bpos start, struct bpos *end) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - struct bkey_buf tmp; - bch2_bkey_buf_init(&tmp); - - bch2_btree_cache_unpin(c); - - *end = SPOS_MAX; - - s64 mem_may_pin = mem_may_pin_bytes(c); - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, - 0, 1, BTREE_ITER_prefetch); - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - if (!backpointer_node_has_missing(c, k)) - continue; - - mem_may_pin -= c->opts.btree_node_size; - if (mem_may_pin <= 0) - break; - - bch2_bkey_buf_reassemble(&tmp, c, k); - struct btree_path *path = btree_iter_path(trans, &iter); - - BUG_ON(path->level != 1); - - bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); - })); - if (ret) - return ret; - - struct bpos pinned = SPOS_MAX; - mem_may_pin = mem_may_pin_bytes(c); - bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, - 0, 1, BTREE_ITER_prefetch); - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - if (!backpointer_node_has_missing(c, k)) - continue; - - mem_may_pin -= c->opts.btree_node_size; - if (mem_may_pin <= 0) { - *end = pinned; - break; - } - - bch2_bkey_buf_reassemble(&tmp, c, k); - struct btree_path *path = btree_iter_path(trans, &iter); - - BUG_ON(path->level != 1); - - int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1); - - if (!ret2) - pinned = tmp.k->k.p; - - ret; - })); - if (ret) - return ret; - - return ret; -} - -int bch2_check_extents_to_backpointers(struct bch_fs *c) -{ - int ret = 0; - - struct btree_trans *trans = bch2_trans_get(c); - struct extents_to_bp_state s = { .bp_start = POS_MIN }; - - bch2_bkey_buf_init(&s.last_flushed); - bkey_init(&s.last_flushed.k->k); - - ret = 
for_each_btree_key(trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_prefetch, k, ({ - bool had_mismatch; - bch2_fs_going_ro(c) ?: - check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed); - })); - if (ret) - goto err; - - u64 nr_buckets = 0, nr_mismatches = 0; - for_each_member_device(c, ca) { - nr_buckets += ca->mi.nbuckets; - nr_mismatches += ca->bucket_backpointer_mismatch.nr; - } - - if (!nr_mismatches) - goto err; - - bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", - nr_mismatches, nr_buckets); - - while (1) { - ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); - if (ret) - break; - - if ( bpos_eq(s.bp_start, POS_MIN) && - !bpos_eq(s.bp_end, SPOS_MAX)) - bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", - __func__, btree_nodes_fit_in_ram(c)); - - if (!bpos_eq(s.bp_start, POS_MIN) || - !bpos_eq(s.bp_end, SPOS_MAX)) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "check_extents_to_backpointers(): "); - bch2_bpos_to_text(&buf, s.bp_start); - prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, s.bp_end); - - bch_verbose(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - ret = bch2_check_extents_to_backpointers_pass(trans, &s); - if (ret || bpos_eq(s.bp_end, SPOS_MAX)) - break; - - s.bp_start = bpos_successor(s.bp_end); - } - - for_each_member_device(c, ca) { - bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); - bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); - } -err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&s.last_flushed, c); - bch2_btree_cache_unpin(c); - - bch_err_fn(c, ret); - return ret; -} - -static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans, - struct bpos bucket, - bool *had_mismatch, - struct bkey_buf *last_flushed) -{ - struct btree_iter alloc_iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &alloc_iter, - BTREE_ID_alloc, bucket, - BTREE_ITER_cached); - int ret = bkey_err(k); - if (ret) - return ret; - - ret = check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed); - bch2_trans_iter_exit(trans, &alloc_iter); - return ret; -} - -int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans, - struct bch_dev *ca, u64 bucket, - bool copygc, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - bool had_mismatch; - int ret = lockrestart_do(trans, - check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket), - &had_mismatch, last_flushed)); - if (ret || !had_mismatch) - return ret; - - u64 nr = ca->bucket_backpointer_mismatch.nr; - u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0; - - struct printbuf buf = PRINTBUF; - __bch2_log_msg_start(ca->name, &buf); - - prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n", - bucket, nr, ca->mi.nbuckets); - - bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_extents_to_backpointers, - nr < allowed ? 
RUN_RECOVERY_PASS_ratelimit : 0); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return 0; -} - -/* backpointers -> extents */ - -static int check_one_backpointer(struct btree_trans *trans, - struct bbpos start, - struct bbpos end, - struct bkey_s_c bp_k, - struct bkey_buf *last_flushed) -{ - if (bp_k.k->type != KEY_TYPE_backpointer) - return 0; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); - struct bbpos pos = bp_to_bbpos(*bp.v); - - if (bbpos_cmp(pos, start) < 0 || - bbpos_cmp(pos, end) > 0) - return 0; - - struct btree_iter iter; - struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed); - int ret = bkey_err(k); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - return 0; - if (ret) - return ret; - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int check_bucket_backpointers_to_extents(struct btree_trans *trans, - struct bch_dev *ca, struct bpos bucket) -{ - u32 restart_count = trans->restart_count; - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, - bucket_pos_to_bp_start(ca, bucket), - bucket_pos_to_bp_end(ca, bucket), - 0, k, - check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) - ); - - bch2_bkey_buf_exit(&last_flushed, trans->c); - return ret ?: trans_was_restarted(trans, restart_count); -} - -static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, - struct bbpos start, - struct bbpos end) -{ - struct bch_fs *c = trans->c; - struct bkey_buf last_flushed; - struct progress_indicator_state progress; - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); - - int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, - POS_MIN, BTREE_ITER_prefetch, k, ({ - bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); - check_one_backpointer(trans, start, end, k, &last_flushed); - })); - - bch2_bkey_buf_exit(&last_flushed, c); - return ret; -} - -int bch2_check_backpointers_to_extents(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; - int ret; - - while (1) { - ret = bch2_get_btree_in_memory_pos(trans, - BIT_ULL(BTREE_ID_extents)| - BIT_ULL(BTREE_ID_reflink), - ~0, - start, &end); - if (ret) - break; - - if (!bbpos_cmp(start, BBPOS_MIN) && - bbpos_cmp(end, BBPOS_MAX)) - bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass", - __func__, btree_nodes_fit_in_ram(c)); - - if (bbpos_cmp(start, BBPOS_MIN) || - bbpos_cmp(end, BBPOS_MAX)) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "check_backpointers_to_extents(): "); - bch2_bbpos_to_text(&buf, start); - prt_str(&buf, "-"); - bch2_bbpos_to_text(&buf, end); - - bch_verbose(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - ret = bch2_check_backpointers_to_extents_pass(trans, start, end); - if (ret || !bbpos_cmp(end, BBPOS_MAX)) - break; - - start = bbpos_successor(end); - } - bch2_trans_put(trans); - - bch2_btree_cache_unpin(c); - - bch_err_fn(c, ret); - return ret; -} - -static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit) -{ - scoped_guard(mutex, &b->lock) { - if (!b->buckets) { - b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), - sizeof(unsigned long), GFP_KERNEL); - if (!b->buckets) - 
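
/*
 * bch2_bucket_bitmap_set() above allocates its bitmap lazily on first use
 * and keeps ->nr as a running population count by bumping it only on
 * 0 -> 1 transitions.  A standalone sketch, with calloc standing in for
 * kvcalloc and no locking (the real function holds b->lock):
 */
#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct bucket_bitmap {
	unsigned long *buckets;
	unsigned long nr;	/* number of set bits */
	unsigned long nbuckets;	/* bitmap size, in bits */
};

static int bitmap_set(struct bucket_bitmap *b, unsigned long bit)
{
	unsigned long mask = 1ul << (bit % BITS_PER_LONG), *w;

	if (!b->buckets) {
		b->buckets = calloc(BITS_TO_LONGS(b->nbuckets), sizeof(unsigned long));
		if (!b->buckets)
			return -1;	/* stands in for the ENOMEM error */
	}

	w = &b->buckets[bit / BITS_PER_LONG];
	b->nr += !(*w & mask);	/* count only 0 -> 1 transitions */
	*w |= mask;
	return 0;
}

int main(void)
{
	struct bucket_bitmap b = { .nbuckets = 1024 };

	bitmap_set(&b, 7);
	bitmap_set(&b, 7);	/* setting the same bit again doesn't bump nr */
	bitmap_set(&b, 100);
	printf("nr = %lu\n", b.nr);	/* 2 */
	free(b.buckets);
	return 0;
}
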
return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); - } - - b->nr += !__test_and_set_bit(bit, b->buckets); - } - - return 0; -} - -int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b, - u64 old_size, u64 new_size) -{ - scoped_guard(mutex, &b->lock) { - if (!b->buckets) - return 0; - - unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size), - sizeof(unsigned long), GFP_KERNEL); - if (!n) - return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); - - memcpy(n, b->buckets, - BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long)); - kvfree(b->buckets); - b->buckets = n; - } - - return 0; -} - -void bch2_bucket_bitmap_free(struct bucket_bitmap *b) -{ - mutex_lock(&b->lock); - kvfree(b->buckets); - b->buckets = NULL; - b->nr = 0; - mutex_unlock(&b->lock); -} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h deleted file mode 100644 index 7e71afee1ac0..000000000000 --- a/fs/bcachefs/backpointers.h +++ /dev/null @@ -1,200 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BACKPOINTERS_H -#define _BCACHEFS_BACKPOINTERS_H - -#include "btree_cache.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "buckets.h" -#include "error.h" -#include "super.h" - -static inline u64 swab40(u64 x) -{ - return (((x & 0x00000000ffULL) << 32)| - ((x & 0x000000ff00ULL) << 16)| - ((x & 0x0000ff0000ULL) >> 0)| - ((x & 0x00ff000000ULL) >> 16)| - ((x & 0xff00000000ULL) >> 32)); -} - -int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, - struct bkey_validate_context); -void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -void bch2_backpointer_swab(struct bkey_s); - -#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ - .key_validate = bch2_backpointer_validate, \ - .val_to_text = bch2_backpointer_to_text, \ - .swab = bch2_backpointer_swab, \ - .min_val_size = 32, \ -}) - -#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 - -/* - * Convert from pos in backpointer btree to pos of corresponding bucket in alloc - * btree: - */ -static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos) -{ - u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; - - return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); -} - -static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos, - u32 *bucket_offset) -{ - u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; - - return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset)); -} - -static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) -{ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); - if (ca) - *bucket = bp_pos_to_bucket(ca, bp_pos); - return ca != NULL; -} - -static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, - struct bpos bucket, - u64 bucket_offset) -{ - return POS(bucket.inode, - (bucket_to_sector(ca, bucket.offset) << - MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); -} - -/* - * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: - */ -static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca, - struct bpos bucket, - u64 bucket_offset) -{ - struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset); - EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret))); - return ret; -} - -static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, 
struct bpos bucket) -{ - return bucket_pos_to_bp(ca, bucket, 0); -} - -static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket) -{ - return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0)); -} - -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, - struct bkey_s_c, - struct bkey_i_backpointer *, - bool); - -static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, - struct bkey_s_c orig_k, - struct bkey_i_backpointer *bp, - bool insert) -{ - if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); - - if (!insert) { - bp->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp->k, 0); - } - - return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i); -} - -static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, - struct extent_ptr_decoded p, - const union bch_extent_entry *entry) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - return BCH_DATA_btree; - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - if (p.has_ec) - return BCH_DATA_stripe; - if (p.ptr.cached) - return BCH_DATA_cached; - else - return BCH_DATA_user; - case KEY_TYPE_stripe: { - const struct bch_extent_ptr *ptr = &entry->ptr; - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - BUG_ON(ptr < s.v->ptrs || - ptr >= s.v->ptrs + s.v->nr_blocks); - - return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity - : BCH_DATA_user; - } - default: - BUG(); - } -} - -static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - const union bch_extent_entry *entry, - struct bkey_i_backpointer *bp) -{ - bkey_backpointer_init(&bp->k_i); - bp->k.p.inode = p.ptr.dev; - - if (k.k->type != KEY_TYPE_stripe) - bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset; - else { - /* - * Put stripe backpointers where they won't collide with the - * extent backpointers within the stripe: - */ - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) << - MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1; - } - - bp->v = (struct bch_backpointer) { - .btree_id = btree_id, - .level = level, - .data_type = bch2_bkey_ptr_data_type(k, p, entry), - .bucket_gen = p.ptr.gen, - .bucket_len = ptr_disk_sectors(level ? 
btree_sectors(c) : k.k->size, p), - .pos = k.k->p, - }; -} - -struct bkey_buf; -struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, - struct btree_iter *, unsigned, struct bkey_buf *); -struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, - struct btree_iter *, struct bkey_buf *); - -int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64, - bool, struct bkey_buf *); - -int bch2_check_btree_backpointers(struct bch_fs *); -int bch2_check_extents_to_backpointers(struct bch_fs *); -int bch2_check_backpointers_to_extents(struct bch_fs *); - -static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i) -{ - unsigned long *bitmap = READ_ONCE(b->buckets); - return bitmap && test_bit(i, bitmap); -} - -int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64); -void bch2_bucket_bitmap_free(struct bucket_bitmap *); - -#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h deleted file mode 100644 index 63abe17f35ea..000000000000 --- a/fs/bcachefs/bbpos.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BBPOS_H -#define _BCACHEFS_BBPOS_H - -#include "bbpos_types.h" -#include "bkey_methods.h" -#include "btree_cache.h" - -static inline int bbpos_cmp(struct bbpos l, struct bbpos r) -{ - return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos); -} - -static inline struct bbpos bbpos_successor(struct bbpos pos) -{ - if (bpos_cmp(pos.pos, SPOS_MAX)) { - pos.pos = bpos_successor(pos.pos); - return pos; - } - - if (pos.btree != BTREE_ID_NR) { - pos.btree++; - pos.pos = POS_MIN; - return pos; - } - - BUG(); -} - -static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) -{ - bch2_btree_id_to_text(out, pos.btree); - prt_char(out, ':'); - bch2_bpos_to_text(out, pos.pos); -} - -#endif /* _BCACHEFS_BBPOS_H */ diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h deleted file mode 100644 index f63893344f80..000000000000 --- a/fs/bcachefs/bbpos_types.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BBPOS_TYPES_H -#define _BCACHEFS_BBPOS_TYPES_H - -struct bbpos { - enum btree_id btree; - struct bpos pos; -}; - -static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) -{ - return (struct bbpos) { btree, pos }; -} - -#define BBPOS_MIN BBPOS(0, POS_MIN) -#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX) - -#endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h deleted file mode 100644 index ddfacad0f70c..000000000000 --- a/fs/bcachefs/bcachefs.h +++ /dev/null @@ -1,1295 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_H -#define _BCACHEFS_H - -/* - * SOME HIGH LEVEL CODE DOCUMENTATION: - * - * Bcache mostly works with cache sets, cache devices, and backing devices. - * - * Support for multiple cache devices hasn't quite been finished off yet, but - * it's about 95% plumbed through. A cache set and its cache devices is sort of - * like a md raid array and its component devices. Most of the code doesn't care - * about individual cache devices, the main abstraction is the cache set. - * - * Multiple cache devices is intended to give us the ability to mirror dirty - * cached data and metadata, without mirroring clean cached data. - * - * Backing devices are different, in that they have a lifetime independent of a - * cache set. 
When you register a newly formatted backing device it'll come up - * in passthrough mode, and then you can attach and detach a backing device from - * a cache set at runtime - while it's mounted and in use. Detaching implicitly - * invalidates any cached data for that backing device. - * - * A cache set can have multiple (many) backing devices attached to it. - * - * There's also flash only volumes - this is the reason for the distinction - * between struct cached_dev and struct bcache_device. A flash only volume - * works much like a bcache device that has a backing device, except the - * "cached" data is always dirty. The end result is that we get thin - * provisioning with very little additional code. - * - * Flash only volumes work but they're not production ready because the moving - * garbage collector needs more work. More on that later. - * - * BUCKETS/ALLOCATION: - * - * Bcache is primarily designed for caching, which means that in normal - * operation all of our available space will be allocated. Thus, we need an - * efficient way of deleting things from the cache so we can write new things to - * it. - * - * To do this, we first divide the cache device up into buckets. A bucket is the - * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ - * works efficiently. - * - * Each bucket has a 16 bit priority, and an 8 bit generation associated with - * it. The gens and priorities for all the buckets are stored contiguously and - * packed on disk (in a linked list of buckets - aside from the superblock, all - * of bcache's metadata is stored in buckets). - * - * The priority is used to implement an LRU. We reset a bucket's priority when - * we allocate it or on a cache hit, and every so often we decrement the priority - * of each bucket. It could be used to implement something more sophisticated, - * if anyone ever gets around to it. - * - * The generation is used for invalidating buckets. Each pointer also has an 8 - * bit generation embedded in it; for a pointer to be considered valid, its gen - * must match the gen of the bucket it points into. Thus, to reuse a bucket all - * we have to do is increment its gen (and write its new gen to disk; we batch - * this up). - * - * Bcache is entirely COW - we never write twice to a bucket, even buckets that - * contain metadata (including btree nodes). - * - * THE BTREE: - * - * Bcache is in large part designed around the btree. - * - * At a high level, the btree is just an index of key -> ptr tuples. - * - * Keys represent extents, and thus have a size field. Keys also have a variable - * number of pointers attached to them (potentially zero, which is handy for - * invalidating the cache). - * - * The key itself is an inode:offset pair. The inode number corresponds to a - * backing device or a flash only volume. The offset is the ending offset of the - * extent within the inode - not the starting offset; this makes lookups - * slightly more convenient. - * - * Pointers contain the cache device id, the offset on that device, and an 8 bit - * generation number. More on the gen later. - * - * Index lookups are not fully abstracted - cache lookups in particular are - * still somewhat mixed in with the btree code, but things are headed in that - * direction. - * - * Updates are fairly well abstracted, though. There are two different ways of - * updating the btree; insert and replace. 
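A quick illustration of the end-offset convention described above: since a key indexes the end of its extent, the extent covering a query offset is the first key whose offset is strictly greater than the query. A minimal sketch (a linear scan standing in for the real binary search), with hypothetical types rather than the actual bcache lookup code:

#include <stddef.h>
#include <stdint.h>

struct ext { uint64_t end, len; };	/* covers [end - len, end) */

/* 'keys' is sorted by 'end', as keys are within a btree node */
static const struct ext *find_containing(const struct ext *keys, size_t nr,
					 uint64_t offset)
{
	for (size_t i = 0; i < nr; i++)
		if (keys[i].end > offset)
			return offset >= keys[i].end - keys[i].len
				? &keys[i]
				: NULL;		/* offset falls in a gap */
	return NULL;
}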
- * - * BTREE_INSERT will just take a list of keys and insert them into the btree - - * overwriting (possibly only partially) any extents they overlap with. This is - * used to update the index after a write. - * - * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is - * overwriting a key that matches another given key. This is used for inserting - * data into the cache after a cache miss, and for background writeback, and for - * the moving garbage collector. - * - * There is no "delete" operation; deleting things from the index is - * accomplished either by invalidating pointers (by incrementing a bucket's - * gen) or by inserting a key with 0 pointers - which will overwrite anything - * previously present at that location in the index. - * - * This means that there are always stale/invalid keys in the btree. They're - * filtered out by the code that iterates through a btree node, and removed when - * a btree node is rewritten. - * - * BTREE NODES: - * - * Our unit of allocation is a bucket, and we can't arbitrarily allocate and - * free smaller than a bucket - so, that's how big our btree nodes are. - * - * (If buckets are really big we'll only use part of the bucket for a btree node - * - no less than 1/4th - but a bucket still contains no more than a single - * btree node. I'd actually like to change this, but for now we rely on the - * bucket's gen for deleting btree nodes when we rewrite/split a node.) - * - * Anyways, btree nodes are big - big enough to be inefficient with a textbook - * btree implementation. - * - * The way this is solved is that btree nodes are internally log structured; we - * can append new keys to an existing btree node without rewriting it. This - * means each set of keys we write is sorted, but the node is not. - * - * We maintain this log structure in memory - keeping 1Mb of keys sorted would - * be expensive, and we have to distinguish between the keys we have written and - * the keys we haven't. So to do a lookup in a btree node, we have to search - * each sorted set. But we do merge written sets together lazily, so the cost of - * these extra searches is quite low (normally most of the keys in a btree node - * will be in one big set, and then there'll be one or two sets that are much - * smaller). - * - * This log structure makes bcache's btree more of a hybrid between a - * conventional btree and a compacting data structure, with some of the - * advantages of both. - * - * GARBAGE COLLECTION: - * - * We can't just invalidate any bucket - it might contain dirty data or - * metadata. If it once contained dirty data, other writes might overwrite it - * later, leaving no valid pointers into that bucket in the index. - * - * Thus, the primary purpose of garbage collection is to find buckets to reuse. - * It also counts how much valid data each bucket currently contains, so that - * allocation can reuse buckets sooner when they've been mostly overwritten. - * - * It also does some things that are really internal to the btree - * implementation. If a btree node contains pointers that are stale by more than - * some threshold, it rewrites the btree node to avoid the bucket's generation - * wrapping around. It also merges adjacent btree nodes if they're empty enough. 
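The multi-set lookup described above can be sketched with hypothetical types: binary search each sorted set for the first key at or after the search key, then keep the best candidate. How overlapping keys in different sets are resolved (newer sets take precedence over older ones) is left out of the sketch:

#include <stddef.h>
#include <stdint.h>

struct kv   { uint64_t key; uint64_t val; };
struct bset { const struct kv *data; size_t nr; };	/* sorted by key */
struct node { struct bset sets[4]; size_t nr_sets; };

static const struct kv *bset_search(const struct bset *s, uint64_t key)
{
	size_t l = 0, r = s->nr;

	while (l < r) {			/* first entry with ->key >= key */
		size_t m = l + (r - l) / 2;

		if (s->data[m].key < key)
			l = m + 1;
		else
			r = m;
	}
	return l < s->nr ? &s->data[l] : NULL;
}

static const struct kv *node_search(const struct node *n, uint64_t key)
{
	const struct kv *best = NULL;

	/* one search per set; cheap because most keys live in one big set */
	for (size_t i = 0; i < n->nr_sets; i++) {
		const struct kv *k = bset_search(&n->sets[i], key);

		if (k && (!best || k->key < best->key))
			best = k;
	}
	return best;
}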
- * - * THE JOURNAL: - * - * Bcache's journal is not necessary for consistency; we always strictly - * order metadata writes so that the btree and everything else is consistent on - * disk in the event of an unclean shutdown, and in fact bcache had writeback - * caching (with recovery from unclean shutdown) before journalling was - * implemented. - * - * Rather, the journal is purely a performance optimization; we can't complete a - * write until we've updated the index on disk, otherwise the cache would be - * inconsistent in the event of an unclean shutdown. This means that without the - * journal, on random write workloads we constantly have to update all the leaf - * nodes in the btree, and those writes will be mostly empty (appending at most - * a few keys each) - highly inefficient in terms of amount of metadata writes, - * and it puts more strain on the various btree resorting/compacting code. - * - * The journal is just a log of keys we've inserted; on startup we just reinsert - * all the keys in the open journal entries. That means that when we're updating - * a node in the btree, we can wait until a 4k block of keys fills up before - * writing them out. - * - * For simplicity, we only journal updates to leaf nodes; updates to parent - * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth - * the complexity to deal with journalling them (in particular, journal replay) - * - updates to non leaf nodes just happen synchronously (see btree_split()). - */ - -#undef pr_fmt -#ifdef __KERNEL__ -#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ -#else -#define pr_fmt(fmt) "%s() " fmt "\n", __func__ -#endif - -#ifdef CONFIG_BCACHEFS_DEBUG -#define ENUMERATED_REF_DEBUG -#endif - -#ifndef dynamic_fault -#define dynamic_fault(...) 0 -#endif - -#define race_fault(...) 
dynamic_fault("bcachefs:race") - -#include <linux/backing-dev-defs.h> -#include <linux/bug.h> -#include <linux/bio.h> -#include <linux/closure.h> -#include <linux/kobject.h> -#include <linux/list.h> -#include <linux/math64.h> -#include <linux/mutex.h> -#include <linux/percpu-refcount.h> -#include <linux/percpu-rwsem.h> -#include <linux/refcount.h> -#include <linux/rhashtable.h> -#include <linux/rwsem.h> -#include <linux/semaphore.h> -#include <linux/seqlock.h> -#include <linux/shrinker.h> -#include <linux/srcu.h> -#include <linux/types.h> -#include <linux/workqueue.h> -#include <linux/zstd.h> -#include <linux/unicode.h> - -#include "bcachefs_format.h" -#include "btree_journal_iter_types.h" -#include "disk_accounting_types.h" -#include "errcode.h" -#include "fast_list.h" -#include "fifo.h" -#include "nocow_locking_types.h" -#include "opts.h" -#include "sb-errors_types.h" -#include "seqmutex.h" -#include "snapshot_types.h" -#include "time_stats.h" -#include "util.h" - -#include "alloc_types.h" -#include "async_objs_types.h" -#include "btree_gc_types.h" -#include "btree_types.h" -#include "btree_node_scan_types.h" -#include "btree_write_buffer_types.h" -#include "buckets_types.h" -#include "buckets_waiting_for_journal_types.h" -#include "clock_types.h" -#include "disk_groups_types.h" -#include "ec_types.h" -#include "enumerated_ref_types.h" -#include "journal_types.h" -#include "keylist_types.h" -#include "quota_types.h" -#include "rebalance_types.h" -#include "recovery_passes_types.h" -#include "replicas_types.h" -#include "sb-members_types.h" -#include "subvolume_types.h" -#include "super_types.h" -#include "thread_with_file_types.h" - -#include "trace.h" - -#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]) - -#define trace_and_count(_c, _name, ...) 
\ -do { \ - count_event(_c, _name); \ - trace_##_name(__VA_ARGS__); \ -} while (0) - -#define bch2_fs_init_fault(name) \ - dynamic_fault("bcachefs:bch_fs_init:" name) -#define bch2_meta_read_fault(name) \ - dynamic_fault("bcachefs:meta:read:" name) -#define bch2_meta_write_fault(name) \ - dynamic_fault("bcachefs:meta:write:" name) - -#ifdef __KERNEL__ -#define BCACHEFS_LOG_PREFIX -#endif - -#ifdef BCACHEFS_LOG_PREFIX - -#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) -#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name) -#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset) -#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) -#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ - "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset) - -#else - -#define bch2_log_msg(_c, fmt) fmt -#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name) -#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset) -#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) -#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ - "inum %llu offset %llu: " fmt "\n", (_inum), (_offset) - -#endif - -#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") - -void bch2_print_str(struct bch_fs *, const char *, const char *); - -__printf(2, 3) -void bch2_print_opts(struct bch_opts *, const char *, ...); - -__printf(2, 3) -void __bch2_print(struct bch_fs *c, const char *fmt, ...); - -#define maybe_dev_to_fs(_c) _Generic((_c), \ - struct bch_dev *: ((struct bch_dev *) (_c))->fs, \ - struct bch_fs *: (_c)) - -#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__) - -#define bch2_print_ratelimited(_c, ...) \ -do { \ - static DEFINE_RATELIMIT_STATE(_rs, \ - DEFAULT_RATELIMIT_INTERVAL, \ - DEFAULT_RATELIMIT_BURST); \ - \ - if (__ratelimit(&_rs)) \ - bch2_print(_c, __VA_ARGS__); \ -} while (0) - -#define bch2_print_str_ratelimited(_c, ...) \ -do { \ - static DEFINE_RATELIMIT_STATE(_rs, \ - DEFAULT_RATELIMIT_INTERVAL, \ - DEFAULT_RATELIMIT_BURST); \ - \ - if (__ratelimit(&_rs)) \ - bch2_print_str(_c, __VA_ARGS__); \ -} while (0) - -#define bch_info(c, fmt, ...) \ - bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_info_ratelimited(c, fmt, ...) \ - bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_notice(c, fmt, ...) \ - bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_warn(c, fmt, ...) \ - bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_warn_ratelimited(c, fmt, ...) \ - bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) - -#define bch_err(c, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_err_dev(ca, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) -#define bch_err_dev_offset(ca, _offset, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) -#define bch_err_inum(c, _inum, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) -#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) - -#define bch_err_ratelimited(c, fmt, ...) \ - bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_err_dev_ratelimited(ca, fmt, ...) 
\ - bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) -#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \ - bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) -#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ - bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) -#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \ - bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) - -static inline bool should_print_err(int err) -{ - return err && !bch2_err_matches(err, BCH_ERR_transaction_restart); -} - -#define bch_err_fn(_c, _ret) \ -do { \ - if (should_print_err(_ret)) \ - bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ -} while (0) - -#define bch_err_fn_ratelimited(_c, _ret) \ -do { \ - if (should_print_err(_ret)) \ - bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ -} while (0) - -#define bch_err_msg(_c, _ret, _msg, ...) \ -do { \ - if (should_print_err(_ret)) \ - bch_err(_c, "%s(): error " _msg " %s", __func__, \ - ##__VA_ARGS__, bch2_err_str(_ret)); \ -} while (0) - -#define bch_verbose(c, fmt, ...) \ -do { \ - if ((c)->opts.verbose) \ - bch_info(c, fmt, ##__VA_ARGS__); \ -} while (0) - -#define bch_verbose_ratelimited(c, fmt, ...) \ -do { \ - if ((c)->opts.verbose) \ - bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \ -} while (0) - -#define pr_verbose_init(opts, fmt, ...) \ -do { \ - if (opt_get(opts, verbose)) \ - pr_info(fmt, ##__VA_ARGS__); \ -} while (0) - -static inline int __bch2_err_trace(struct bch_fs *c, int err) -{ - trace_error_throw(c, err, _THIS_IP_); - return err; -} - -#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err) - -/* Parameters that are useful for debugging, but should always be compiled in: */ -#define BCH_DEBUG_PARAMS_ALWAYS() \ - BCH_DEBUG_PARAM(key_merging_disabled, \ - "Disables merging of extents") \ - BCH_DEBUG_PARAM(btree_node_merging_disabled, \ - "Disables merging of btree nodes") \ - BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ - "Causes mark and sweep to compact and rewrite every " \ - "btree node it traverses") \ - BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ - "Disables rewriting of btree nodes during mark and sweep")\ - BCH_DEBUG_PARAM(btree_shrinker_disabled, \ - "Disables the shrinker callback for the btree node cache")\ - BCH_DEBUG_PARAM(verify_btree_ondisk, \ - "Reread btree nodes at various points to verify the " \ - "mergesort in the read path against modifications " \ - "done in memory") \ - BCH_DEBUG_PARAM(verify_all_btree_replicas, \ - "When reading btree nodes, read all replicas and " \ - "compare them") \ - BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ - "Don't use the write buffer for backpointers, enabling "\ - "extra runtime checks") \ - BCH_DEBUG_PARAM(debug_check_btree_locking, \ - "Enable additional asserts for btree locking") \ - BCH_DEBUG_PARAM(debug_check_iterators, \ - "Enables extra verification for btree iterators") \ - BCH_DEBUG_PARAM(debug_check_bset_lookups, \ - "Enables extra verification for bset lookups") \ - BCH_DEBUG_PARAM(debug_check_btree_accounting, \ - "Verify btree accounting for keys within a node") \ - BCH_DEBUG_PARAM(debug_check_bkey_unpack, \ - "Enables extra verification for bkey unpack") - -/* Parameters that should only be compiled in debug mode: */ -#define BCH_DEBUG_PARAMS_DEBUG() \ - BCH_DEBUG_PARAM(journal_seq_verify, \ - "Store the journal sequence number in the version " \ - 
"number of every btree key, and verify that btree " \ - "update ordering is preserved during recovery") \ - BCH_DEBUG_PARAM(inject_invalid_keys, \ - "Store the journal sequence number in the version " \ - "number of every btree key, and verify that btree " \ - "update ordering is preserved during recovery") \ - BCH_DEBUG_PARAM(test_alloc_startup, \ - "Force allocator startup to use the slowpath where it" \ - "can't find enough free buckets without invalidating" \ - "cached data") \ - BCH_DEBUG_PARAM(force_reconstruct_read, \ - "Force reads to use the reconstruct path, when reading" \ - "from erasure coded extents") \ - BCH_DEBUG_PARAM(test_restart_gc, \ - "Test restarting mark and sweep gc when bucket gens change") - -#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() - -#ifdef CONFIG_BCACHEFS_DEBUG -#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() -#else -#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() -#endif - -#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name; -BCH_DEBUG_PARAMS_ALL() -#undef BCH_DEBUG_PARAM - -#define BCH_TIME_STATS() \ - x(btree_node_mem_alloc) \ - x(btree_node_split) \ - x(btree_node_compact) \ - x(btree_node_merge) \ - x(btree_node_sort) \ - x(btree_node_get) \ - x(btree_node_read) \ - x(btree_node_read_done) \ - x(btree_node_write) \ - x(btree_interior_update_foreground) \ - x(btree_interior_update_total) \ - x(btree_gc) \ - x(data_write) \ - x(data_write_to_submit) \ - x(data_write_to_queue) \ - x(data_write_to_btree_update) \ - x(data_write_btree_update) \ - x(data_read) \ - x(data_promote) \ - x(journal_flush_write) \ - x(journal_noflush_write) \ - x(journal_flush_seq) \ - x(blocked_journal_low_on_space) \ - x(blocked_journal_low_on_pin) \ - x(blocked_journal_max_in_flight) \ - x(blocked_journal_max_open) \ - x(blocked_key_cache_flush) \ - x(blocked_allocate) \ - x(blocked_allocate_open_bucket) \ - x(blocked_write_buffer_full) \ - x(nocow_lock_contended) - -enum bch_time_stats { -#define x(name) BCH_TIME_##name, - BCH_TIME_STATS() -#undef x - BCH_TIME_STAT_NR -}; - -/* Number of nodes btree coalesce will try to coalesce at once */ -#define GC_MERGE_NODES 4U - -/* Maximum number of nodes we might need to allocate atomically: */ -#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) - -/* Size of the freelist we allocate btree nodes from: */ -#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) - -#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) - -struct btree; - -struct io_count { - u64 sectors[2][BCH_DATA_NR]; -}; - -struct discard_in_flight { - bool in_progress:1; - u64 bucket:63; -}; - -#define BCH_DEV_READ_REFS() \ - x(bch2_online_devs) \ - x(trans_mark_dev_sbs) \ - x(read_fua_test) \ - x(sb_field_resize) \ - x(write_super) \ - x(journal_read) \ - x(fs_journal_alloc) \ - x(fs_resize_on_mount) \ - x(btree_node_read) \ - x(btree_node_read_all_replicas) \ - x(btree_node_scrub) \ - x(btree_node_write) \ - x(btree_node_scan) \ - x(btree_verify_replicas) \ - x(btree_node_ondisk_to_text) \ - x(io_read) \ - x(check_extent_checksums) \ - x(ec_block) - -enum bch_dev_read_ref { -#define x(n) BCH_DEV_READ_REF_##n, - BCH_DEV_READ_REFS() -#undef x - BCH_DEV_READ_REF_NR, -}; - -#define BCH_DEV_WRITE_REFS() \ - x(journal_write) \ - x(journal_do_discards) \ - x(dev_do_discards) \ - x(discard_one_bucket_fast) \ - x(do_invalidates) \ - x(nocow_flush) \ - x(io_write) \ - x(ec_block) \ - x(ec_bucket_zero) - -enum bch_dev_write_ref { -#define x(n) 
BCH_DEV_WRITE_REF_##n, - BCH_DEV_WRITE_REFS() -#undef x - BCH_DEV_WRITE_REF_NR, -}; - -struct bucket_bitmap { - unsigned long *buckets; - u64 nr; - struct mutex lock; -}; - -struct bch_dev { - struct kobject kobj; -#ifdef CONFIG_BCACHEFS_DEBUG - atomic_long_t ref; - bool dying; - unsigned long last_put; -#else - struct percpu_ref ref; -#endif - struct completion ref_completion; - struct enumerated_ref io_ref[2]; - - struct bch_fs *fs; - - u8 dev_idx; - /* - * Cached version of this device's member info from superblock - * Committed by bch2_write_super() -> bch_fs_mi_update() - */ - struct bch_member_cpu mi; - atomic64_t errors[BCH_MEMBER_ERROR_NR]; - unsigned long write_errors_start; - - __uuid_t uuid; - char name[BDEVNAME_SIZE]; - - struct bch_sb_handle disk_sb; - struct bch_sb *sb_read_scratch; - int sb_write_error; - dev_t dev; - atomic_t flush_seq; - - struct bch_devs_mask self; - - /* - * Buckets: - * Per-bucket arrays are protected by either rcu_read_lock or - * state_lock, for device resize. - */ - GENRADIX(struct bucket) buckets_gc; - struct bucket_gens __rcu *bucket_gens; - u8 *oldest_gen; - unsigned long *buckets_nouse; - - struct bucket_bitmap bucket_backpointer_mismatch; - struct bucket_bitmap bucket_backpointer_empty; - - struct bch_dev_usage_full __percpu - *usage; - - /* Allocator: */ - u64 alloc_cursor[3]; - - unsigned nr_open_buckets; - unsigned nr_partial_buckets; - unsigned nr_btree_reserve; - - struct work_struct invalidate_work; - struct work_struct discard_work; - struct mutex discard_buckets_in_flight_lock; - DARRAY(struct discard_in_flight) discard_buckets_in_flight; - struct work_struct discard_fast_work; - - atomic64_t rebalance_work; - - struct journal_device journal; - u64 prev_journal_sector; - - struct work_struct io_error_work; - - /* The rest of this all shows up in sysfs */ - atomic64_t cur_latency[2]; - struct bch2_time_stats_quantiles io_latency[2]; - -#define CONGESTED_MAX 1024 - atomic_t congested; - u64 congested_last; - - struct io_count __percpu *io_done; -}; - -/* - * initial_gc_unfixed - * error - * topology error - */ - -#define BCH_FS_FLAGS() \ - x(new_fs) \ - x(started) \ - x(clean_recovery) \ - x(btree_running) \ - x(accounting_replay_done) \ - x(may_go_rw) \ - x(rw) \ - x(rw_init_done) \ - x(was_rw) \ - x(stopping) \ - x(emergency_ro) \ - x(going_ro) \ - x(write_disable_complete) \ - x(clean_shutdown) \ - x(in_recovery) \ - x(in_fsck) \ - x(initial_gc_unfixed) \ - x(need_delete_dead_snapshots) \ - x(error) \ - x(topology_error) \ - x(errors_fixed) \ - x(errors_not_fixed) \ - x(no_invalid_checks) \ - x(discard_mount_opt_set) \ - -enum bch_fs_flags { -#define x(n) BCH_FS_##n, - BCH_FS_FLAGS() -#undef x -}; - -struct btree_debug { - unsigned id; -}; - -#define BCH_TRANSACTIONS_NR 128 - -struct btree_transaction_stats { - struct bch2_time_stats duration; - struct bch2_time_stats lock_hold_times; - struct mutex lock; - unsigned nr_max_paths; - unsigned max_mem; -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - darray_trans_kmalloc_trace trans_kmalloc_trace; -#endif - char *max_paths_text; -}; - -struct bch_fs_pcpu { - u64 sectors_available; -}; - -struct journal_seq_blacklist_table { - size_t nr; - struct journal_seq_blacklist_table_entry { - u64 start; - u64 end; - bool dirty; - } entries[]; -}; - -struct btree_trans_buf { - struct btree_trans *trans; -}; - -#define BCH_WRITE_REFS() \ - x(journal) \ - x(trans) \ - x(write) \ - x(promote) \ - x(node_rewrite) \ - x(stripe_create) \ - x(stripe_delete) \ - x(reflink) \ - x(fallocate) \ - x(fsync) \ - 
x(dio_write) \ - x(discard) \ - x(discard_fast) \ - x(check_discard_freespace_key) \ - x(invalidate) \ - x(delete_dead_snapshots) \ - x(gc_gens) \ - x(snapshot_delete_pagecache) \ - x(sysfs) \ - x(btree_write_buffer) \ - x(btree_node_scrub) \ - x(async_recovery_passes) \ - x(ioctl_data) - -enum bch_write_ref { -#define x(n) BCH_WRITE_REF_##n, - BCH_WRITE_REFS() -#undef x - BCH_WRITE_REF_NR, -}; - -#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0) - -struct bch_fs { - struct closure cl; - - struct list_head list; - struct kobject kobj; - struct kobject counters_kobj; - struct kobject internal; - struct kobject opts_dir; - struct kobject time_stats; - unsigned long flags; - - int minor; - struct device *chardev; - struct super_block *vfs_sb; - dev_t dev; - char name[40]; - struct stdio_redirect *stdio; - struct task_struct *stdio_filter; - - /* ro/rw, add/remove/resize devices: */ - struct rw_semaphore state_lock; - - /* Counts outstanding writes, for clean transition to read-only */ - struct enumerated_ref writes; - /* - * Certain operations are only allowed in single threaded mode, during - * recovery, and we want to assert that this is the case: - */ - struct task_struct *recovery_task; - - /* - * Analagous to c->writes, for asynchronous ops that don't necessarily - * need fs to be read-write - */ - refcount_t ro_ref; - wait_queue_head_t ro_ref_wait; - - struct work_struct read_only_work; - - struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; - - struct bch_accounting_mem accounting; - - struct bch_replicas_cpu replicas; - struct bch_replicas_cpu replicas_gc; - struct mutex replicas_gc_lock; - - struct journal_entry_res btree_root_journal_res; - struct journal_entry_res clock_journal_res; - - struct bch_disk_groups_cpu __rcu *disk_groups; - - struct bch_opts opts; - - /* Updated by bch2_sb_update():*/ - struct { - __uuid_t uuid; - __uuid_t user_uuid; - - u16 version; - u16 version_incompat; - u16 version_incompat_allowed; - u16 version_min; - u16 version_upgrade_complete; - - u8 nr_devices; - u8 clean; - bool multi_device; /* true if we've ever had more than one device */ - - u8 encryption_type; - - u64 time_base_lo; - u32 time_base_hi; - unsigned time_units_per_sec; - unsigned nsec_per_time_unit; - u64 features; - u64 compat; - u64 recovery_passes_required; - unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)]; - u64 btrees_lost_data; - } sb; - DARRAY(enum bcachefs_metadata_version) - incompat_versions_requested; - - struct unicode_map *cf_encoding; - - struct bch_sb_handle disk_sb; - - unsigned short block_bits; /* ilog2(block_size) */ - - u16 btree_foreground_merge_threshold; - - struct closure sb_write; - struct mutex sb_lock; - - /* snapshot.c: */ - struct snapshot_table __rcu *snapshots; - struct mutex snapshot_table_lock; - struct rw_semaphore snapshot_create_lock; - - struct snapshot_delete snapshot_delete; - struct work_struct snapshot_wait_for_pagecache_and_delete_work; - snapshot_id_list snapshots_unlinked; - struct mutex snapshots_unlinked_lock; - - /* BTREE CACHE */ - struct bio_set btree_bio; - struct workqueue_struct *btree_read_complete_wq; - struct workqueue_struct *btree_write_submit_wq; - - struct btree_root btree_roots_known[BTREE_ID_NR]; - DARRAY(struct btree_root) btree_roots_extra; - struct mutex btree_root_lock; - - struct btree_cache btree_cache; - - /* - * Cache of allocated btree nodes - if we allocate a btree node and - * don't use it, if we free it that space can't be reused until going - * _all_ the way through the allocator (which exposes us 
to a livelock - * when allocating btree reserves fail halfway through) - instead, we - * can stick them here: - */ - struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; - unsigned btree_reserve_cache_nr; - struct mutex btree_reserve_cache_lock; - - mempool_t btree_interior_update_pool; - struct list_head btree_interior_update_list; - struct list_head btree_interior_updates_unwritten; - struct mutex btree_interior_update_lock; - struct closure_waitlist btree_interior_update_wait; - - struct workqueue_struct *btree_interior_update_worker; - struct work_struct btree_interior_update_work; - - struct workqueue_struct *btree_node_rewrite_worker; - struct list_head btree_node_rewrites; - struct list_head btree_node_rewrites_pending; - spinlock_t btree_node_rewrites_lock; - struct closure_waitlist btree_node_rewrites_wait; - - /* btree_io.c: */ - spinlock_t btree_write_error_lock; - struct btree_write_stats { - atomic64_t nr; - atomic64_t bytes; - } btree_write_stats[BTREE_WRITE_TYPE_NR]; - - /* btree_iter.c: */ - struct seqmutex btree_trans_lock; - struct list_head btree_trans_list; - mempool_t btree_trans_pool; - mempool_t btree_trans_mem_pool; - struct btree_trans_buf __percpu *btree_trans_bufs; - - struct srcu_struct btree_trans_barrier; - bool btree_trans_barrier_initialized; - - struct btree_key_cache btree_key_cache; - unsigned btree_key_cache_btrees; - - struct btree_write_buffer btree_write_buffer; - - struct workqueue_struct *btree_update_wq; - struct workqueue_struct *btree_write_complete_wq; - /* copygc needs its own workqueue for index updates.. */ - struct workqueue_struct *copygc_wq; - /* - * Use a dedicated wq for write ref holder tasks. Required to avoid - * dependency problems with other wq tasks that can block on ref - * draining, such as read-only transition. 
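The workqueue rationale above is a general deadlock-avoidance pattern, sketched here with illustrative names and flags rather than the actual bcachefs setup: work that holds a write ref gets a queue of its own, so it can never sit behind an item that is itself waiting for write refs to drain.

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_write_ref_wq;
static struct work_struct example_work;	/* runs while holding a write ref */

static int example_setup(void)
{
	/* flags are illustrative; what matters is that this queue is not
	 * shared with work that can block on write-ref draining */
	example_write_ref_wq = alloc_workqueue("example_write_ref",
					       WQ_FREEZABLE, 0);
	return example_write_ref_wq ? 0 : -ENOMEM;
}

static void example_queue(void)
{
	queue_work(example_write_ref_wq, &example_work);
}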
- */ - struct workqueue_struct *write_ref_wq; - - /* ALLOCATION */ - struct bch_devs_mask online_devs; - struct bch_devs_mask rw_devs[BCH_DATA_NR]; - unsigned long rw_devs_change_count; - - u64 capacity; /* sectors */ - u64 reserved; /* sectors */ - - /* - * When capacity _decreases_ (due to a disk being removed), we - * increment capacity_gen - this invalidates outstanding reservations - * and forces them to be revalidated - */ - u32 capacity_gen; - unsigned bucket_size_max; - - atomic64_t sectors_available; - struct mutex sectors_available_lock; - - struct bch_fs_pcpu __percpu *pcpu; - - struct percpu_rw_semaphore mark_lock; - - seqcount_t usage_lock; - struct bch_fs_usage_base __percpu *usage; - u64 __percpu *online_reserved; - - unsigned long allocator_last_stuck; - - struct io_clock io_clock[2]; - - /* JOURNAL SEQ BLACKLIST */ - struct journal_seq_blacklist_table * - journal_seq_blacklist_table; - - /* ALLOCATOR */ - spinlock_t freelist_lock; - struct closure_waitlist freelist_wait; - - open_bucket_idx_t open_buckets_freelist; - open_bucket_idx_t open_buckets_nr_free; - struct closure_waitlist open_buckets_wait; - struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; - open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; - - open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; - open_bucket_idx_t open_buckets_partial_nr; - - struct write_point btree_write_point; - struct write_point rebalance_write_point; - - struct write_point write_points[WRITE_POINT_MAX]; - struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; - struct mutex write_points_hash_lock; - unsigned write_points_nr; - - struct buckets_waiting_for_journal buckets_waiting_for_journal; - - /* GARBAGE COLLECTION */ - struct work_struct gc_gens_work; - unsigned long gc_count; - - enum btree_id gc_gens_btree; - struct bpos gc_gens_pos; - - /* - * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] - * has been marked by GC. - * - * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.) - * - * Protected by gc_pos_lock. Only written to by GC thread, so GC thread - * can read without a lock. - */ - seqcount_t gc_pos_lock; - struct gc_pos gc_pos; - - /* - * The allocation code needs gc_mark in struct bucket to be correct, but - * it's not while a gc is in progress. 
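For the seqcount-protected gc_pos mentioned above, a lockless reader would look roughly like this. This is a sketch against the fields declared in this struct, not the actual helper; the GC thread, as the only writer, reads the value directly:

#include <linux/seqlock.h>

static struct gc_pos gc_pos_read(struct bch_fs *c)
{
	struct gc_pos pos;
	unsigned seq;

	do {
		seq = read_seqcount_begin(&c->gc_pos_lock);
		pos = c->gc_pos;
	} while (read_seqcount_retry(&c->gc_pos_lock, seq));

	return pos;
}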
- */ - struct rw_semaphore gc_lock; - struct mutex gc_gens_lock; - - /* IO PATH */ - struct semaphore io_in_flight; - struct bio_set bio_read; - struct bio_set bio_read_split; - struct bio_set bio_write; - struct bio_set replica_set; - struct mutex bio_bounce_pages_lock; - mempool_t bio_bounce_pages; - struct bucket_nocow_lock_table - nocow_locks; - struct rhashtable promote_table; - -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR]; -#endif - - mempool_t compression_bounce[2]; - mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; - size_t zstd_workspace_size; - - struct bch_key chacha20_key; - bool chacha20_key_set; - - atomic64_t key_version; - - mempool_t large_bkey_pool; - - /* MOVE.C */ - struct list_head moving_context_list; - struct mutex moving_context_lock; - - /* REBALANCE */ - struct bch_fs_rebalance rebalance; - - /* COPYGC */ - struct task_struct *copygc_thread; - struct write_point copygc_write_point; - s64 copygc_wait_at; - s64 copygc_wait; - bool copygc_running; - wait_queue_head_t copygc_running_wq; - - /* STRIPES: */ - GENRADIX(struct gc_stripe) gc_stripes; - - struct hlist_head ec_stripes_new[32]; - spinlock_t ec_stripes_new_lock; - - /* ERASURE CODING */ - struct list_head ec_stripe_head_list; - struct mutex ec_stripe_head_lock; - - struct list_head ec_stripe_new_list; - struct mutex ec_stripe_new_lock; - wait_queue_head_t ec_stripe_new_wait; - - struct work_struct ec_stripe_create_work; - u64 ec_stripe_hint; - - struct work_struct ec_stripe_delete_work; - - struct bio_set ec_bioset; - - /* REFLINK */ - reflink_gc_table reflink_gc_table; - size_t reflink_gc_nr; - - /* fs.c */ - struct list_head vfs_inodes_list; - struct mutex vfs_inodes_lock; - struct rhashtable vfs_inodes_table; - struct rhltable vfs_inodes_by_inum_table; - - /* VFS IO PATH - fs-io.c */ - struct bio_set writepage_bioset; - struct bio_set dio_write_bioset; - struct bio_set dio_read_bioset; - struct bio_set nocow_flush_bioset; - - /* QUOTAS */ - struct bch_memquota_type quotas[QTYP_NR]; - - /* RECOVERY */ - u64 journal_replay_seq_start; - u64 journal_replay_seq_end; - struct bch_fs_recovery recovery; - - /* DEBUG JUNK */ - struct dentry *fs_debug_dir; - struct dentry *btree_debug_dir; - struct dentry *async_obj_dir; - struct btree_debug btree_debug[BTREE_ID_NR]; - struct btree *verify_data; - struct btree_node *verify_ondisk; - struct mutex verify_lock; - - /* - * A btree node on disk could have too many bsets for an iterator to fit - * on the stack - have to dynamically allocate them - */ - mempool_t fill_iter; - - mempool_t btree_bounce_pool; - - struct journal journal; - GENRADIX(struct journal_replay *) journal_entries; - u64 journal_entries_base_seq; - struct journal_keys journal_keys; - struct list_head journal_iters; - - struct find_btree_nodes found_btree_nodes; - - u64 last_bucket_seq_cleanup; - - u64 counters_on_mount[BCH_COUNTER_NR]; - u64 __percpu *counters; - - struct bch2_time_stats times[BCH_TIME_STAT_NR]; - - struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; - - /* ERRORS */ - struct list_head fsck_error_msgs; - struct mutex fsck_error_msgs_lock; - bool fsck_alloc_msgs_err; - - bch_sb_errors_cpu fsck_error_counts; - struct mutex fsck_error_counts_lock; -}; - -extern struct wait_queue_head bch2_read_only_wait; - -static inline bool bch2_ro_ref_tryget(struct bch_fs *c) -{ - if (test_bit(BCH_FS_stopping, &c->flags)) - return false; - - return refcount_inc_not_zero(&c->ro_ref); -} - -static inline void 
bch2_ro_ref_put(struct bch_fs *c) -{ - if (refcount_dec_and_test(&c->ro_ref)) - wake_up(&c->ro_ref_wait); -} - -static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) -{ -#ifndef NO_BCACHEFS_FS - if (c->vfs_sb) - c->vfs_sb->s_bdi->ra_pages = ra_pages; -#endif -} - -static inline unsigned bucket_bytes(const struct bch_dev *ca) -{ - return ca->mi.bucket_size << 9; -} - -static inline unsigned block_bytes(const struct bch_fs *c) -{ - return c->opts.block_size; -} - -static inline unsigned block_sectors(const struct bch_fs *c) -{ - return c->opts.block_size >> 9; -} - -static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) -{ - return c->btree_key_cache_btrees & (1U << btree); -} - -static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) -{ - struct timespec64 t; - s64 sec; - s32 rem; - - time += c->sb.time_base_lo; - - sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); - - set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit); - - return t; -} - -static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) -{ - return (ts.tv_sec * c->sb.time_units_per_sec + - (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; -} - -static inline s64 bch2_current_time(const struct bch_fs *c) -{ - struct timespec64 now; - - ktime_get_coarse_real_ts64(&now); - return timespec_to_bch2_time(c, now); -} - -static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw) -{ - return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX); -} - -static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) -{ - struct stdio_redirect *stdio = c->stdio; - - if (c->stdio_filter && c->stdio_filter != current) - stdio = NULL; - return stdio; -} - -static inline unsigned metadata_replicas_required(struct bch_fs *c) -{ - return min(c->opts.metadata_replicas, - c->opts.metadata_replicas_required); -} - -static inline unsigned data_replicas_required(struct bch_fs *c) -{ - return min(c->opts.data_replicas, - c->opts.data_replicas_required); -} - -#define BKEY_PADDED_ONSTACK(key, pad) \ - struct { struct bkey_i key; __u64 key ## _pad[pad]; } - -/* - * This is needed because discard is both a filesystem option and a device - * option, and mount options are supposed to apply to that mount and not be - * persisted, i.e. if it's set as a mount option we can't propagate it to the - * device. - */ -static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) -{ - return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) - ? c->opts.discard - : ca->mi.discard; -} - -static inline bool bch2_fs_casefold_enabled(struct bch_fs *c) -{ -#ifdef CONFIG_UNICODE - return !c->opts.casefold_disabled; -#else - return false; -#endif -} - -#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h deleted file mode 100644 index b4a04df5ea95..000000000000 --- a/fs/bcachefs/bcachefs_format.h +++ /dev/null @@ -1,1545 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FORMAT_H -#define _BCACHEFS_FORMAT_H - -/* - * bcachefs on disk data structures - * - * OVERVIEW: - * - * There are three main types of on disk data structures in bcachefs (this is - * reduced from 5 in bcache) - * - * - superblock - * - journal - * - btree - * - * The btree is the primary structure; most metadata exists as keys in the - * various btrees. 
There are only a small number of btrees, they're not - * sharded - we have one btree for extents, another for inodes, et cetera. - * - * SUPERBLOCK: - * - * The superblock contains the location of the journal, the list of devices in - * the filesystem, and in general any metadata we need in order to decide - * whether we can start a filesystem or prior to reading the journal/btree - * roots. - * - * The superblock is extensible, and most of the contents of the superblock are - * in variable length, type tagged fields; see struct bch_sb_field. - * - * Backup superblocks do not reside in a fixed location; also, superblocks do - * not have a fixed size. To locate backup superblocks we have struct - * bch_sb_layout; we store a copy of this inside every superblock, and also - * before the first superblock. - * - * JOURNAL: - * - * The journal primarily records btree updates in the order they occurred; - * journal replay consists of just iterating over all the keys in the open - * journal entries and re-inserting them into the btrees. - * - * The journal also contains entry types for the btree roots, and blacklisted - * journal sequence numbers (see journal_seq_blacklist.c). - * - * BTREE: - * - * bcachefs btrees are copy on write b+ trees, where nodes are big (typically - * 128k-256k) and log structured. We use struct btree_node for writing the first - * entry in a given node (offset 0), and struct btree_node_entry for all - * subsequent writes. - * - * After the header, btree node entries contain a list of keys in sorted order. - * Values are stored inline with the keys; since values are variable length (and - * keys effectively are variable length too, due to packing) we can't do random - * access without building up additional in memory tables in the btree node read - * path. - * - * BTREE KEYS (struct bkey): - * - * The various btrees share a common format for the key - so as to avoid - * switching in fastpath lookup/comparison code - but define their own - * structures for the key values. - * - * The size of a key/value pair is stored as a u8 in units of u64s, so the max - * size is just under 2k. The common part also contains a type tag for the - * value, and a format field indicating whether the key is packed or not (and - * also meant to allow adding new key fields in the future, if desired). - * - * bkeys, when stored within a btree node, may also be packed. In that case, the - * bkey_format in that node is used to unpack it. Packed bkeys mean that we can - * be generous with field sizes in the common part of the key format (64 bit - * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. 
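The packing described here boils down to per-field delta encoding: each field of a packed key is stored as a delta from a per-node offset, in a per-node number of bits. A sketch of just that arithmetic, with hypothetical names and the actual bit-stream encoding left out:

#include <stdint.h>

struct fmt_sketch {
	uint8_t  bits_per_field[6];
	uint64_t field_offset[6];
};

static uint64_t unpack_field(const struct fmt_sketch *f, unsigned i,
			     uint64_t packed)
{
	return f->field_offset[i] + packed;	/* offset + stored delta */
}

/* returns 0 if v can't be represented in this node's format */
static int pack_field(const struct fmt_sketch *f, unsigned i,
		      uint64_t v, uint64_t *packed)
{
	unsigned bits = f->bits_per_field[i];

	if (v < f->field_offset[i])
		return 0;
	*packed = v - f->field_offset[i];
	return bits >= 64 || *packed < (1ULL << bits);
}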
- */ - -#include <asm/types.h> -#include <asm/byteorder.h> -#include <linux/kernel.h> -#include <linux/uuid.h> -#include <uapi/linux/magic.h> -#include "vstructs.h" - -#ifdef __KERNEL__ -typedef uuid_t __uuid_t; -#endif - -#define BITMASK(name, type, field, offset, end) \ -static const __maybe_unused unsigned name##_OFFSET = offset; \ -static const __maybe_unused unsigned name##_BITS = (end - offset); \ - \ -static inline __u64 name(const type *k) \ -{ \ - return (k->field >> offset) & ~(~0ULL << (end - offset)); \ -} \ - \ -static inline void SET_##name(type *k, __u64 v) \ -{ \ - k->field &= ~(~(~0ULL << (end - offset)) << offset); \ - k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ -} - -#define LE_BITMASK(_bits, name, type, field, offset, end) \ -static const __maybe_unused unsigned name##_OFFSET = offset; \ -static const __maybe_unused unsigned name##_BITS = (end - offset); \ -static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\ - \ -static inline __u64 name(const type *k) \ -{ \ - return (__le##_bits##_to_cpu(k->field) >> offset) & \ - ~(~0ULL << (end - offset)); \ -} \ - \ -static inline void SET_##name(type *k, __u64 v) \ -{ \ - __u##_bits new = __le##_bits##_to_cpu(k->field); \ - \ - new &= ~(~(~0ULL << (end - offset)) << offset); \ - new |= (v & ~(~0ULL << (end - offset))) << offset; \ - k->field = __cpu_to_le##_bits(new); \ -} - -#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) -#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) -#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) - -struct bkey_format { - __u8 key_u64s; - __u8 nr_fields; - /* One unused slot for now: */ - __u8 bits_per_field[6]; - __le64 field_offset[6]; -}; - -/* Btree keys - all units are in sectors */ - -struct bpos { - /* - * Word order matches machine byte order - btree code treats a bpos as a - * single large integer, for search/comparison purposes - * - * Note that wherever a bpos is embedded in another on disk data - * structure, it has to be byte swabbed when reading in metadata that - * wasn't written in native endian order: - */ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - __u32 snapshot; - __u64 offset; - __u64 inode; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - __u64 inode; - __u64 offset; /* Points to end of extent - sectors */ - __u32 snapshot; -#else -#error edit for your odd byteorder. 
-#endif -} __packed -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -__aligned(4) -#endif -; - -#define KEY_INODE_MAX ((__u64)~0ULL) -#define KEY_OFFSET_MAX ((__u64)~0ULL) -#define KEY_SNAPSHOT_MAX ((__u32)~0U) -#define KEY_SIZE_MAX ((__u32)~0U) - -static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) -{ - return (struct bpos) { - .inode = inode, - .offset = offset, - .snapshot = snapshot, - }; -} - -#define POS_MIN SPOS(0, 0, 0) -#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0) -#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) -#define POS(_inode, _offset) SPOS(_inode, _offset, 0) - -/* Empty placeholder struct, for container_of() */ -struct bch_val { - __u64 __nothing[0]; -}; - -struct bversion { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - __u64 lo; - __u32 hi; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - __u32 hi; - __u64 lo; -#endif -} __packed -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -__aligned(4) -#endif -; - -struct bkey { - /* Size of combined key and value, in u64s */ - __u8 u64s; - - /* Format of key (0 for format local to btree node) */ -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 format:7, - needs_whiteout:1; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u8 needs_whiteout:1, - format:7; -#else -#error edit for your odd byteorder. -#endif - - /* Type of the value */ - __u8 type; - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - __u8 pad[1]; - - struct bversion bversion; - __u32 size; /* extent size, in sectors */ - struct bpos p; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - struct bpos p; - __u32 size; /* extent size, in sectors */ - struct bversion bversion; - - __u8 pad[1]; -#endif -} __packed -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -/* - * The big-endian version of bkey can't be compiled by rustc with the "aligned" - * attr since it doesn't allow types to have both "packed" and "aligned" attrs. - * So for Rust compatibility, don't include this. It can be included in the LE - * version because the "packed" attr is redundant in that case. - * - * History: (quoting Kent) - * - * Specifically, when I was designing bkey, I wanted the header to be no - * bigger than necessary so that bkey_packed could use the rest. That means that - * decently often extent keys will fit into only 8 bytes, instead of spilling over - * to 16. - * - * But packed_bkey treats the part after the header - the packed section - - * as a single multi word, variable length integer. And bkey, the unpacked - * version, is just a special case version of a bkey_packed; all the packed - * bkey code will work on keys in any packed format, the in-memory - * representation of an unpacked key also is just one type of packed key... - * - * So that constrains the key part of a big endian bkey to start right - * after the header. - * - * If we ever do a bkey_v2 and need to expand the header by another byte for - * some reason - that will clean up this wart. 
- */ -__aligned(8) -#endif -; - -struct bkey_packed { - __u64 _data[0]; - - /* Size of combined key and value, in u64s */ - __u8 u64s; - - /* Format of key (0 for format local to btree node) */ - - /* - * XXX: next incompat on disk format change, switch format and - * needs_whiteout - bkey_packed() will be cheaper if format is the high - * bits of the bitfield - */ -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 format:7, - needs_whiteout:1; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u8 needs_whiteout:1, - format:7; -#endif - - /* Type of the value */ - __u8 type; - __u8 key_start[0]; - - /* - * We copy bkeys with struct assignment in various places, and while - * that shouldn't be done with packed bkeys we can't disallow it in C, - * and it's legal to cast a bkey to a bkey_packed - so padding it out - * to the same size as struct bkey should hopefully be safest. - */ - __u8 pad[sizeof(struct bkey) - 3]; -} __packed __aligned(8); - -typedef struct { - __le64 lo; - __le64 hi; -} bch_le128; - -#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) -#define BKEY_U64s_MAX U8_MAX -#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) - -#define KEY_PACKED_BITS_START 24 - -#define KEY_FORMAT_LOCAL_BTREE 0 -#define KEY_FORMAT_CURRENT 1 - -enum bch_bkey_fields { - BKEY_FIELD_INODE, - BKEY_FIELD_OFFSET, - BKEY_FIELD_SNAPSHOT, - BKEY_FIELD_SIZE, - BKEY_FIELD_VERSION_HI, - BKEY_FIELD_VERSION_LO, - BKEY_NR_FIELDS, -}; - -#define bkey_format_field(name, field) \ - [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) - -#define BKEY_FORMAT_CURRENT \ -((struct bkey_format) { \ - .key_u64s = BKEY_U64s, \ - .nr_fields = BKEY_NR_FIELDS, \ - .bits_per_field = { \ - bkey_format_field(INODE, p.inode), \ - bkey_format_field(OFFSET, p.offset), \ - bkey_format_field(SNAPSHOT, p.snapshot), \ - bkey_format_field(SIZE, size), \ - bkey_format_field(VERSION_HI, bversion.hi), \ - bkey_format_field(VERSION_LO, bversion.lo), \ - }, \ -}) - -/* bkey with inline value */ -struct bkey_i { - __u64 _data[0]; - - struct bkey k; - struct bch_val v; -}; - -#define POS_KEY(_pos) \ -((struct bkey) { \ - .u64s = BKEY_U64s, \ - .format = KEY_FORMAT_CURRENT, \ - .p = _pos, \ -}) - -#define KEY(_inode, _offset, _size) \ -((struct bkey) { \ - .u64s = BKEY_U64s, \ - .format = KEY_FORMAT_CURRENT, \ - .p = POS(_inode, _offset), \ - .size = _size, \ -}) - -static inline void bkey_init(struct bkey *k) -{ - *k = KEY(0, 0, 0); -} - -#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) - -#define __BKEY_PADDED(key, pad) \ - struct bkey_i key; __u64 key ## _pad[pad] - -enum bch_bkey_type_flags { - BKEY_TYPE_strict_btree_checks = BIT(0), -}; - -/* - * - DELETED keys are used internally to mark keys that should be ignored but - * override keys in composition order. Their version number is ignored. - * - * - DISCARDED keys indicate that the data is all 0s because it has been - * discarded. DISCARDs may have a version; if the version is nonzero the key - * will be persistent, otherwise the key will be dropped whenever the btree - * node is rewritten (like DELETED keys). - * - * - ERROR: any read of the data returns a read error, as the data was lost due - * to a failing device. Like DISCARDED keys, they can be removed (overridden) - * by new writes or cluster-wide GC. Node repair can also overwrite them with - * the same or a more recent version number, but not with an older version - * number. 
- * - * - WHITEOUT: for hash table btrees - */ -#define BCH_BKEY_TYPES() \ - x(deleted, 0, 0) \ - x(whiteout, 1, 0) \ - x(error, 2, 0) \ - x(cookie, 3, 0) \ - x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \ - x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \ - x(extent, 6, BKEY_TYPE_strict_btree_checks) \ - x(reservation, 7, BKEY_TYPE_strict_btree_checks) \ - x(inode, 8, BKEY_TYPE_strict_btree_checks) \ - x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \ - x(dirent, 10, BKEY_TYPE_strict_btree_checks) \ - x(xattr, 11, BKEY_TYPE_strict_btree_checks) \ - x(alloc, 12, BKEY_TYPE_strict_btree_checks) \ - x(quota, 13, BKEY_TYPE_strict_btree_checks) \ - x(stripe, 14, BKEY_TYPE_strict_btree_checks) \ - x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \ - x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \ - x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \ - x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \ - x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \ - x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \ - x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \ - x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \ - x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \ - x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \ - x(set, 25, 0) \ - x(lru, 26, BKEY_TYPE_strict_btree_checks) \ - x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \ - x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \ - x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \ - x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \ - x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \ - x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \ - x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \ - x(accounting, 34, BKEY_TYPE_strict_btree_checks) \ - x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks) - -enum bch_bkey_type { -#define x(name, nr, ...) 
KEY_TYPE_##name = nr,
- BCH_BKEY_TYPES()
-#undef x
- KEY_TYPE_MAX,
-};
-
-struct bch_deleted {
- struct bch_val v;
-};
-
-struct bch_whiteout {
- struct bch_val v;
-};
-
-struct bch_error {
- struct bch_val v;
-};
-
-struct bch_cookie {
- struct bch_val v;
- __le64 cookie;
-};
-
-struct bch_hash_whiteout {
- struct bch_val v;
-};
-
-struct bch_set {
- struct bch_val v;
-};
-
-/* 128 bits, sufficient for cryptographic MACs: */
-struct bch_csum {
- __le64 lo;
- __le64 hi;
-} __packed __aligned(8);
-
-struct bch_backpointer {
- struct bch_val v;
- __u8 btree_id;
- __u8 level;
- __u8 data_type;
- __u8 bucket_gen;
- __u32 pad;
- __u32 bucket_len;
- struct bpos pos;
-} __packed __aligned(8);
-
-/* Optional/variable size superblock sections: */
-
-struct bch_sb_field {
- __u64 _data[0];
- __le32 u64s;
- __le32 type;
-};
-
-#define BCH_SB_FIELDS() \
- x(journal, 0) \
- x(members_v1, 1) \
- x(crypt, 2) \
- x(replicas_v0, 3) \
- x(quota, 4) \
- x(disk_groups, 5) \
- x(clean, 6) \
- x(replicas, 7) \
- x(journal_seq_blacklist, 8) \
- x(journal_v2, 9) \
- x(counters, 10) \
- x(members_v2, 11) \
- x(errors, 12) \
- x(ext, 13) \
- x(downgrade, 14) \
- x(recovery_passes, 15)
-
-#include "alloc_background_format.h"
-#include "dirent_format.h"
-#include "disk_accounting_format.h"
-#include "disk_groups_format.h"
-#include "extents_format.h"
-#include "ec_format.h"
-#include "inode_format.h"
-#include "journal_seq_blacklist_format.h"
-#include "logged_ops_format.h"
-#include "lru_format.h"
-#include "quota_format.h"
-#include "recovery_passes_format.h"
-#include "reflink_format.h"
-#include "replicas_format.h"
-#include "snapshot_format.h"
-#include "subvolume_format.h"
-#include "sb-counters_format.h"
-#include "sb-downgrade_format.h"
-#include "sb-errors_format.h"
-#include "sb-members_format.h"
-#include "xattr_format.h"
-
-enum bch_sb_field_type {
-#define x(f, nr) BCH_SB_FIELD_##f = nr,
- BCH_SB_FIELDS()
-#undef x
- BCH_SB_FIELD_NR
-};
-
-/*
- * Most superblock fields are replicated in all devices' superblocks - a few are
- * not:
- */
-#define BCH_SINGLE_DEVICE_SB_FIELDS \
- ((1U << BCH_SB_FIELD_journal)| \
- (1U << BCH_SB_FIELD_journal_v2))
-
-/* BCH_SB_FIELD_journal: */
-
-struct bch_sb_field_journal {
- struct bch_sb_field field;
- __le64 buckets[];
-};
-
-struct bch_sb_field_journal_v2 {
- struct bch_sb_field field;
-
- struct bch_sb_field_journal_v2_entry {
- __le64 start;
- __le64 nr;
- } d[];
-};
-
-/* BCH_SB_FIELD_crypt: */
-
-struct nonce {
- __le32 d[4];
-};
-
-struct bch_key {
- __le64 key[4];
-};
-
-#define BCH_KEY_MAGIC \
- (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \
- ((__u64) 'h' << 16)|((__u64) '*' << 24)| \
- ((__u64) '*' << 32)|((__u64) 'k' << 40)| \
- ((__u64) 'e' << 48)|((__u64) 'y' << 56))
-
-struct bch_encrypted_key {
- __le64 magic;
- struct bch_key key;
-};
-
-/*
- * If this field is present in the superblock, it stores an encryption key which
- * is used to encrypt all other data/metadata. The key will normally be encrypted
- * with the key userspace provides, but if encryption has been turned off we'll
- * just store the master key unencrypted in the superblock so we can access the
- * previously encrypted data.
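 *
 * Illustrative sketch (editor's addition, not part of the original header):
 * after decrypting the stored key with the user-provided key, a correct
 * passphrase can presumably be recognized by checking the magic:
 *
 *   struct bch_encrypted_key k; /* decrypted in place beforehand */
 *   bool key_ok = le64_to_cpu(k.magic) == BCH_KEY_MAGIC;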
- */
-struct bch_sb_field_crypt {
- struct bch_sb_field field;
-
- __le64 flags;
- __le64 kdf_flags;
- struct bch_encrypted_key key;
-};
-
-LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4);
-
-enum bch_kdf_types {
- BCH_KDF_SCRYPT = 0,
- BCH_KDF_NR = 1,
-};
-
-/* stored as base 2 log of scrypt params: */
-LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
-LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
-LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
-
-/*
- * On clean shutdown, store btree roots and current journal sequence number in
- * the superblock:
- */
-struct jset_entry {
- __le16 u64s;
- __u8 btree_id;
- __u8 level;
- __u8 type; /* designates what this jset holds */
- __u8 pad[3];
-
- struct bkey_i start[0];
- __u64 _data[];
-};
-
-struct bch_sb_field_clean {
- struct bch_sb_field field;
-
- __le32 flags;
- __le16 _read_clock; /* no longer used */
- __le16 _write_clock;
- __le64 journal_seq;
-
- struct jset_entry start[0];
- __u64 _data[];
-};
-
-struct bch_sb_field_ext {
- struct bch_sb_field field;
- __le64 recovery_passes_required[2];
- __le64 errors_silent[8];
- __le64 btrees_lost_data;
-};
-
-/* Superblock: */
-
-/*
- * New versioning scheme:
- * One common version number for all on disk data structures - superblock, btree
- * nodes, journal entries
- */
-#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10))
-#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10)))
-#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0)
-
-/*
- * field 1: version name
- * field 2: BCH_VERSION(major, minor)
- * field 3: recovery passes required on upgrade
- */
-#define BCH_METADATA_VERSIONS() \
- x(bkey_renumber, BCH_VERSION(0, 10)) \
- x(inode_btree_change, BCH_VERSION(0, 11)) \
- x(snapshot, BCH_VERSION(0, 12)) \
- x(inode_backpointers, BCH_VERSION(0, 13)) \
- x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \
- x(snapshot_2, BCH_VERSION(0, 15)) \
- x(reflink_p_fix, BCH_VERSION(0, 16)) \
- x(subvol_dirent, BCH_VERSION(0, 17)) \
- x(inode_v2, BCH_VERSION(0, 18)) \
- x(freespace, BCH_VERSION(0, 19)) \
- x(alloc_v4, BCH_VERSION(0, 20)) \
- x(new_data_types, BCH_VERSION(0, 21)) \
- x(backpointers, BCH_VERSION(0, 22)) \
- x(inode_v3, BCH_VERSION(0, 23)) \
- x(unwritten_extents, BCH_VERSION(0, 24)) \
- x(bucket_gens, BCH_VERSION(0, 25)) \
- x(lru_v2, BCH_VERSION(0, 26)) \
- x(fragmentation_lru, BCH_VERSION(0, 27)) \
- x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \
- x(snapshot_trees, BCH_VERSION(0, 29)) \
- x(major_minor, BCH_VERSION(1, 0)) \
- x(snapshot_skiplists, BCH_VERSION(1, 1)) \
- x(deleted_inodes, BCH_VERSION(1, 2)) \
- x(rebalance_work, BCH_VERSION(1, 3)) \
- x(member_seq, BCH_VERSION(1, 4)) \
- x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
- x(btree_subvolume_children, BCH_VERSION(1, 6)) \
- x(mi_btree_bitmap, BCH_VERSION(1, 7)) \
- x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
- x(disk_accounting_v2, BCH_VERSION(1, 9)) \
- x(disk_accounting_v3, BCH_VERSION(1, 10)) \
- x(disk_accounting_inum, BCH_VERSION(1, 11)) \
- x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
- x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \
- x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \
- x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \
- x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \
- x(inode_depth, BCH_VERSION(1, 17)) \
- x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
- x(autofix_errors, BCH_VERSION(1, 19)) \
- x(directory_size, BCH_VERSION(1, 20)) \
- x(cached_backpointers, BCH_VERSION(1, 21)) \
- x(stripe_backpointers, BCH_VERSION(1, 22)) \
- x(stripe_lru, BCH_VERSION(1, 23)) \
- x(casefolding, BCH_VERSION(1, 24)) \
- x(extent_flags, BCH_VERSION(1, 25)) \
- x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \
- x(fast_device_removal, BCH_VERSION(1, 27)) \
- x(inode_has_case_insensitive, BCH_VERSION(1, 28))
-
-enum bcachefs_metadata_version {
- bcachefs_metadata_version_min = 9,
-#define x(t, n) bcachefs_metadata_version_##t = n,
- BCH_METADATA_VERSIONS()
-#undef x
- bcachefs_metadata_version_max
-};
-
-static const __maybe_unused
-unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
-
-#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
-
-#define BCH_SB_SECTOR 8
-
-#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */
-
-struct bch_sb_layout {
- __uuid_t magic; /* bcachefs superblock UUID */
- __u8 layout_type;
- __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */
- __u8 nr_superblocks;
- __u8 pad[5];
- __le64 sb_offset[61];
-} __packed __aligned(8);
-
-#define BCH_SB_LAYOUT_SECTOR 7
-
-/*
- * @offset - sector where this sb was written
- * @version - on disk format version
- * @version_min - Oldest metadata version this filesystem contains; so we can
- * safely drop compatibility code and refuse to mount filesystems
- * we'd need it for
- * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC)
- * @seq - identifies most recent superblock, incremented each time
- * superblock is written
- * @uuid - used for generating various magic numbers and identifying
- * member devices, never changes
- * @user_uuid - user visible UUID, may be changed
- * @label - filesystem label
- * @features - enabled incompatible features
- */
-struct bch_sb {
- struct bch_csum csum;
- __le16 version;
- __le16 version_min;
- __le16 pad[2];
- __uuid_t magic;
- __uuid_t uuid;
- __uuid_t user_uuid;
- __u8 label[BCH_SB_LABEL_SIZE];
- __le64 offset;
- __le64 seq;
-
- __le16 block_size;
- __u8 dev_idx;
- __u8 nr_devices;
- __le32 u64s;
-
- __le64 time_base_lo;
- __le32 time_base_hi;
- __le32 time_precision;
-
- __le64 flags[7];
- __le64 write_time;
- __le64 features[2];
- __le64 compat[2];
-
- struct bch_sb_layout layout;
-
- struct bch_sb_field start[0];
- __le64 _data[];
-} __packed __aligned(8);
-
-/*
- * Flags:
- * BCH_SB_INITIALIZED - set on first mount
- * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect
- * behaviour of mount/recovery path:
- * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits
- * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80
- * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
- * DATA/META_CSUM_TYPE. Also indicates encryption
Also indicates encryption - * algorithm in use, if/when we get more than one - */ - -LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); - -LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); -LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); -LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); -LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); - -LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); - -LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); -LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); - -LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); -LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); - -LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); -LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); - -LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); -LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); -LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); -LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); - -LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); -LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); - -LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); -LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS, - struct bch_sb, flags[0], 63, 64); - -LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); -LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8); -LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); - -LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); -LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); - -/* - * Max size of an extent that may require bouncing to read or write - * (checksummed, compressed): 64k - */ -LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, - struct bch_sb, flags[1], 14, 20); - -LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); -LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); - -LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); -LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); -LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); - -LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO, - struct bch_sb, flags[2], 0, 4); -LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); - -LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); -LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); -LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); -LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); -LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); -LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); -LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64); -LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); -LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); -LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); -LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); -LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56); - -LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60); 
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, - struct bch_sb, flags[4], 60, 64); - -LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, - struct bch_sb, flags[5], 0, 16); -LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, - struct bch_sb, flags[5], 16, 32); -LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); -LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, - struct bch_sb, flags[5], 48, 64); -LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); -LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); -LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); -LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22); -LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23); -LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24); - -static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) -{ - return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4); -} - -static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) -{ - SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v); - SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4); -} - -static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb) -{ - return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) | - (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4); -} - -static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) -{ - SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v); - SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4); -} - -/* - * Features: - * - * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist - * reflink: gates KEY_TYPE_reflink - * inline_data: gates KEY_TYPE_inline_data - * new_siphash: gates BCH_STR_HASH_siphash - * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE - */ -#define BCH_SB_FEATURES() \ - x(lz4, 0) \ - x(gzip, 1) \ - x(zstd, 2) \ - x(atomic_nlink, 3) \ - x(ec, 4) \ - x(journal_seq_blacklist_v3, 5) \ - x(reflink, 6) \ - x(new_siphash, 7) \ - x(inline_data, 8) \ - x(new_extent_overwrite, 9) \ - x(incompressible, 10) \ - x(btree_ptr_v2, 11) \ - x(extents_above_btree_updates, 12) \ - x(btree_updates_journalled, 13) \ - x(reflink_inline_data, 14) \ - x(new_varint, 15) \ - x(journal_no_flush, 16) \ - x(alloc_v2, 17) \ - x(extents_across_btree_nodes, 18) \ - x(incompat_version_field, 19) \ - x(casefolding, 20) \ - x(no_alloc_info, 21) \ - x(small_image, 22) - -#define BCH_SB_FEATURES_ALWAYS \ - (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ - BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\ - BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\ - BIT_ULL(BCH_FEATURE_alloc_v2)|\ - BIT_ULL(BCH_FEATURE_extents_across_btree_nodes)) - -#define BCH_SB_FEATURES_ALL \ - (BCH_SB_FEATURES_ALWAYS| \ - BIT_ULL(BCH_FEATURE_new_siphash)| \ - BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ - BIT_ULL(BCH_FEATURE_new_varint)| \ - BIT_ULL(BCH_FEATURE_journal_no_flush)| \ - BIT_ULL(BCH_FEATURE_incompat_version_field)) - -enum bch_sb_feature { -#define x(f, n) BCH_FEATURE_##f, - BCH_SB_FEATURES() -#undef x - BCH_FEATURE_NR, -}; - -#define BCH_SB_COMPAT() \ - x(alloc_info, 0) \ - x(alloc_metadata, 1) \ - x(extents_above_btree_updates_done, 2) \ - x(bformat_overflow_done, 3) - -enum bch_sb_compat { -#define x(f, n) BCH_COMPAT_##f, - BCH_SB_COMPAT() -#undef x - BCH_COMPAT_NR, -}; - -/* options: */ - -#define BCH_VERSION_UPGRADE_OPTS() \ - x(compatible, 0) \ - x(incompatible, 1) \ - x(none, 2) - -enum bch_version_upgrade_opts { -#define x(t, n) 
BCH_VERSION_UPGRADE_##t = n, - BCH_VERSION_UPGRADE_OPTS() -#undef x -}; - -#define BCH_REPLICAS_MAX 4U - -#define BCH_BKEY_PTRS_MAX 16U - -#define BCH_ERROR_ACTIONS() \ - x(continue, 0) \ - x(fix_safe, 1) \ - x(panic, 2) \ - x(ro, 3) - -enum bch_error_actions { -#define x(t, n) BCH_ON_ERROR_##t = n, - BCH_ERROR_ACTIONS() -#undef x - BCH_ON_ERROR_NR -}; - -#define BCH_DEGRADED_ACTIONS() \ - x(ask, 0) \ - x(yes, 1) \ - x(very, 2) \ - x(no, 3) - -enum bch_degraded_actions { -#define x(t, n) BCH_DEGRADED_##t = n, - BCH_DEGRADED_ACTIONS() -#undef x - BCH_DEGRADED_ACTIONS_NR -}; - -#define BCH_STR_HASH_TYPES() \ - x(crc32c, 0) \ - x(crc64, 1) \ - x(siphash_old, 2) \ - x(siphash, 3) - -enum bch_str_hash_type { -#define x(t, n) BCH_STR_HASH_##t = n, - BCH_STR_HASH_TYPES() -#undef x - BCH_STR_HASH_NR -}; - -#define BCH_STR_HASH_OPTS() \ - x(crc32c, 0) \ - x(crc64, 1) \ - x(siphash, 2) - -enum bch_str_hash_opts { -#define x(t, n) BCH_STR_HASH_OPT_##t = n, - BCH_STR_HASH_OPTS() -#undef x - BCH_STR_HASH_OPT_NR -}; - -#define BCH_CSUM_TYPES() \ - x(none, 0) \ - x(crc32c_nonzero, 1) \ - x(crc64_nonzero, 2) \ - x(chacha20_poly1305_80, 3) \ - x(chacha20_poly1305_128, 4) \ - x(crc32c, 5) \ - x(crc64, 6) \ - x(xxhash, 7) - -enum bch_csum_type { -#define x(t, n) BCH_CSUM_##t = n, - BCH_CSUM_TYPES() -#undef x - BCH_CSUM_NR -}; - -static const __maybe_unused unsigned bch_crc_bytes[] = { - [BCH_CSUM_none] = 0, - [BCH_CSUM_crc32c_nonzero] = 4, - [BCH_CSUM_crc32c] = 4, - [BCH_CSUM_crc64_nonzero] = 8, - [BCH_CSUM_crc64] = 8, - [BCH_CSUM_xxhash] = 8, - [BCH_CSUM_chacha20_poly1305_80] = 10, - [BCH_CSUM_chacha20_poly1305_128] = 16, -}; - -static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) -{ - switch (type) { - case BCH_CSUM_chacha20_poly1305_80: - case BCH_CSUM_chacha20_poly1305_128: - return true; - default: - return false; - } -} - -#define BCH_CSUM_OPTS() \ - x(none, 0) \ - x(crc32c, 1) \ - x(crc64, 2) \ - x(xxhash, 3) - -enum bch_csum_opt { -#define x(t, n) BCH_CSUM_OPT_##t = n, - BCH_CSUM_OPTS() -#undef x - BCH_CSUM_OPT_NR -}; - -#define BCH_COMPRESSION_TYPES() \ - x(none, 0) \ - x(lz4_old, 1) \ - x(gzip, 2) \ - x(lz4, 3) \ - x(zstd, 4) \ - x(incompressible, 5) - -enum bch_compression_type { -#define x(t, n) BCH_COMPRESSION_TYPE_##t = n, - BCH_COMPRESSION_TYPES() -#undef x - BCH_COMPRESSION_TYPE_NR -}; - -#define BCH_COMPRESSION_OPTS() \ - x(none, 0) \ - x(lz4, 1) \ - x(gzip, 2) \ - x(zstd, 3) - -enum bch_compression_opts { -#define x(t, n) BCH_COMPRESSION_OPT_##t = n, - BCH_COMPRESSION_OPTS() -#undef x - BCH_COMPRESSION_OPT_NR -}; - -/* - * Magic numbers - * - * The various other data structures have their own magic numbers, which are - * xored with the first part of the cache set's UUID - */ - -#define BCACHE_MAGIC \ - UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \ - 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) -#define BCHFS_MAGIC \ - UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \ - 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) - -#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC - -#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) -#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) - -static inline __le64 __bch2_sb_magic(struct bch_sb *sb) -{ - __le64 ret; - - memcpy(&ret, &sb->uuid, sizeof(ret)); - return ret; -} - -static inline __u64 __jset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); -} - -static inline __u64 __bset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); -} - -/* Journal */ - -#define 
JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
-
-#define BCH_JSET_ENTRY_TYPES() \
- x(btree_keys, 0) \
- x(btree_root, 1) \
- x(prio_ptrs, 2) \
- x(blacklist, 3) \
- x(blacklist_v2, 4) \
- x(usage, 5) \
- x(data_usage, 6) \
- x(clock, 7) \
- x(dev_usage, 8) \
- x(log, 9) \
- x(overwrite, 10) \
- x(write_buffer_keys, 11) \
- x(datetime, 12) \
- x(log_bkey, 13)
-
-enum bch_jset_entry_type {
-#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
- BCH_JSET_ENTRY_TYPES()
-#undef x
- BCH_JSET_ENTRY_NR
-};
-
-static inline bool jset_entry_is_key(struct jset_entry *e)
-{
- switch (e->type) {
- case BCH_JSET_ENTRY_btree_keys:
- case BCH_JSET_ENTRY_btree_root:
- case BCH_JSET_ENTRY_write_buffer_keys:
- return true;
- }
-
- return false;
-}
-
-/*
- * Journal sequence numbers can be blacklisted: bsets record the max sequence
- * number of all the journal entries they contain updates for, so that on
- * recovery we can ignore those bsets that contain index updates newer than what
- * made it into the journal.
- *
- * This means that we can't reuse that journal_seq - we have to skip it, and
- * then record that we skipped it so that the next time we crash and recover we
- * don't think there was a missing journal entry.
- */
-struct jset_entry_blacklist {
- struct jset_entry entry;
- __le64 seq;
-};
-
-struct jset_entry_blacklist_v2 {
- struct jset_entry entry;
- __le64 start;
- __le64 end;
-};
-
-#define BCH_FS_USAGE_TYPES() \
- x(reserved, 0) \
- x(inodes, 1) \
- x(key_version, 2)
-
-enum bch_fs_usage_type {
-#define x(f, nr) BCH_FS_USAGE_##f = nr,
- BCH_FS_USAGE_TYPES()
-#undef x
- BCH_FS_USAGE_NR
-};
-
-struct jset_entry_usage {
- struct jset_entry entry;
- __le64 v;
-} __packed;
-
-struct jset_entry_data_usage {
- struct jset_entry entry;
- __le64 v;
- struct bch_replicas_entry_v1 r;
-} __packed;
-
-struct jset_entry_clock {
- struct jset_entry entry;
- __u8 rw;
- __u8 pad[7];
- __le64 time;
-} __packed;
-
-struct jset_entry_dev_usage_type {
- __le64 buckets;
- __le64 sectors;
- __le64 fragmented;
-} __packed;
-
-struct jset_entry_dev_usage {
- struct jset_entry entry;
- __le32 dev;
- __u32 pad;
-
- __le64 _buckets_ec; /* No longer used */
- __le64 _buckets_unavailable; /* No longer used */
-
- struct jset_entry_dev_usage_type d[];
-};
-
-static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
-{
- return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
- sizeof(struct jset_entry_dev_usage_type);
-}
-
-struct jset_entry_log {
- struct jset_entry entry;
- u8 d[];
-} __packed __aligned(8);
-
-static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l)
-{
- unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d);
-
- while (b && !l->d[b - 1])
- --b;
- return b;
-}
-
-struct jset_entry_datetime {
- struct jset_entry entry;
- __le64 seconds;
-} __packed __aligned(8);
-
-/*
- * On disk format for a journal entry:
- * seq is monotonically increasing; every journal entry has its own unique
- * sequence number.
- *
- * last_seq is the oldest journal entry that still has keys the btree hasn't
- * flushed to disk yet.
- *
- * version is for on disk format changes.
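 *
 * Illustrative example (editor's addition, not part of the original header):
 * if the newest entry on disk has seq == 120 and last_seq == 100, then
 * entries 100..120 may still carry keys not yet flushed to the btree, so
 * recovery must start replay no later than sequence number 100.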
- */ -struct jset { - struct bch_csum csum; - - __le64 magic; - __le64 seq; - __le32 version; - __le32 flags; - - __le32 u64s; /* size of d[] in u64s */ - - __u8 encrypted_start[0]; - - __le16 _read_clock; /* no longer used */ - __le16 _write_clock; - - /* Sequence number of oldest dirty journal entry */ - __le64 last_seq; - - - struct jset_entry start[0]; - __u64 _data[]; -} __packed __aligned(8); - -LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); -LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); -LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); - -#define BCH_JOURNAL_BUCKETS_MIN 8 - -/* Btree: */ - -enum btree_id_flags { - BTREE_IS_extents = BIT(0), - BTREE_IS_snapshots = BIT(1), - BTREE_IS_snapshot_field = BIT(2), - BTREE_IS_data = BIT(3), - BTREE_IS_write_buffer = BIT(4), -}; - -#define BCH_BTREE_IDS() \ - x(extents, 0, \ - BTREE_IS_extents| \ - BTREE_IS_snapshots| \ - BTREE_IS_data, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_error)| \ - BIT_ULL(KEY_TYPE_cookie)| \ - BIT_ULL(KEY_TYPE_extent)| \ - BIT_ULL(KEY_TYPE_reservation)| \ - BIT_ULL(KEY_TYPE_reflink_p)| \ - BIT_ULL(KEY_TYPE_inline_data)) \ - x(inodes, 1, \ - BTREE_IS_snapshots, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_inode)| \ - BIT_ULL(KEY_TYPE_inode_v2)| \ - BIT_ULL(KEY_TYPE_inode_v3)| \ - BIT_ULL(KEY_TYPE_inode_generation)) \ - x(dirents, 2, \ - BTREE_IS_snapshots, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_hash_whiteout)| \ - BIT_ULL(KEY_TYPE_dirent)) \ - x(xattrs, 3, \ - BTREE_IS_snapshots, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_cookie)| \ - BIT_ULL(KEY_TYPE_hash_whiteout)| \ - BIT_ULL(KEY_TYPE_xattr)) \ - x(alloc, 4, 0, \ - BIT_ULL(KEY_TYPE_alloc)| \ - BIT_ULL(KEY_TYPE_alloc_v2)| \ - BIT_ULL(KEY_TYPE_alloc_v3)| \ - BIT_ULL(KEY_TYPE_alloc_v4)) \ - x(quotas, 5, 0, \ - BIT_ULL(KEY_TYPE_quota)) \ - x(stripes, 6, 0, \ - BIT_ULL(KEY_TYPE_stripe)) \ - x(reflink, 7, \ - BTREE_IS_extents| \ - BTREE_IS_data, \ - BIT_ULL(KEY_TYPE_reflink_v)| \ - BIT_ULL(KEY_TYPE_indirect_inline_data)| \ - BIT_ULL(KEY_TYPE_error)) \ - x(subvolumes, 8, 0, \ - BIT_ULL(KEY_TYPE_subvolume)) \ - x(snapshots, 9, 0, \ - BIT_ULL(KEY_TYPE_snapshot)) \ - x(lru, 10, \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_set)) \ - x(freespace, 11, \ - BTREE_IS_extents, \ - BIT_ULL(KEY_TYPE_set)) \ - x(need_discard, 12, 0, \ - BIT_ULL(KEY_TYPE_set)) \ - x(backpointers, 13, \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_backpointer)) \ - x(bucket_gens, 14, 0, \ - BIT_ULL(KEY_TYPE_bucket_gens)) \ - x(snapshot_trees, 15, 0, \ - BIT_ULL(KEY_TYPE_snapshot_tree)) \ - x(deleted_inodes, 16, \ - BTREE_IS_snapshot_field| \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_set)) \ - x(logged_ops, 17, 0, \ - BIT_ULL(KEY_TYPE_logged_op_truncate)| \ - BIT_ULL(KEY_TYPE_logged_op_finsert)| \ - BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \ - x(rebalance_work, 18, \ - BTREE_IS_snapshot_field| \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ - x(subvolume_children, 19, 0, \ - BIT_ULL(KEY_TYPE_set)) \ - x(accounting, 20, \ - BTREE_IS_snapshot_field| \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_accounting)) \ - -enum btree_id { -#define x(name, nr, ...) 
BTREE_ID_##name = nr, - BCH_BTREE_IDS() -#undef x - BTREE_ID_NR -}; - -/* - * Maximum number of btrees that we will _ever_ have under the current scheme, - * where we refer to them with 64 bit bitfields - and we also need a bit for - * the interior btree node type: - */ -#define BTREE_ID_NR_MAX 63 - -static inline bool btree_id_is_alloc(enum btree_id id) -{ - switch (id) { - case BTREE_ID_alloc: - case BTREE_ID_backpointers: - case BTREE_ID_need_discard: - case BTREE_ID_freespace: - case BTREE_ID_bucket_gens: - case BTREE_ID_lru: - case BTREE_ID_accounting: - return true; - default: - return false; - } -} - -#define BTREE_MAX_DEPTH 4U - -/* Btree nodes */ - -/* - * Btree nodes - * - * On disk a btree node is a list/log of these; within each set the keys are - * sorted - */ -struct bset { - __le64 seq; - - /* - * Highest journal entry this bset contains keys for. - * If on recovery we don't see that journal entry, this bset is ignored: - * this allows us to preserve the order of all index updates after a - * crash, since the journal records a total order of all index updates - * and anything that didn't make it to the journal doesn't get used. - */ - __le64 journal_seq; - - __le32 flags; - __le16 version; - __le16 u64s; /* count of d[] in u64s */ - - struct bkey_packed start[0]; - __u64 _data[]; -} __packed __aligned(8); - -LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); - -LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); -LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, - struct bset, flags, 5, 6); - -/* Sector offset within the btree node: */ -LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32); - -struct btree_node { - struct bch_csum csum; - __le64 magic; - - /* this flags field is encrypted, unlike bset->flags: */ - __le64 flags; - - /* Closed interval: */ - struct bpos min_key; - struct bpos max_key; - struct bch_extent_ptr _ptr; /* not used anymore */ - struct bkey_format format; - - union { - struct bset keys; - struct { - __u8 pad[22]; - __le16 u64s; - __u64 _data[0]; - - }; - }; -} __packed __aligned(8); - -LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4); -LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); -LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, - struct btree_node, flags, 8, 9); -LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25); -/* 25-32 unused */ -LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); - -static inline __u64 BTREE_NODE_ID(struct btree_node *n) -{ - return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4); -} - -static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v) -{ - SET_BTREE_NODE_ID_LO(n, v); - SET_BTREE_NODE_ID_HI(n, v >> 4); -} - -struct btree_node_entry { - struct bch_csum csum; - - union { - struct bset keys; - struct { - __u8 pad[22]; - __le16 u64s; - __u64 _data[0]; - }; - }; -} __packed __aligned(8); - -#endif /* _BCACHEFS_FORMAT_H */ diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h deleted file mode 100644 index 52594e925eb7..000000000000 --- a/fs/bcachefs/bcachefs_ioctl.h +++ /dev/null @@ -1,473 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IOCTL_H -#define _BCACHEFS_IOCTL_H - -#include <linux/uuid.h> -#include <asm/ioctl.h> -#include "bcachefs_format.h" -#include "bkey_types.h" - -/* - * Flags common to multiple ioctls: - */ -#define BCH_FORCE_IF_DATA_LOST (1 << 0) -#define BCH_FORCE_IF_METADATA_LOST (1 << 1) -#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) -#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) - -#define 
BCH_FORCE_IF_LOST \
- (BCH_FORCE_IF_DATA_LOST| \
- BCH_FORCE_IF_METADATA_LOST)
-#define BCH_FORCE_IF_DEGRADED \
- (BCH_FORCE_IF_DATA_DEGRADED| \
- BCH_FORCE_IF_METADATA_DEGRADED)
-
-/*
- * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
- * (e.g. /dev/sda1); if set, the dev field is the device's index within the
- * filesystem:
- */
-#define BCH_BY_INDEX (1 << 4)
-
-/*
- * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
- * wide superblock:
- */
-#define BCH_READ_DEV (1 << 5)
-
-/* global control dev: */
-
-/* These are currently broken, and probably unnecessary: */
-#if 0
-#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
-#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
-
-struct bch_ioctl_assemble {
- __u32 flags;
- __u32 nr_devs;
- __u64 pad;
- __u64 devs[];
-};
-
-struct bch_ioctl_incremental {
- __u32 flags;
- __u64 pad;
- __u64 dev;
-};
-#endif
-
-/* filesystem ioctls: */
-
-#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
-
-/* These only make sense when we also have incremental assembly */
-#if 0
-#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
-#define BCH_IOCTL_STOP _IO(0xbc, 3)
-#endif
-
-#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
-#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
-#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
-#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
-#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
-#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
-#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
-#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc, 15, struct bch_ioctl_disk_resize_journal)
-
-#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
-#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
-
-#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
-
-#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
-#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
-#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
-#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
-
-/* ioctls below act on a particular file, not the filesystem as a whole: */
-
-#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
-
-/*
- * BCH_IOCTL_QUERY_UUID: get filesystem UUID
- *
- * Returns user visible UUID, not internal UUID (which may not ever be changed);
- * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
- * this UUID.
- */
-struct bch_ioctl_query_uuid {
- __uuid_t uuid;
-};
-
-#if 0
-struct bch_ioctl_start {
- __u32 flags;
- __u32 pad;
-};
-#endif
-
-/*
- * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
- *
- * The specified device must not be open or in use. On success, the new device
- * will be an online member of the filesystem just like any other member.
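 *
 * Illustrative sketch (editor's addition, not part of the original header):
 * with BCH_BY_INDEX clear, @dev carries a pointer to a pathname, so adding a
 * device (assuming fs_fd is an open fd somewhere on the filesystem, and with
 * a hypothetical device path) might look like:
 *
 *   struct bch_ioctl_disk i = {
 *           .flags = 0,
 *           .dev = (__u64)(unsigned long) "/dev/sdb",
 *   };
 *   ioctl(fs_fd, BCH_IOCTL_DISK_ADD, &i);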
- *
- * The device must first be prepared by userspace by formatting with a bcachefs
- * superblock, which is only used for passing in superblock options/parameters
- * for that device (in struct bch_member). The new device's superblock should
- * not claim to be a member of any existing filesystem - UUIDs on it will be
- * ignored.
- */
-
-/*
- * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
- *
- * Any data present on @dev will be permanently deleted, and @dev will be
- * removed from its slot in the filesystem's list of member devices. The device
- * may be either offline or online.
- *
- * Will fail if removing @dev would leave us with insufficient read write devices
- * or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are
- * set.
- */
-
-/*
- * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
- * but is not open (e.g. because we started in degraded mode), bring it online
- *
- * all existing data on @dev will be available once the device is online,
- * exactly as if @dev was present when the filesystem was first mounted
- */
-
-/*
- * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
- * block device, without removing it from the filesystem (so it can be brought
- * back online later)
- *
- * Data present on @dev will be unavailable while @dev is offline (unless
- * replicated), but will still be intact and untouched if @dev is brought back
- * online
- *
- * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
- * leave us with insufficient read write devices or degraded/unavailable data,
- * unless the appropriate BCH_FORCE_IF_* flags are set.
- */
-
-struct bch_ioctl_disk {
- __u32 flags;
- __u32 pad;
- __u64 dev;
-};
-
-/*
- * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
- *
- * @new_state - one of the bch_member_state states (rw, ro, failed,
- * spare)
- *
- * Will refuse to change member state if we would then have insufficient devices
- * to write to, or if it would result in degraded data (when @new_state is
- * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
- */
-struct bch_ioctl_disk_set_state {
- __u32 flags;
- __u8 new_state;
- __u8 pad[3];
- __u64 dev;
-};
-
-#define BCH_DATA_OPS() \
- x(scrub, 0) \
- x(rereplicate, 1) \
- x(migrate, 2) \
- x(rewrite_old_nodes, 3) \
- x(drop_extra_replicas, 4)
-
-enum bch_data_ops {
-#define x(t, n) BCH_DATA_OP_##t = n,
- BCH_DATA_OPS()
-#undef x
- BCH_DATA_OP_NR
-};
-
-/*
- * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
- * scrub, rereplicate, migrate).
- *
- * This ioctl kicks off a job in the background, and returns a file descriptor.
- * Reading from the file descriptor returns a struct bch_ioctl_data_event,
- * indicating current progress, and closing the file descriptor will stop the
- * job. The file descriptor is O_CLOEXEC.
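 *
 * Illustrative sketch (editor's addition, not part of the original header),
 * assuming fs_fd is an open fd on the filesystem and d is a populated
 * struct bch_ioctl_data:
 *
 *   int job_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &d);
 *   struct bch_ioctl_data_event e;
 *
 *   while (read(job_fd, &e, sizeof(e)) == sizeof(e) &&
 *          e.type == BCH_DATA_EVENT_PROGRESS &&
 *          e.p.data_type != DATA_PROGRESS_DATA_TYPE_done)
 *           ; /* e.p.sectors_done / e.p.sectors_total tracks progress */
 *   close(job_fd); /* stops the job if it is still running */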
- */ -struct bch_ioctl_data { - __u16 op; - __u8 start_btree; - __u8 end_btree; - __u32 flags; - - struct bpos start_pos; - struct bpos end_pos; - - union { - struct { - __u32 dev; - __u32 data_types; - } scrub; - struct { - __u32 dev; - __u32 pad; - } migrate; - struct { - __u64 pad[8]; - }; - }; -} __packed __aligned(8); - -enum bch_data_event { - BCH_DATA_EVENT_PROGRESS = 0, - /* XXX: add an event for reporting errors */ - BCH_DATA_EVENT_NR = 1, -}; - -enum data_progress_data_type_special { - DATA_PROGRESS_DATA_TYPE_phys = 254, - DATA_PROGRESS_DATA_TYPE_done = 255, -}; - -struct bch_ioctl_data_progress { - __u8 data_type; - __u8 btree_id; - __u8 pad[2]; - struct bpos pos; - - __u64 sectors_done; - __u64 sectors_total; - __u64 sectors_error_corrected; - __u64 sectors_error_uncorrected; -} __packed __aligned(8); - -enum bch_ioctl_data_event_ret { - BCH_IOCTL_DATA_EVENT_RET_done = 1, - BCH_IOCTL_DATA_EVENT_RET_device_offline = 2, -}; - -struct bch_ioctl_data_event { - __u8 type; - __u8 ret; - __u8 pad[6]; - union { - struct bch_ioctl_data_progress p; - __u64 pad2[15]; - }; -} __packed __aligned(8); - -struct bch_replicas_usage { - __u64 sectors; - struct bch_replicas_entry_v1 r; -} __packed; - -static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u) -{ - return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r); -} - -static inline struct bch_replicas_usage * -replicas_usage_next(struct bch_replicas_usage *u) -{ - return (void *) u + replicas_usage_bytes(u); -} - -/* Obsolete */ -/* - * BCH_IOCTL_FS_USAGE: query filesystem disk space usage - * - * Returns disk space usage broken out by data type, number of replicas, and - * by component device - * - * @replica_entries_bytes - size, in bytes, allocated for replica usage entries - * - * On success, @replica_entries_bytes will be changed to indicate the number of - * bytes actually used. - * - * Returns -ERANGE if @replica_entries_bytes was too small - */ -struct bch_ioctl_fs_usage { - __u64 capacity; - __u64 used; - __u64 online_reserved; - __u64 persistent_reserved[BCH_REPLICAS_MAX]; - - __u32 replica_entries_bytes; - __u32 pad; - - struct bch_replicas_usage replicas[]; -}; - -/* Obsolete */ -/* - * BCH_IOCTL_DEV_USAGE: query device disk space usage - * - * Returns disk space usage broken out by data type - both by buckets and - * sectors. 
- */
-struct bch_ioctl_dev_usage {
- __u64 dev;
- __u32 flags;
- __u8 state;
- __u8 pad[7];
-
- __u32 bucket_size;
- __u64 nr_buckets;
-
- __u64 buckets_ec;
-
- struct bch_ioctl_dev_usage_type {
- __u64 buckets;
- __u64 sectors;
- __u64 fragmented;
- } d[10];
-};
-
-/* Obsolete */
-struct bch_ioctl_dev_usage_v2 {
- __u64 dev;
- __u32 flags;
- __u8 state;
- __u8 nr_data_types;
- __u8 pad[6];
-
- __u32 bucket_size;
- __u64 nr_buckets;
-
- struct bch_ioctl_dev_usage_type d[];
-};
-
-/*
- * BCH_IOCTL_READ_SUPER: read filesystem superblock
- *
- * Equivalent to reading the superblock directly from the block device, except
- * avoids racing with the kernel writing the superblock or having to figure out
- * which block device to read
- *
- * @sb - buffer to read into
- * @size - size of userspace allocated buffer
- * @dev - device to read superblock for, if BCH_READ_DEV flag is
- * specified
- *
- * Returns -ERANGE if buffer provided is too small
- */
-struct bch_ioctl_read_super {
- __u32 flags;
- __u32 pad;
- __u64 dev;
- __u64 size;
- __u64 sb;
-};
-
-/*
- * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
- * determine if disk is an (online) member - if so, returns device's index
- *
- * Returns -ENOENT if not found
- */
-struct bch_ioctl_disk_get_idx {
- __u64 dev;
-};
-
-/*
- * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
- *
- * @dev - member to resize
- * @nbuckets - new number of buckets
- */
-struct bch_ioctl_disk_resize {
- __u32 flags;
- __u32 pad;
- __u64 dev;
- __u64 nbuckets;
-};
-
-/*
- * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
- *
- * @dev - member to resize
- * @nbuckets - new number of buckets
- */
-struct bch_ioctl_disk_resize_journal {
- __u32 flags;
- __u32 pad;
- __u64 dev;
- __u64 nbuckets;
-};
-
-struct bch_ioctl_subvolume {
- __u32 flags;
- __u32 dirfd;
- __u16 mode;
- __u16 pad[3];
- __u64 dst_ptr;
- __u64 src_ptr;
-};
-
-#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
-#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
-
-/*
- * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command,
- * but with the kernel's implementation of fsck:
- */
-struct bch_ioctl_fsck_offline {
- __u64 flags;
- __u64 opts; /* string */
- __u64 nr_devs;
- __u64 devs[] __counted_by(nr_devs);
-};
-
-/*
- * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command,
- * but with the kernel's implementation of fsck:
- */
-struct bch_ioctl_fsck_online {
- __u64 flags;
- __u64 opts; /* string */
-};
-
-/*
- * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting
- *
- * Returns disk space usage broken out by data type, number of replicas, and
- * by component device
- *
- * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
- *
- * On success, @replica_entries_bytes will be changed to indicate the number of
- * bytes actually used.
- * - * Returns -ERANGE if @replica_entries_bytes was too small - */ -struct bch_ioctl_query_accounting { - __u64 capacity; - __u64 used; - __u64 online_reserved; - - __u32 accounting_u64s; /* input parameter */ - __u32 accounting_types_mask; /* input parameter */ - - struct bkey_i_accounting accounting[]; -}; - -#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0) - -struct bch_ioctl_query_counters { - __u16 nr; - __u16 flags; - __u32 pad; - __u64 d[]; -}; - -#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c deleted file mode 100644 index ee823c640642..000000000000 --- a/fs/bcachefs/bkey.c +++ /dev/null @@ -1,1112 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey.h" -#include "bkey_cmp.h" -#include "bkey_methods.h" -#include "bset.h" -#include "util.h" - -const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; - -void bch2_bkey_packed_to_binary_text(struct printbuf *out, - const struct bkey_format *f, - const struct bkey_packed *k) -{ - const u64 *p = high_word(f, k); - unsigned word_bits = 64 - high_bit_offset; - unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset; - u64 v = *p & (~0ULL >> high_bit_offset); - - if (!nr_key_bits) { - prt_str(out, "(empty)"); - return; - } - - while (1) { - unsigned next_key_bits = nr_key_bits; - - if (nr_key_bits < 64) { - v >>= 64 - nr_key_bits; - next_key_bits = 0; - } else { - next_key_bits -= 64; - } - - bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits)); - - if (!next_key_bits) - break; - - prt_char(out, ' '); - - p = next_word(p); - v = *p; - word_bits = 64; - nr_key_bits = next_key_bits; - } -} - -static void __bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) -{ - struct bkey tmp; - - BUG_ON(bkeyp_val_u64s(format, packed) != - bkey_val_u64s(unpacked)); - - BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); - - tmp = __bch2_bkey_unpack_key(format, packed); - - if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n", - format->key_u64s, - format->bits_per_field[0], - format->bits_per_field[1], - format->bits_per_field[2], - format->bits_per_field[3], - format->bits_per_field[4]); - - prt_printf(&buf, "compiled unpack: "); - bch2_bkey_to_text(&buf, unpacked); - prt_newline(&buf); - - prt_printf(&buf, "c unpack: "); - bch2_bkey_to_text(&buf, &tmp); - prt_newline(&buf); - - prt_printf(&buf, "compiled unpack: "); - bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, - (struct bkey_packed *) unpacked); - prt_newline(&buf); - - prt_printf(&buf, "c unpack: "); - bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, - (struct bkey_packed *) &tmp); - prt_newline(&buf); - - panic("%s", buf.buf); - } -} - -static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) -{ - if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) - __bch2_bkey_pack_verify(packed, unpacked, format); -} - -struct pack_state { - const struct bkey_format *format; - unsigned bits; /* bits remaining in current word */ - u64 w; /* current word */ - u64 *p; /* pointer to next word */ -}; - -__always_inline -static struct pack_state pack_state_init(const struct bkey_format *format, - struct bkey_packed *k) -{ - u64 *p = high_word(format, k); - - return (struct pack_state) { - .format = format, - .bits 
= 64 - high_bit_offset, - .w = 0, - .p = p, - }; -} - -__always_inline -static void pack_state_finish(struct pack_state *state, - struct bkey_packed *k) -{ - EBUG_ON(state->p < k->_data); - EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s); - - *state->p = state->w; -} - -struct unpack_state { - const struct bkey_format *format; - unsigned bits; /* bits remaining in current word */ - u64 w; /* current word */ - const u64 *p; /* pointer to next word */ -}; - -__always_inline -static struct unpack_state unpack_state_init(const struct bkey_format *format, - const struct bkey_packed *k) -{ - const u64 *p = high_word(format, k); - - return (struct unpack_state) { - .format = format, - .bits = 64 - high_bit_offset, - .w = *p << high_bit_offset, - .p = p, - }; -} - -__always_inline -static u64 get_inc_field(struct unpack_state *state, unsigned field) -{ - unsigned bits = state->format->bits_per_field[field]; - u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); - - if (bits >= state->bits) { - v = state->w >> (64 - bits); - bits -= state->bits; - - state->p = next_word(state->p); - state->w = *state->p; - state->bits = 64; - } - - /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ - v |= (state->w >> 1) >> (63 - bits); - state->w <<= bits; - state->bits -= bits; - - return v + offset; -} - -__always_inline -static void __set_inc_field(struct pack_state *state, unsigned field, u64 v) -{ - unsigned bits = state->format->bits_per_field[field]; - - if (bits) { - if (bits > state->bits) { - bits -= state->bits; - /* avoid shift by 64 if bits is 64 - bits is never 0 here: */ - state->w |= (v >> 1) >> (bits - 1); - - *state->p = state->w; - state->p = next_word(state->p); - state->w = 0; - state->bits = 64; - } - - state->bits -= bits; - state->w |= v << state->bits; - } -} - -__always_inline -static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) -{ - unsigned bits = state->format->bits_per_field[field]; - u64 offset = le64_to_cpu(state->format->field_offset[field]); - - if (v < offset) - return false; - - v -= offset; - - if (fls64(v) > bits) - return false; - - __set_inc_field(state, field, v); - return true; -} - -/* - * Note: does NOT set out->format (we don't know what it should be here!) 
- * - * Also: doesn't work on extents - it doesn't preserve the invariant that - * if k is packed bkey_start_pos(k) will successfully pack - */ -static bool bch2_bkey_transform_key(const struct bkey_format *out_f, - struct bkey_packed *out, - const struct bkey_format *in_f, - const struct bkey_packed *in) -{ - struct pack_state out_s = pack_state_init(out_f, out); - struct unpack_state in_s = unpack_state_init(in_f, in); - u64 *w = out->_data; - unsigned i; - - *w = 0; - - for (i = 0; i < BKEY_NR_FIELDS; i++) - if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) - return false; - - /* Can't happen because the val would be too big to unpack: */ - EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); - - pack_state_finish(&out_s, out); - out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; - out->needs_whiteout = in->needs_whiteout; - out->type = in->type; - - return true; -} - -bool bch2_bkey_transform(const struct bkey_format *out_f, - struct bkey_packed *out, - const struct bkey_format *in_f, - const struct bkey_packed *in) -{ - if (!bch2_bkey_transform_key(out_f, out, in_f, in)) - return false; - - memcpy_u64s((u64 *) out + out_f->key_u64s, - (u64 *) in + in_f->key_u64s, - (in->u64s - in_f->key_u64s)); - return true; -} - -struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, - const struct bkey_packed *in) -{ - struct unpack_state state = unpack_state_init(format, in); - struct bkey out; - - EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); - EBUG_ON(in->u64s < format->key_u64s); - EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); - EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); - - out.u64s = BKEY_U64s + in->u64s - format->key_u64s; - out.format = KEY_FORMAT_CURRENT; - out.needs_whiteout = in->needs_whiteout; - out.type = in->type; - out.pad[0] = 0; - -#define x(id, field) out.field = get_inc_field(&state, id); - bkey_fields() -#undef x - - return out; -} - -#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -struct bpos __bkey_unpack_pos(const struct bkey_format *format, - const struct bkey_packed *in) -{ - struct unpack_state state = unpack_state_init(format, in); - struct bpos out; - - EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); - EBUG_ON(in->u64s < format->key_u64s); - EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); - - out.inode = get_inc_field(&state, BKEY_FIELD_INODE); - out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); - out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); - - return out; -} -#endif - -/** - * bch2_bkey_pack_key -- pack just the key, not the value - * @out: packed result - * @in: key to pack - * @format: format of packed result - * - * Returns: true on success, false on failure - */ -bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, - const struct bkey_format *format) -{ - struct pack_state state = pack_state_init(format, out); - u64 *w = out->_data; - - EBUG_ON((void *) in == (void *) out); - EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); - EBUG_ON(in->format != KEY_FORMAT_CURRENT); - - *w = 0; - -#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; - bkey_fields() -#undef x - pack_state_finish(&state, out); - out->u64s = format->key_u64s + in->u64s - BKEY_U64s; - out->format = KEY_FORMAT_LOCAL_BTREE; - out->needs_whiteout = in->needs_whiteout; - out->type = in->type; - - bch2_bkey_pack_verify(out, in, format); - return true; -} - -/** - * bch2_bkey_unpack -- unpack the key and the value - * @b: btree node of @src key (for packed format) - * @dst: unpacked result - * @src: packed 
input - */ -void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, - const struct bkey_packed *src) -{ - __bkey_unpack_key(b, &dst->k, src); - - memcpy_u64s(&dst->v, - bkeyp_val(&b->format, src), - bkeyp_val_u64s(&b->format, src)); -} - -/** - * bch2_bkey_pack -- pack the key and the value - * @dst: packed result - * @src: unpacked input - * @format: format of packed result - * - * Returns: true on success, false on failure - */ -bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src, - const struct bkey_format *format) -{ - struct bkey_packed tmp; - - if (!bch2_bkey_pack_key(&tmp, &src->k, format)) - return false; - - memmove_u64s((u64 *) dst + format->key_u64s, - &src->v, - bkey_val_u64s(&src->k)); - memcpy_u64s_small(dst, &tmp, format->key_u64s); - - return true; -} - -__always_inline -static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) -{ - unsigned bits = state->format->bits_per_field[field]; - u64 offset = le64_to_cpu(state->format->field_offset[field]); - bool ret = true; - - EBUG_ON(v < offset); - v -= offset; - - if (fls64(v) > bits) { - v = ~(~0ULL << bits); - ret = false; - } - - __set_inc_field(state, field, v); - return ret; -} - -static bool bkey_packed_successor(struct bkey_packed *out, - const struct btree *b, - struct bkey_packed k) -{ - const struct bkey_format *f = &b->format; - unsigned nr_key_bits = b->nr_key_bits; - unsigned first_bit, offset; - u64 *p; - - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); - - if (!nr_key_bits) - return false; - - *out = k; - - first_bit = high_bit_offset + nr_key_bits - 1; - p = nth_word(high_word(f, out), first_bit >> 6); - offset = 63 - (first_bit & 63); - - while (nr_key_bits) { - unsigned bits = min(64 - offset, nr_key_bits); - u64 mask = (~0ULL >> (64 - bits)) << offset; - - if ((*p & mask) != mask) { - *p += 1ULL << offset; - EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); - return true; - } - - *p &= ~mask; - p = prev_word(p); - nr_key_bits -= bits; - offset = 0; - } - - return false; -} - -static bool bkey_format_has_too_big_fields(const struct bkey_format *f) -{ - for (unsigned i = 0; i < f->nr_fields; i++) { - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 packed_max = f->bits_per_field[i] - ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) - : 0; - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (packed_max + field_offset < packed_max || - packed_max + field_offset > unpacked_max) - return true; - } - - return false; -} - -/* - * Returns a packed key that compares <= in - * - * This is used in bset_search_tree(), where we need a packed pos in order to be - * able to compare against the keys in the auxiliary search tree - and it's - * legal to use a packed pos that isn't equivalent to the original pos, - * _provided_ it compares <= to the original pos. 
- */ -enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, - struct bpos in, - const struct btree *b) -{ - const struct bkey_format *f = &b->format; - struct pack_state state = pack_state_init(f, out); - u64 *w = out->_data; - struct bpos orig = in; - bool exact = true; - unsigned i; - - /* - * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3 - * byte header, but pack_pos() won't if the len/version fields are big - * enough - we need to make sure to zero them out: - */ - for (i = 0; i < f->key_u64s; i++) - w[i] = 0; - - if (unlikely(in.snapshot < - le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { - if (!in.offset-- && - !in.inode--) - return BKEY_PACK_POS_FAIL; - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(in.offset < - le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { - if (!in.inode--) - return BKEY_PACK_POS_FAIL; - in.offset = KEY_OFFSET_MAX; - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(in.inode < - le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) - return BKEY_PACK_POS_FAIL; - - if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) { - in.offset = KEY_OFFSET_MAX; - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) { - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))) - exact = false; - - pack_state_finish(&state, out); - out->u64s = f->key_u64s; - out->format = KEY_FORMAT_LOCAL_BTREE; - out->type = KEY_TYPE_deleted; - - if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { - if (exact) { - BUG_ON(bkey_cmp_left_packed(b, out, &orig)); - } else { - struct bkey_packed successor; - - BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); - BUG_ON(bkey_packed_successor(&successor, b, *out) && - bkey_cmp_left_packed(b, &successor, &orig) < 0 && - !bkey_format_has_too_big_fields(f)); - } - } - - return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; -} - -void bch2_bkey_format_init(struct bkey_format_state *s) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(s->field_min); i++) - s->field_min[i] = U64_MAX; - - for (i = 0; i < ARRAY_SIZE(s->field_max); i++) - s->field_max[i] = 0; - - /* Make sure we can store a size of 0: */ - s->field_min[BKEY_FIELD_SIZE] = 0; -} - -void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) -{ - unsigned field = 0; - - __bkey_format_add(s, field++, p.inode); - __bkey_format_add(s, field++, p.offset); - __bkey_format_add(s, field++, p.snapshot); -} - -/* - * We don't want it to be possible for the packed format to represent fields - * bigger than a u64... that will cause confusion and issues (like with - * bkey_packed_successor()) - */ -static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, - unsigned bits, u64 offset) -{ - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - - bits = min(bits, unpacked_bits); - - offset = bits == unpacked_bits ? 
0 : min(offset, unpacked_max - ((1ULL << bits) - 1)); - - f->bits_per_field[i] = bits; - f->field_offset[i] = cpu_to_le64(offset); -} - -struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) -{ - unsigned i, bits = KEY_PACKED_BITS_START; - struct bkey_format ret = { - .nr_fields = BKEY_NR_FIELDS, - }; - - for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { - s->field_min[i] = min(s->field_min[i], s->field_max[i]); - - set_format_field(&ret, i, - fls64(s->field_max[i] - s->field_min[i]), - s->field_min[i]); - - bits += ret.bits_per_field[i]; - } - - /* allow for extent merging: */ - if (ret.bits_per_field[BKEY_FIELD_SIZE]) { - unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]); - - ret.bits_per_field[BKEY_FIELD_SIZE] += b; - bits += b; - } - - ret.key_u64s = DIV_ROUND_UP(bits, 64); - - /* if we have enough spare bits, round fields up to nearest byte */ - bits = ret.key_u64s * 64 - bits; - - for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { - unsigned r = round_up(ret.bits_per_field[i], 8) - - ret.bits_per_field[i]; - - if (r <= bits) { - set_format_field(&ret, i, - ret.bits_per_field[i] + r, - le64_to_cpu(ret.field_offset[i])); - bits -= r; - } - } - - if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { - struct printbuf buf = PRINTBUF; - - BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); - printbuf_exit(&buf); - } - - return ret; -} - -int bch2_bkey_format_invalid(struct bch_fs *c, - struct bkey_format *f, - enum bch_validate_flags flags, - struct printbuf *err) -{ - unsigned bits = KEY_PACKED_BITS_START; - - if (f->nr_fields != BKEY_NR_FIELDS) { - prt_printf(err, "incorrect number of fields: got %u, should be %u", - f->nr_fields, BKEY_NR_FIELDS); - return -BCH_ERR_invalid; - } - - /* - * Verify that the packed format can't represent fields larger than the - * unpacked format: - */ - for (unsigned i = 0; i < f->nr_fields; i++) { - if (bch2_bkey_format_field_overflows(f, i)) { - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - unsigned packed_bits = min(64, f->bits_per_field[i]); - u64 packed_max = packed_bits - ? 
~((~0ULL << 1) << (packed_bits - 1)) - : 0; - - prt_printf(err, "field %u too large: %llu + %llu > %llu", - i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max); - return -BCH_ERR_invalid; - } - - bits += f->bits_per_field[i]; - } - - if (f->key_u64s != DIV_ROUND_UP(bits, 64)) { - prt_printf(err, "incorrect key_u64s: got %u, should be %u", - f->key_u64s, DIV_ROUND_UP(bits, 64)); - return -BCH_ERR_invalid; - } - - return 0; -} - -void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f) -{ - prt_printf(out, "u64s %u fields ", f->key_u64s); - - for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) { - if (i) - prt_str(out, ", "); - prt_printf(out, "%u:%llu", - f->bits_per_field[i], - le64_to_cpu(f->field_offset[i])); - } -} - -/* - * Most significant differing bit - * Bits are indexed from 0 - return is [0, nr_key_bits) - */ -__pure -unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, - const struct bkey_packed *l_k, - const struct bkey_packed *r_k) -{ - const u64 *l = high_word(&b->format, l_k); - const u64 *r = high_word(&b->format, r_k); - unsigned nr_key_bits = b->nr_key_bits; - unsigned word_bits = 64 - high_bit_offset; - u64 l_v, r_v; - - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); - - /* for big endian, skip past header */ - l_v = *l & (~0ULL >> high_bit_offset); - r_v = *r & (~0ULL >> high_bit_offset); - - while (nr_key_bits) { - if (nr_key_bits < word_bits) { - l_v >>= word_bits - nr_key_bits; - r_v >>= word_bits - nr_key_bits; - nr_key_bits = 0; - } else { - nr_key_bits -= word_bits; - } - - if (l_v != r_v) - return fls64(l_v ^ r_v) - 1 + nr_key_bits; - - l = next_word(l); - r = next_word(r); - - l_v = *l; - r_v = *r; - word_bits = 64; - } - - return 0; -} - -/* - * First set bit - * Bits are indexed from 0 - return is [0, nr_key_bits) - */ -__pure -unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) -{ - const u64 *p = high_word(&b->format, k); - unsigned nr_key_bits = b->nr_key_bits; - unsigned ret = 0, offset; - - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); - - offset = nr_key_bits; - while (offset > 64) { - p = next_word(p); - offset -= 64; - } - - offset = 64 - offset; - - while (nr_key_bits) { - unsigned bits = nr_key_bits + offset < 64 - ? 
nr_key_bits - : 64 - offset; - - u64 mask = (~0ULL >> (64 - bits)) << offset; - - if (*p & mask) - return ret + __ffs64(*p & mask) - offset; - - p = prev_word(p); - nr_key_bits -= bits; - ret += bits; - offset = 0; - } - - return 0; -} - -#ifdef HAVE_BCACHEFS_COMPILED_UNPACK - -#define I(_x) (*(out)++ = (_x)) -#define I1(i0) I(i0) -#define I2(i0, i1) (I1(i0), I(i1)) -#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) -#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) -#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) - -static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, - enum bch_bkey_fields field, - unsigned dst_offset, unsigned dst_size, - bool *eax_zeroed) -{ - unsigned bits = format->bits_per_field[field]; - u64 offset = le64_to_cpu(format->field_offset[field]); - unsigned i, byte, bit_offset, align, shl, shr; - - if (!bits && !offset) { - if (!*eax_zeroed) { - /* xor eax, eax */ - I2(0x31, 0xc0); - } - - *eax_zeroed = true; - goto set_field; - } - - if (!bits) { - /* just return offset: */ - - switch (dst_size) { - case 8: - if (offset > S32_MAX) { - /* mov [rdi + dst_offset], offset */ - I3(0xc7, 0x47, dst_offset); - memcpy(out, &offset, 4); - out += 4; - - I3(0xc7, 0x47, dst_offset + 4); - memcpy(out, (void *) &offset + 4, 4); - out += 4; - } else { - /* mov [rdi + dst_offset], offset */ - /* sign extended */ - I4(0x48, 0xc7, 0x47, dst_offset); - memcpy(out, &offset, 4); - out += 4; - } - break; - case 4: - /* mov [rdi + dst_offset], offset */ - I3(0xc7, 0x47, dst_offset); - memcpy(out, &offset, 4); - out += 4; - break; - default: - BUG(); - } - - return out; - } - - bit_offset = format->key_u64s * 64; - for (i = 0; i <= field; i++) - bit_offset -= format->bits_per_field[i]; - - byte = bit_offset / 8; - bit_offset -= byte * 8; - - *eax_zeroed = false; - - if (bit_offset == 0 && bits == 8) { - /* movzx eax, BYTE PTR [rsi + imm8] */ - I4(0x0f, 0xb6, 0x46, byte); - } else if (bit_offset == 0 && bits == 16) { - /* movzx eax, WORD PTR [rsi + imm8] */ - I4(0x0f, 0xb7, 0x46, byte); - } else if (bit_offset + bits <= 32) { - align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); - byte -= align; - bit_offset += align * 8; - - BUG_ON(bit_offset + bits > 32); - - /* mov eax, [rsi + imm8] */ - I3(0x8b, 0x46, byte); - - if (bit_offset) { - /* shr eax, imm8 */ - I3(0xc1, 0xe8, bit_offset); - } - - if (bit_offset + bits < 32) { - unsigned mask = ~0U >> (32 - bits); - - /* and eax, imm32 */ - I1(0x25); - memcpy(out, &mask, 4); - out += 4; - } - } else if (bit_offset + bits <= 64) { - align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); - byte -= align; - bit_offset += align * 8; - - BUG_ON(bit_offset + bits > 64); - - /* mov rax, [rsi + imm8] */ - I4(0x48, 0x8b, 0x46, byte); - - shl = 64 - bit_offset - bits; - shr = bit_offset + shl; - - if (shl) { - /* shl rax, imm8 */ - I4(0x48, 0xc1, 0xe0, shl); - } - - if (shr) { - /* shr rax, imm8 */ - I4(0x48, 0xc1, 0xe8, shr); - } - } else { - align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); - byte -= align; - bit_offset += align * 8; - - BUG_ON(bit_offset + bits > 96); - - /* mov rax, [rsi + byte] */ - I4(0x48, 0x8b, 0x46, byte); - - /* mov edx, [rsi + byte + 8] */ - I3(0x8b, 0x56, byte + 8); - - /* bits from next word: */ - shr = bit_offset + bits - 64; - BUG_ON(shr > bit_offset); - - /* shr rax, bit_offset */ - I4(0x48, 0xc1, 0xe8, shr); - - /* shl rdx, imm8 */ - I4(0x48, 0xc1, 0xe2, 64 - shr); - - /* or rax, rdx */ - I3(0x48, 0x09, 0xd0); - - shr = bit_offset - shr; - - if (shr) { - /* shr rax, 
imm8 */ - I4(0x48, 0xc1, 0xe8, shr); - } - } - - /* rax += offset: */ - if (offset > S32_MAX) { - /* mov rdx, imm64 */ - I2(0x48, 0xba); - memcpy(out, &offset, 8); - out += 8; - /* add %rdx, %rax */ - I3(0x48, 0x01, 0xd0); - } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { - /* add rax, imm32 */ - I2(0x48, 0x05); - memcpy(out, &offset, 4); - out += 4; - } else if (offset) { - /* add eax, imm32 */ - I1(0x05); - memcpy(out, &offset, 4); - out += 4; - } -set_field: - switch (dst_size) { - case 8: - /* mov [rdi + dst_offset], rax */ - I4(0x48, 0x89, 0x47, dst_offset); - break; - case 4: - /* mov [rdi + dst_offset], eax */ - I3(0x89, 0x47, dst_offset); - break; - default: - BUG(); - } - - return out; -} - -int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) -{ - bool eax_zeroed = false; - u8 *out = _out; - - /* - * rdi: dst - unpacked key - * rsi: src - packed key - */ - - /* k->u64s, k->format, k->type */ - - /* mov eax, [rsi] */ - I2(0x8b, 0x06); - - /* add eax, BKEY_U64s - format->key_u64s */ - I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); - - /* and eax, imm32: mask out k->pad: */ - I5(0x25, 0xff, 0xff, 0xff, 0); - - /* mov [rdi], eax */ - I2(0x89, 0x07); - -#define x(id, field) \ - out = compile_bkey_field(format, out, id, \ - offsetof(struct bkey, field), \ - sizeof(((struct bkey *) NULL)->field), \ - &eax_zeroed); - bkey_fields() -#undef x - - /* retq */ - I1(0xc3); - - return (void *) out - _out; -} - -#else -#endif - -__pure -int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, - const struct bkey_packed *r, - const struct btree *b) -{ - return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); -} - -__pure __flatten -int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, - const struct bkey_packed *l, - const struct bpos *r) -{ - return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); -} - -__pure __flatten -int bch2_bkey_cmp_packed(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed_inlined(b, l, r); -} - -__pure __flatten -int __bch2_bkey_cmp_left_packed(const struct btree *b, - const struct bkey_packed *l, - const struct bpos *r) -{ - const struct bkey *l_unpacked; - - return unlikely(l_unpacked = packed_to_bkey_c(l)) - ? bpos_cmp(l_unpacked->p, *r) - : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -} - -void bch2_bpos_swab(struct bpos *p) -{ - u8 *l = (u8 *) p; - u8 *h = ((u8 *) &p[1]) - 1; - - while (l < h) { - swap(*l, *h); - l++; - --h; - } -} - -void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) -{ - const struct bkey_format *f = bkey_packed(k) ? 
_f : &bch2_bkey_format_current; - u8 *l = k->key_start; - u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1; - - while (l < h) { - swap(*l, *h); - l++; - --h; - } -} - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_bkey_pack_test(void) -{ - struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); - struct bkey_packed p; - - struct bkey_format test_format = { - .key_u64s = 3, - .nr_fields = BKEY_NR_FIELDS, - .bits_per_field = { - 13, - 64, - 32, - }, - }; - - struct unpack_state in_s = - unpack_state_init(&bch2_bkey_format_current, (void *) &t); - struct pack_state out_s = pack_state_init(&test_format, &p); - unsigned i; - - for (i = 0; i < out_s.format->nr_fields; i++) { - u64 a, v = get_inc_field(&in_s, i); - - switch (i) { -#define x(id, field) case id: a = t.field; break; - bkey_fields() -#undef x - default: - BUG(); - } - - if (a != v) - panic("got %llu actual %llu i %u\n", v, a, i); - - if (!set_inc_field(&out_s, i, v)) - panic("failed at %u\n", i); - } - - BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); -} -#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h deleted file mode 100644 index 3ccd521c190a..000000000000 --- a/fs/bcachefs/bkey.h +++ /dev/null @@ -1,605 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_H -#define _BCACHEFS_BKEY_H - -#include <linux/bug.h> -#include "bcachefs_format.h" -#include "bkey_types.h" -#include "btree_types.h" -#include "util.h" -#include "vstructs.h" - -#if 0 - -/* - * compiled unpack functions are disabled, pending a new interface for - * dynamically allocating executable memory: - */ - -#ifdef CONFIG_X86_64 -#define HAVE_BCACHEFS_COMPILED_UNPACK 1 -#endif -#endif - -void bch2_bkey_packed_to_binary_text(struct printbuf *, - const struct bkey_format *, - const struct bkey_packed *); - -enum bkey_lr_packed { - BKEY_PACKED_BOTH, - BKEY_PACKED_RIGHT, - BKEY_PACKED_LEFT, - BKEY_PACKED_NONE, -}; - -#define bkey_lr_packed(_l, _r) \ - ((_l)->format + ((_r)->format << 1)) - -static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src) -{ - memcpy_u64s_small(dst, src, src->u64s); -} - -static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src) -{ - memcpy_u64s_small(dst, src, src->k.u64s); -} - -struct btree; - -__pure -unsigned bch2_bkey_greatest_differing_bit(const struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); -__pure -unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); - -__pure -int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, - const struct bkey_packed *, - const struct btree *); - -__pure -int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, - const struct bkey_packed *, - const struct bpos *); - -__pure -int bch2_bkey_cmp_packed(const struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); - -__pure -int __bch2_bkey_cmp_left_packed(const struct btree *, - const struct bkey_packed *, - const struct bpos *); - -static inline __pure -int bkey_cmp_left_packed(const struct btree *b, - const struct bkey_packed *l, const struct bpos *r) -{ - return __bch2_bkey_cmp_left_packed(b, l, r); -} - -/* - * The compiler generates better code when we pass bpos by ref, but it's often - * enough terribly convenient to pass it by val... 
as much as I hate c++, const - * ref would be nice here: - */ -__pure __flatten -static inline int bkey_cmp_left_packed_byval(const struct btree *b, - const struct bkey_packed *l, - struct bpos r) -{ - return bkey_cmp_left_packed(b, l, &r); -} - -static __always_inline bool bpos_eq(struct bpos l, struct bpos r) -{ - return !((l.inode ^ r.inode) | - (l.offset ^ r.offset) | - (l.snapshot ^ r.snapshot)); -} - -static __always_inline bool bpos_lt(struct bpos l, struct bpos r) -{ - return l.inode != r.inode ? l.inode < r.inode : - l.offset != r.offset ? l.offset < r.offset : - l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false; -} - -static __always_inline bool bpos_le(struct bpos l, struct bpos r) -{ - return l.inode != r.inode ? l.inode < r.inode : - l.offset != r.offset ? l.offset < r.offset : - l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true; -} - -static __always_inline bool bpos_gt(struct bpos l, struct bpos r) -{ - return bpos_lt(r, l); -} - -static __always_inline bool bpos_ge(struct bpos l, struct bpos r) -{ - return bpos_le(r, l); -} - -static __always_inline int bpos_cmp(struct bpos l, struct bpos r) -{ - return cmp_int(l.inode, r.inode) ?: - cmp_int(l.offset, r.offset) ?: - cmp_int(l.snapshot, r.snapshot); -} - -static inline struct bpos bpos_min(struct bpos l, struct bpos r) -{ - return bpos_lt(l, r) ? l : r; -} - -static inline struct bpos bpos_max(struct bpos l, struct bpos r) -{ - return bpos_gt(l, r) ? l : r; -} - -static __always_inline bool bkey_eq(struct bpos l, struct bpos r) -{ - return !((l.inode ^ r.inode) | - (l.offset ^ r.offset)); -} - -static __always_inline bool bkey_lt(struct bpos l, struct bpos r) -{ - return l.inode != r.inode - ? l.inode < r.inode - : l.offset < r.offset; -} - -static __always_inline bool bkey_le(struct bpos l, struct bpos r) -{ - return l.inode != r.inode - ? l.inode < r.inode - : l.offset <= r.offset; -} - -static __always_inline bool bkey_gt(struct bpos l, struct bpos r) -{ - return bkey_lt(r, l); -} - -static __always_inline bool bkey_ge(struct bpos l, struct bpos r) -{ - return bkey_le(r, l); -} - -static __always_inline int bkey_cmp(struct bpos l, struct bpos r) -{ - return cmp_int(l.inode, r.inode) ?: - cmp_int(l.offset, r.offset); -} - -static inline struct bpos bkey_min(struct bpos l, struct bpos r) -{ - return bkey_lt(l, r) ? l : r; -} - -static inline struct bpos bkey_max(struct bpos l, struct bpos r) -{ - return bkey_gt(l, r) ? l : r; -} - -static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) -{ - return bpos_eq(l.k->p, r.k->p) && - l.k->size == r.k->size && - bkey_bytes(l.k) == bkey_bytes(r.k) && - !memcmp(l.v, r.v, bkey_val_bytes(l.k)); -} - -void bch2_bpos_swab(struct bpos *); -void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); - -static __always_inline int bversion_cmp(struct bversion l, struct bversion r) -{ - return cmp_int(l.hi, r.hi) ?: - cmp_int(l.lo, r.lo); -} - -#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) -#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) - -static __always_inline bool bversion_zero(struct bversion v) -{ - return bversion_cmp(v, ZERO_VERSION) == 0; -} - -#ifdef CONFIG_BCACHEFS_DEBUG -/* statement expressions confusing unlikely()? 
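The removed header keeps two parallel comparison families: bpos_*() orders positions by (inode, offset, snapshot), while bkey_*() deliberately ignores the snapshot field and compares only (inode, offset). A plain-C restatement of the same ordering - struct pos, cmp_u64() and the *_sketch() names are illustrative, not kernel API:

#include <stdint.h>

struct pos { uint64_t inode, offset; uint32_t snapshot; };

/* The kernel's cmp_int() is ((l > r) - (l < r)), spelled out here. */
static int cmp_u64(uint64_t l, uint64_t r)
{
        return (l > r) - (l < r);
}

/* Same ordering as bpos_cmp(): most significant field first. */
static int bpos_cmp_sketch(struct pos l, struct pos r)
{
        int c = cmp_u64(l.inode, r.inode);
        if (!c)
                c = cmp_u64(l.offset, r.offset);
        if (!c)
                c = cmp_u64(l.snapshot, r.snapshot);
        return c;
}

/* Same ordering as bkey_cmp(): snapshot does not participate. */
static int bkey_cmp_sketch(struct pos l, struct pos r)
{
        int c = cmp_u64(l.inode, r.inode);
        return c ? c : cmp_u64(l.offset, r.offset);
}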
*/ -#define bkey_packed(_k) \ - ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ - (_k)->format != KEY_FORMAT_CURRENT; }) -#else -#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) -#endif - -/* - * It's safe to treat an unpacked bkey as a packed one, but not the reverse - */ -static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) -{ - return (struct bkey_packed *) k; -} - -static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) -{ - return (const struct bkey_packed *) k; -} - -static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) -{ - return bkey_packed(k) ? NULL : (struct bkey_i *) k; -} - -static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) -{ - return bkey_packed(k) ? NULL : (const struct bkey *) k; -} - -static inline unsigned bkey_format_key_bits(const struct bkey_format *format) -{ - return format->bits_per_field[BKEY_FIELD_INODE] + - format->bits_per_field[BKEY_FIELD_OFFSET] + - format->bits_per_field[BKEY_FIELD_SNAPSHOT]; -} - -static inline struct bpos bpos_successor(struct bpos p) -{ - if (!++p.snapshot && - !++p.offset && - !++p.inode) - BUG(); - - return p; -} - -static inline struct bpos bpos_predecessor(struct bpos p) -{ - if (!p.snapshot-- && - !p.offset-- && - !p.inode--) - BUG(); - - return p; -} - -static inline struct bpos bpos_nosnap_successor(struct bpos p) -{ - p.snapshot = 0; - - if (!++p.offset && - !++p.inode) - BUG(); - - return p; -} - -static inline struct bpos bpos_nosnap_predecessor(struct bpos p) -{ - p.snapshot = 0; - - if (!p.offset-- && - !p.inode--) - BUG(); - - return p; -} - -static inline u64 bkey_start_offset(const struct bkey *k) -{ - return k->p.offset - k->size; -} - -static inline struct bpos bkey_start_pos(const struct bkey *k) -{ - return (struct bpos) { - .inode = k->p.inode, - .offset = bkey_start_offset(k), - .snapshot = k->p.snapshot, - }; -} - -/* Packed helpers */ - -static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; -} - -static inline bool bkeyp_u64s_valid(const struct bkey_format *f, - const struct bkey_packed *k) -{ - return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s); -} - -static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return bkeyp_key_u64s(format, k) * sizeof(u64); -} - -static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return k->u64s - bkeyp_key_u64s(format, k); -} - -static inline size_t bkeyp_val_bytes(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return bkeyp_val_u64s(format, k) * sizeof(u64); -} - -static inline void set_bkeyp_val_u64s(const struct bkey_format *format, - struct bkey_packed *k, unsigned val_u64s) -{ - k->u64s = bkeyp_key_u64s(format, k) + val_u64s; -} - -#define bkeyp_val(_format, _k) \ - ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k))) - -extern const struct bkey_format bch2_bkey_format_current; - -bool bch2_bkey_transform(const struct bkey_format *, - struct bkey_packed *, - const struct bkey_format *, - const struct bkey_packed *); - -struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, - const struct bkey_packed *); - -#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -struct bpos __bkey_unpack_pos(const struct bkey_format *, - const struct bkey_packed *); -#endif - -bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, - const struct bkey_format *); - -enum bkey_pack_pos_ret { - BKEY_PACK_POS_EXACT, - BKEY_PACK_POS_SMALLER, - BKEY_PACK_POS_FAIL, -}; - -enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, - const struct btree *); - -static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, - const struct btree *b) -{ - return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; -} - -void bch2_bkey_unpack(const struct btree *, struct bkey_i *, - const struct bkey_packed *); -bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, - const struct bkey_format *); - -typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); - -static inline void -__bkey_unpack_key_format_checked(const struct btree *b, - struct bkey *dst, - const struct bkey_packed *src) -{ - if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) { - compiled_unpack_fn unpack_fn = b->aux_data; - unpack_fn(dst, src); - - if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { - struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); - - BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); - } - } else { - *dst = __bch2_bkey_unpack_key(&b->format, src); - } -} - -static inline struct bkey -bkey_unpack_key_format_checked(const struct btree *b, - const struct bkey_packed *src) -{ - struct bkey dst; - - __bkey_unpack_key_format_checked(b, &dst, src); - return dst; -} - -static inline void __bkey_unpack_key(const struct btree *b, - struct bkey *dst, - const struct bkey_packed *src) -{ - if (likely(bkey_packed(src))) - __bkey_unpack_key_format_checked(b, dst, src); - else - *dst = *packed_to_bkey_c(src); -} - -/** - * bkey_unpack_key -- unpack just the key, not the value - */ -static inline struct bkey bkey_unpack_key(const struct btree *b, - const struct bkey_packed *src) -{ - return likely(bkey_packed(src)) - ? 
bkey_unpack_key_format_checked(b, src) - : *packed_to_bkey_c(src); -} - -static inline struct bpos -bkey_unpack_pos_format_checked(const struct btree *b, - const struct bkey_packed *src) -{ -#ifdef HAVE_BCACHEFS_COMPILED_UNPACK - return bkey_unpack_key_format_checked(b, src).p; -#else - return __bkey_unpack_pos(&b->format, src); -#endif -} - -static inline struct bpos bkey_unpack_pos(const struct btree *b, - const struct bkey_packed *src) -{ - return likely(bkey_packed(src)) - ? bkey_unpack_pos_format_checked(b, src) - : packed_to_bkey_c(src)->p; -} - -/* Disassembled bkeys */ - -static inline struct bkey_s_c bkey_disassemble(const struct btree *b, - const struct bkey_packed *k, - struct bkey *u) -{ - __bkey_unpack_key(b, u, k); - - return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; -} - -/* non const version: */ -static inline struct bkey_s __bkey_disassemble(const struct btree *b, - struct bkey_packed *k, - struct bkey *u) -{ - __bkey_unpack_key(b, u, k); - - return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; -} - -static inline u64 bkey_field_max(const struct bkey_format *f, - enum bch_bkey_fields nr) -{ - return f->bits_per_field[nr] < 64 - ? (le64_to_cpu(f->field_offset[nr]) + - ~(~0ULL << f->bits_per_field[nr])) - : U64_MAX; -} - -#ifdef HAVE_BCACHEFS_COMPILED_UNPACK - -int bch2_compile_bkey_format(const struct bkey_format *, void *); - -#else - -static inline int bch2_compile_bkey_format(const struct bkey_format *format, - void *out) { return 0; } - -#endif - -static inline void bkey_reassemble(struct bkey_i *dst, - struct bkey_s_c src) -{ - dst->k = *src.k; - memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); -} - -/* byte order helpers */ - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - -static inline unsigned high_word_offset(const struct bkey_format *f) -{ - return f->key_u64s - 1; -} - -#define high_bit_offset 0 -#define nth_word(p, n) ((p) - (n)) - -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - -static inline unsigned high_word_offset(const struct bkey_format *f) -{ - return 0; -} - -#define high_bit_offset KEY_PACKED_BITS_START -#define nth_word(p, n) ((p) + (n)) - -#else -#error edit for your odd byteorder. 
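The byte-order block that ends here hides the direction of the packed-key word walk: on little endian the most significant word of a key is its last u64 and nth_word() steps backwards, on big endian it is the first u64 and the walk steps forward (with high_bit_offset additionally skipping the KEY_PACKED_BITS_START header bits that share the top word). A runtime-checked sketch of the pointer arithmetic only - the *_sketch() names are illustrative:

#include <stdint.h>

/* Most significant word of a key occupying key_u64s words. */
static uint64_t *high_word_sketch(uint64_t *key, unsigned key_u64s,
                                  int big_endian)
{
        return big_endian ? key : key + key_u64s - 1;
}

/* Step toward the less significant words. */
static uint64_t *next_word_sketch(uint64_t *p, int big_endian)
{
        return big_endian ? p + 1 : p - 1;
}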
-#endif - -#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f)) -#define next_word(p) nth_word(p, 1) -#define prev_word(p) nth_word(p, -1) - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_bkey_pack_test(void); -#else -static inline void bch2_bkey_pack_test(void) {} -#endif - -#define bkey_fields() \ - x(BKEY_FIELD_INODE, p.inode) \ - x(BKEY_FIELD_OFFSET, p.offset) \ - x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ - x(BKEY_FIELD_SIZE, size) \ - x(BKEY_FIELD_VERSION_HI, bversion.hi) \ - x(BKEY_FIELD_VERSION_LO, bversion.lo) - -struct bkey_format_state { - u64 field_min[BKEY_NR_FIELDS]; - u64 field_max[BKEY_NR_FIELDS]; -}; - -void bch2_bkey_format_init(struct bkey_format_state *); - -static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v) -{ - s->field_min[field] = min(s->field_min[field], v); - s->field_max[field] = max(s->field_max[field], v); -} - -/* - * Changes @format so that @k can be successfully packed with @format - */ -static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) -{ -#define x(id, field) __bkey_format_add(s, id, k->field); - bkey_fields() -#undef x -} - -void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); -struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); - -static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i) -{ - unsigned f_bits = f->bits_per_field[i]; - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (f_bits > unpacked_bits) - return true; - - if ((f_bits == unpacked_bits) && field_offset) - return true; - - u64 f_mask = f_bits - ? ~((~0ULL << (f_bits - 1)) << 1) - : 0; - - if (((field_offset + f_mask) & unpacked_mask) < field_offset) - return true; - return false; -} - -int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, - enum bch_validate_flags, struct printbuf *); -void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); - -#endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h deleted file mode 100644 index a30c4ae8eb36..000000000000 --- a/fs/bcachefs/bkey_buf.h +++ /dev/null @@ -1,61 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_BUF_H -#define _BCACHEFS_BKEY_BUF_H - -#include "bcachefs.h" -#include "bkey.h" - -struct bkey_buf { - struct bkey_i *k; - u64 onstack[12]; -}; - -static inline void bch2_bkey_buf_realloc(struct bkey_buf *s, - struct bch_fs *c, unsigned u64s) -{ - if (s->k == (void *) s->onstack && - u64s > ARRAY_SIZE(s->onstack)) { - s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); - memcpy(s->k, s->onstack, sizeof(s->onstack)); - } -} - -static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s, - struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_buf_realloc(s, c, k.k->u64s); - bkey_reassemble(s->k, k); -} - -static inline void bch2_bkey_buf_copy(struct bkey_buf *s, - struct bch_fs *c, - struct bkey_i *src) -{ - bch2_bkey_buf_realloc(s, c, src->k.u64s); - bkey_copy(s->k, src); -} - -static inline void bch2_bkey_buf_unpack(struct bkey_buf *s, - struct bch_fs *c, - struct btree *b, - struct bkey_packed *src) -{ - bch2_bkey_buf_realloc(s, c, BKEY_U64s + - bkeyp_val_u64s(&b->format, src)); - bch2_bkey_unpack(b, s->k, src); -} - -static inline void bch2_bkey_buf_init(struct bkey_buf *s) -{ - s->k = (void *) s->onstack; -} - -static inline void 
bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c) -{ - if (s->k != (void *) s->onstack) - mempool_free(s->k, &c->large_bkey_pool); - s->k = NULL; -} - -#endif /* _BCACHEFS_BKEY_BUF_H */ diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h deleted file mode 100644 index 5f42a6e69360..000000000000 --- a/fs/bcachefs/bkey_cmp.h +++ /dev/null @@ -1,129 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_CMP_H -#define _BCACHEFS_BKEY_CMP_H - -#include "bkey.h" - -#ifdef CONFIG_X86_64 -static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, - unsigned nr_key_bits) -{ - long d0, d1, d2, d3; - int cmp; - - /* we shouldn't need asm for this, but gcc is being retarded: */ - - asm(".intel_syntax noprefix;" - "xor eax, eax;" - "xor edx, edx;" - "1:;" - "mov r8, [rdi];" - "mov r9, [rsi];" - "sub ecx, 64;" - "jl 2f;" - - "cmp r8, r9;" - "jnz 3f;" - - "lea rdi, [rdi - 8];" - "lea rsi, [rsi - 8];" - "jmp 1b;" - - "2:;" - "not ecx;" - "shr r8, 1;" - "shr r9, 1;" - "shr r8, cl;" - "shr r9, cl;" - "cmp r8, r9;" - - "3:\n" - "seta al;" - "setb dl;" - "sub eax, edx;" - ".att_syntax prefix;" - : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) - : "0" (l), "1" (r), "3" (nr_key_bits) - : "r8", "r9", "cc", "memory"); - - return cmp; -} -#else -static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, - unsigned nr_key_bits) -{ - u64 l_v, r_v; - - if (!nr_key_bits) - return 0; - - /* for big endian, skip past header */ - nr_key_bits += high_bit_offset; - l_v = *l & (~0ULL >> high_bit_offset); - r_v = *r & (~0ULL >> high_bit_offset); - - while (1) { - if (nr_key_bits < 64) { - l_v >>= 64 - nr_key_bits; - r_v >>= 64 - nr_key_bits; - nr_key_bits = 0; - } else { - nr_key_bits -= 64; - } - - if (!nr_key_bits || l_v != r_v) - break; - - l = next_word(l); - r = next_word(r); - - l_v = *l; - r_v = *r; - } - - return cmp_int(l_v, r_v); -} -#endif - -static inline __pure __flatten -int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l, - const struct bkey_packed *r, - const struct btree *b) -{ - const struct bkey_format *f = &b->format; - int ret; - - EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); - - ret = __bkey_cmp_bits(high_word(f, l), - high_word(f, r), - b->nr_key_bits); - - EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), - bkey_unpack_pos(b, r))); - return ret; -} - -static inline __pure __flatten -int bch2_bkey_cmp_packed_inlined(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - struct bkey unpacked; - - if (likely(bkey_packed(l) && bkey_packed(r))) - return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); - - if (bkey_packed(l)) { - __bkey_unpack_key_format_checked(b, &unpacked, l); - l = (void *) &unpacked; - } else if (bkey_packed(r)) { - __bkey_unpack_key_format_checked(b, &unpacked, r); - r = (void *) &unpacked; - } - - return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); -} - -#endif /* _BCACHEFS_BKEY_CMP_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c deleted file mode 100644 index fcd8c82cba4f..000000000000 --- a/fs/bcachefs/bkey_methods.c +++ /dev/null @@ -1,497 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "backpointers.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_types.h" -#include "alloc_background.h" -#include "dirent.h" -#include "disk_accounting.h" -#include "ec.h" -#include "error.h" -#include "extents.h" -#include "inode.h" 
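struct bkey_buf, deleted a little further up, is the classic small-buffer optimization: start with an on-stack array, switch to a heap (here, mempool) allocation only when a key is too large, and free only what was actually allocated. A libc-flavoured sketch of the pattern - malloc()/free() stand in for the large_bkey_pool mempool, and allocation-failure handling is omitted:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct small_buf {
        uint64_t *p;
        uint64_t onstack[12];
};

static void small_buf_init(struct small_buf *b)
{
        b->p = b->onstack;
}

/*
 * Grow at most once, from the stack array to a single heap allocation,
 * carrying the existing contents across - as bch2_bkey_buf_realloc()
 * does. Like the original, this assumes the fallback allocation is big
 * enough for any later request.
 */
static void small_buf_realloc(struct small_buf *b, size_t u64s)
{
        if (b->p == b->onstack &&
            u64s > sizeof(b->onstack) / sizeof(b->onstack[0])) {
                b->p = malloc(u64s * sizeof(uint64_t));
                memcpy(b->p, b->onstack, sizeof(b->onstack));
        }
}

static void small_buf_exit(struct small_buf *b)
{
        if (b->p != b->onstack)
                free(b->p);
        b->p = NULL;
}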
-#include "io_misc.h" -#include "lru.h" -#include "quota.h" -#include "reflink.h" -#include "snapshot.h" -#include "subvolume.h" -#include "xattr.h" - -const char * const bch2_bkey_types[] = { -#define x(name, nr, ...) #name, - BCH_BKEY_TYPES() -#undef x - NULL -}; - -static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return 0; -} - -#define bch2_bkey_ops_deleted ((struct bkey_ops) { \ - .key_validate = deleted_key_validate, \ -}) - -#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ - .key_validate = deleted_key_validate, \ -}) - -static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_val_bytes(k.k), - c, bkey_val_size_nonzero, - "incorrect value size (%zu != 0)", - bkey_val_bytes(k.k)); -fsck_err: - return ret; -} - -#define bch2_bkey_ops_error ((struct bkey_ops) { \ - .key_validate = empty_val_key_validate, \ -}) - -static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return 0; -} - -static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k); - - prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie)); -} - -#define bch2_bkey_ops_cookie ((struct bkey_ops) { \ - .key_validate = key_type_cookie_validate, \ - .val_to_text = key_type_cookie_to_text, \ - .min_val_size = 8, \ -}) - -#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ - .key_validate = empty_val_key_validate, \ -}) - -static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return 0; -} - -static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); - unsigned datalen = bkey_inline_data_bytes(k.k); - - prt_printf(out, "datalen %u: %*phN", - datalen, min(datalen, 32U), d.v->data); -} - -#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ - .key_validate = key_type_inline_data_validate, \ - .val_to_text = key_type_inline_data_to_text, \ -}) - -static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) -{ - bch2_key_resize(l.k, l.k->size + r.k->size); - return true; -} - -#define bch2_bkey_ops_set ((struct bkey_ops) { \ - .key_validate = empty_val_key_validate, \ - .key_merge = key_type_set_merge, \ -}) - -const struct bkey_ops bch2_bkey_ops[] = { -#define x(name, nr, ...) 
[KEY_TYPE_##name] = bch2_bkey_ops_##name, - BCH_BKEY_TYPES() -#undef x -}; - -const struct bkey_ops bch2_bkey_null_ops = { -}; - -int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) - return 0; - - const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - int ret = 0; - - bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, - c, bkey_val_size_too_small, - "bad val size (%zu < %u)", - bkey_val_bytes(k.k), ops->min_val_size); - - if (!ops->key_validate) - return 0; - - ret = ops->key_validate(c, k, from); -fsck_err: - return ret; -} - -static u64 bch2_key_types_allowed[] = { - [BKEY_TYPE_btree] = - BIT_ULL(KEY_TYPE_deleted)| - BIT_ULL(KEY_TYPE_btree_ptr)| - BIT_ULL(KEY_TYPE_btree_ptr_v2), -#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, - BCH_BTREE_IDS() -#undef x -}; - -static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = { -#define x(name, nr, flags) [KEY_TYPE_##name] = flags, - BCH_BKEY_TYPES() -#undef x -}; - -const char *bch2_btree_node_type_str(enum btree_node_type type) -{ - return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); -} - -int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - enum btree_node_type type = __btree_node_type(from.level, from.btree); - - if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) - return 0; - - int ret = 0; - - bkey_fsck_err_on(k.k->u64s < BKEY_U64s, - c, bkey_u64s_too_small, - "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); - - if (type >= BKEY_TYPE_NR) - return 0; - - enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX - ? bch2_bkey_type_flags[k.k->type] - : 0; - - bool strict_key_type_allowed = - (from.flags & BCH_VALIDATE_commit) || - type == BKEY_TYPE_btree || - (from.btree < BTREE_ID_NR && - (bkey_flags & BKEY_TYPE_strict_btree_checks)); - - bkey_fsck_err_on(strict_key_type_allowed && - k.k->type < KEY_TYPE_MAX && - !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), - c, bkey_invalid_type_for_btree, - "invalid key type for btree %s (%s)", - bch2_btree_node_type_str(type), - k.k->type < KEY_TYPE_MAX - ? 
bch2_bkey_types[k.k->type] - : "(unknown)"); - - if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { - bkey_fsck_err_on(k.k->size == 0, - c, bkey_extent_size_zero, - "size == 0"); - - bkey_fsck_err_on(k.k->size > k.k->p.offset, - c, bkey_extent_size_greater_than_offset, - "size greater than offset (%u > %llu)", - k.k->size, k.k->p.offset); - } else { - bkey_fsck_err_on(k.k->size, - c, bkey_size_nonzero, - "size != 0"); - } - - if (type != BKEY_TYPE_btree) { - enum btree_id btree = type - 1; - - if (btree_type_has_snapshots(btree)) { - bkey_fsck_err_on(!k.k->p.snapshot, - c, bkey_snapshot_zero, - "snapshot == 0"); - } else if (!btree_type_has_snapshot_field(btree)) { - bkey_fsck_err_on(k.k->p.snapshot, - c, bkey_snapshot_nonzero, - "nonzero snapshot"); - } else { - /* - * btree uses snapshot field but it's not required to be - * nonzero - */ - } - - bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), - c, bkey_at_pos_max, - "key at POS_MAX"); - } -fsck_err: - return ret; -} - -int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return __bch2_bkey_validate(c, k, from) ?: - bch2_bkey_val_validate(c, k, from); -} - -int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), - c, bkey_before_start_of_btree_node, - "key before start of btree node"); - - bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), - c, bkey_after_end_of_btree_node, - "key past end of btree node"); -fsck_err: - return ret; -} - -void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) -{ - if (bpos_eq(pos, POS_MIN)) - prt_printf(out, "POS_MIN"); - else if (bpos_eq(pos, POS_MAX)) - prt_printf(out, "POS_MAX"); - else if (bpos_eq(pos, SPOS_MAX)) - prt_printf(out, "SPOS_MAX"); - else { - if (pos.inode == U64_MAX) - prt_printf(out, "U64_MAX"); - else - prt_printf(out, "%llu", pos.inode); - prt_printf(out, ":"); - if (pos.offset == U64_MAX) - prt_printf(out, "U64_MAX"); - else - prt_printf(out, "%llu", pos.offset); - prt_printf(out, ":"); - if (pos.snapshot == U32_MAX) - prt_printf(out, "U32_MAX"); - else - prt_printf(out, "%u", pos.snapshot); - } -} - -void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) -{ - if (k) { - prt_printf(out, "u64s %u type ", k->u64s); - - if (k->type < KEY_TYPE_MAX) - prt_printf(out, "%s ", bch2_bkey_types[k->type]); - else - prt_printf(out, "%u ", k->type); - - bch2_bpos_to_text(out, k->p); - - prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo); - } else { - prt_printf(out, "(null)"); - } -} - -void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - - if (likely(ops->val_to_text)) - ops->val_to_text(out, c, k); -} - -void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_to_text(out, k.k); - - if (bkey_val_bytes(k.k)) { - prt_printf(out, ": "); - bch2_val_to_text(out, c, k); - } -} - -void bch2_bkey_swab_val(struct bkey_s k) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - - if (ops->swab) - ops->swab(k); -} - -bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - - return ops->key_normalize - ? 
ops->key_normalize(c, k) - : false; -} - -bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type); - - return ops->key_merge && - bch2_bkey_maybe_mergable(l.k, r.k) && - (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && - !static_branch_unlikely(&bch2_key_merging_disabled) && - ops->key_merge(c, l, r); -} - -static const struct old_bkey_type { - u8 btree_node_type; - u8 old; - u8 new; -} bkey_renumber_table[] = { - {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr }, - {BKEY_TYPE_extents, 128, KEY_TYPE_extent }, - {BKEY_TYPE_extents, 129, KEY_TYPE_extent }, - {BKEY_TYPE_extents, 130, KEY_TYPE_reservation }, - {BKEY_TYPE_inodes, 128, KEY_TYPE_inode }, - {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation }, - {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent }, - {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout }, - {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr }, - {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout }, - {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc }, - {BKEY_TYPE_quotas, 128, KEY_TYPE_quota }, -}; - -void bch2_bkey_renumber(enum btree_node_type btree_node_type, - struct bkey_packed *k, - int write) -{ - const struct old_bkey_type *i; - - for (i = bkey_renumber_table; - i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); - i++) - if (btree_node_type == i->btree_node_type && - k->type == (write ? i->new : i->old)) { - k->type = write ? i->old : i->new; - break; - } -} - -void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, - struct bkey_format *f, - struct bkey_packed *k) -{ - const struct bkey_ops *ops; - struct bkey uk; - unsigned nr_compat = 5; - int i; - - /* - * Do these operations in reverse order in the write path: - */ - - for (i = 0; i < nr_compat; i++) - switch (!write ? i : nr_compat - 1 - i) { - case 0: - if (big_endian != CPU_BIG_ENDIAN) { - bch2_bkey_swab_key(f, k); - } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - bch2_bkey_swab_key(f, k); - bch2_bkey_swab_key(f, k); - } - break; - case 1: - if (version < bcachefs_metadata_version_bkey_renumber) - bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); - break; - case 2: - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_inodes) { - if (!bkey_packed(k)) { - struct bkey_i *u = packed_to_bkey(k); - - swap(u->k.p.inode, u->k.p.offset); - } else if (f->bits_per_field[BKEY_FIELD_INODE] && - f->bits_per_field[BKEY_FIELD_OFFSET]) { - struct bkey_format tmp = *f, *in = f, *out = &tmp; - - swap(tmp.bits_per_field[BKEY_FIELD_INODE], - tmp.bits_per_field[BKEY_FIELD_OFFSET]); - swap(tmp.field_offset[BKEY_FIELD_INODE], - tmp.field_offset[BKEY_FIELD_OFFSET]); - - if (!write) - swap(in, out); - - uk = __bch2_bkey_unpack_key(in, k); - swap(uk.p.inode, uk.p.offset); - BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); - } - } - break; - case 3: - if (version < bcachefs_metadata_version_snapshot && - (level || btree_type_has_snapshots(btree_id))) { - struct bkey_i *u = packed_to_bkey(k); - - if (u) { - u->k.p.snapshot = write - ? 0 : U32_MAX; - } else { - u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]); - u64 max_packed = min_packed + - ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); - - uk = __bch2_bkey_unpack_key(f, k); - uk.p.snapshot = write - ? 
min_packed : min_t(u64, U32_MAX, max_packed); - - BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); - } - } - - break; - case 4: { - struct bkey_s u; - - if (!bkey_packed(k)) { - u = bkey_i_to_s(packed_to_bkey(k)); - } else { - uk = __bch2_bkey_unpack_key(f, k); - u.k = &uk; - u.v = bkeyp_val(f, k); - } - - if (big_endian != CPU_BIG_ENDIAN) - bch2_bkey_swab_val(u); - - ops = bch2_bkey_type_ops(k->type); - - if (ops->compat) - ops->compat(btree_id, version, big_endian, write, u); - break; - } - default: - BUG(); - } -} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h deleted file mode 100644 index bf34111cdf00..000000000000 --- a/fs/bcachefs/bkey_methods.h +++ /dev/null @@ -1,139 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_METHODS_H -#define _BCACHEFS_BKEY_METHODS_H - -#include "bkey.h" - -struct bch_fs; -struct btree; -struct btree_trans; -struct bkey; -enum btree_node_type; - -extern const char * const bch2_bkey_types[]; -extern const struct bkey_ops bch2_bkey_null_ops; - -/* - * key_validate: checks validity of @k, returns 0 if good or -EINVAL if bad. If - * invalid, entire key will be deleted. - * - * When invalid, error string is returned via @err. @rw indicates whether key is - * being read or written; more aggressive checks can be enabled when rw == WRITE. - */ -struct bkey_ops { - int (*key_validate)(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from); - void (*val_to_text)(struct printbuf *, struct bch_fs *, - struct bkey_s_c); - void (*swab)(struct bkey_s); - bool (*key_normalize)(struct bch_fs *, struct bkey_s); - bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); - int (*trigger)(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - void (*compat)(enum btree_id id, unsigned version, - unsigned big_endian, int write, - struct bkey_s); - - /* Size of value type when first created: */ - unsigned min_val_size; -}; - -extern const struct bkey_ops bch2_bkey_ops[]; - -static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) -{ - return likely(type < KEY_TYPE_MAX) - ? 
&bch2_bkey_ops[type] - : &bch2_bkey_null_ops; -} - -int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, - struct bkey_validate_context from); - -void bch2_bpos_to_text(struct printbuf *, struct bpos); -void bch2_bkey_to_text(struct printbuf *, const struct bkey *); -void bch2_val_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); -void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); - -void bch2_bkey_swab_val(struct bkey_s); - -bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); - -static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) -{ - return l->type == r->type && - !bversion_cmp(l->bversion, r->bversion) && - bpos_eq(l->p, bkey_start_pos(r)); -} - -bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); - -static inline int bch2_key_trigger(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); - - return ops->trigger - ? ops->trigger(trans, btree, level, old, new, flags) - : 0; -} - -static inline int bch2_key_trigger_old(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_i deleted; - - bkey_init(&deleted.k); - deleted.k.p = old.k->p; - - return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted), - BTREE_TRIGGER_overwrite|flags); -} - -static inline int bch2_key_trigger_new(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_i deleted; - - bkey_init(&deleted.k); - deleted.k.p = new.k->p; - - return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, - BTREE_TRIGGER_insert|flags); -} - -void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); - -void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, - int, struct bkey_format *, struct bkey_packed *); - -static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, - struct bkey_format *f, - struct bkey_packed *k) -{ - if (version < bcachefs_metadata_version_current || - big_endian != CPU_BIG_ENDIAN || - IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - __bch2_bkey_compat(level, btree_id, version, - big_endian, write, f, k); - -} - -#endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c deleted file mode 100644 index 4536eb50fc40..000000000000 --- a/fs/bcachefs/bkey_sort.c +++ /dev/null @@ -1,214 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "bkey_buf.h" -#include "bkey_cmp.h" -#include "bkey_sort.h" -#include "bset.h" -#include "extents.h" - -typedef int (*sort_cmp_fn)(const struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); - -static inline bool sort_iter_end(struct sort_iter *iter) -{ - return !iter->used; -} - -static inline void sort_iter_sift(struct sort_iter *iter, unsigned from, - sort_cmp_fn cmp) -{ - unsigned i; - - for (i = from; 
- i + 1 < iter->used && - cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; - i++) - swap(iter->data[i], iter->data[i + 1]); -} - -static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) -{ - unsigned i = iter->used; - - while (i--) - sort_iter_sift(iter, i, cmp); -} - -static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) -{ - return !sort_iter_end(iter) ? iter->data->k : NULL; -} - -static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) -{ - struct sort_iter_set *i = iter->data; - - BUG_ON(!iter->used); - - i->k = bkey_p_next(i->k); - - BUG_ON(i->k > i->end); - - if (i->k == i->end) - array_remove_item(iter->data, iter->used, 0); - else - sort_iter_sift(iter, 0, cmp); -} - -static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, - sort_cmp_fn cmp) -{ - struct bkey_packed *ret = sort_iter_peek(iter); - - if (ret) - sort_iter_advance(iter, cmp); - - return ret; -} - -/* - * If keys compare equal, compare by pointer order: - */ -static inline int key_sort_fix_overlapping_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed(b, l, r) ?: - cmp_int((unsigned long) l, (unsigned long) r); -} - -static inline bool should_drop_next_key(struct sort_iter *iter) -{ - /* - * key_sort_cmp() ensures that when keys compare equal the older key - * comes first; so if l->k compares equal to r->k then l->k is older - * and should be dropped. - */ - return iter->used >= 2 && - !bch2_bkey_cmp_packed(iter->b, - iter->data[0].k, - iter->data[1].k); -} - -struct btree_nr_keys -bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, - struct sort_iter *iter) -{ - struct bkey_packed *out = dst->start; - struct bkey_packed *k; - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - sort_iter_sort(iter, key_sort_fix_overlapping_cmp); - - while ((k = sort_iter_peek(iter))) { - if (!bkey_deleted(k) && - !should_drop_next_key(iter)) { - bkey_p_copy(out, k); - btree_keys_account_key_add(&nr, 0, out); - out = bkey_p_next(out); - } - - sort_iter_advance(iter, key_sort_fix_overlapping_cmp); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -/* Sort + repack in a new format: */ -struct btree_nr_keys -bch2_sort_repack(struct bset *dst, struct btree *src, - struct btree_node_iter *src_iter, - struct bkey_format *out_f, - bool filter_whiteouts) -{ - struct bkey_format *in_f = &src->format; - struct bkey_packed *in, *out = vstruct_last(dst); - struct btree_nr_keys nr; - bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); - - memset(&nr, 0, sizeof(nr)); - - while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { - if (filter_whiteouts && bkey_deleted(in)) - continue; - - if (!transform) - bkey_p_copy(out, in); - else if (bch2_bkey_transform(out_f, out, bkey_packed(in) - ? 
in_f : &bch2_bkey_format_current, in)) - out->format = KEY_FORMAT_LOCAL_BTREE; - else - bch2_bkey_unpack(src, (void *) out, in); - - out->needs_whiteout = false; - - btree_keys_account_key_add(&nr, 0, out); - out = bkey_p_next(out); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -static inline int keep_unwritten_whiteouts_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed_inlined(b, l, r) ?: - (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: - (long) l - (long) r; -} - -#include "btree_update_interior.h" - -/* - * For sorting in the btree node write path: whiteouts not in the unwritten - * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are - * dropped if overwritten by real keys: - */ -unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter) -{ - struct bkey_packed *in, *next, *out = dst; - - sort_iter_sort(iter, keep_unwritten_whiteouts_cmp); - - while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) { - if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b)) - continue; - - if ((next = sort_iter_peek(iter)) && - !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) - continue; - - bkey_p_copy(out, in); - out = bkey_p_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - -/* - * Main sort routine for compacting a btree node in memory: we always drop - * whiteouts because any whiteouts that need to be written are in the unwritten - * whiteouts area: - */ -unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter) -{ - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined); - - while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) { - if (bkey_deleted(in)) - continue; - - bkey_p_copy(out, in); - out = bkey_p_next(out); - } - - return (u64 *) out - (u64 *) dst; -} diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h deleted file mode 100644 index 9be969d46890..000000000000 --- a/fs/bcachefs/bkey_sort.h +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_SORT_H -#define _BCACHEFS_BKEY_SORT_H - -struct sort_iter { - struct btree *b; - unsigned used; - unsigned size; - - struct sort_iter_set { - struct bkey_packed *k, *end; - } data[]; -}; - -static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size) -{ - iter->b = b; - iter->used = 0; - iter->size = size; -} - -struct sort_iter_stack { - struct sort_iter iter; - struct sort_iter_set sets[MAX_BSETS + 1]; -}; - -static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b) -{ - sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets)); -} - -static inline void sort_iter_add(struct sort_iter *iter, - struct bkey_packed *k, - struct bkey_packed *end) -{ - BUG_ON(iter->used >= iter->size); - - if (k != end) - iter->data[iter->used++] = (struct sort_iter_set) { k, end }; -} - -struct btree_nr_keys -bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, - struct sort_iter *); - -struct btree_nr_keys -bch2_sort_repack(struct bset *, struct btree *, - struct btree_node_iter *, - struct bkey_format *, bool); - -unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *); -unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *); - -#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h deleted file mode 100644 index 
b4f328f9853c..000000000000 --- a/fs/bcachefs/bkey_types.h +++ /dev/null @@ -1,241 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_TYPES_H -#define _BCACHEFS_BKEY_TYPES_H - -#include "bcachefs_format.h" - -/* - * bkey_i - bkey with inline value - * bkey_s - bkey with split value - * bkey_s_c - bkey with split value, const - */ - -#define bkey_p_next(_k) vstruct_next(_k) - -static inline struct bkey_i *bkey_next(struct bkey_i *k) -{ - return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); -} - -#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) - -static inline size_t bkey_val_bytes(const struct bkey *k) -{ - return bkey_val_u64s(k) * sizeof(u64); -} - -static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) -{ - unsigned u64s = BKEY_U64s + val_u64s; - - BUG_ON(u64s > U8_MAX); - k->u64s = u64s; -} - -static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) -{ - set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); -} - -#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) - -#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) - -#define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) - -/* bkey with split value, const */ -struct bkey_s_c { - const struct bkey *k; - const struct bch_val *v; -}; - -/* bkey with split value */ -struct bkey_s { - union { - struct { - struct bkey *k; - struct bch_val *v; - }; - struct bkey_s_c s_c; - }; -}; - -#define bkey_s_null ((struct bkey_s) { .k = NULL }) -#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) - -#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) -#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) - -static inline struct bkey_s bkey_to_s(struct bkey *k) -{ - return (struct bkey_s) { .k = k, .v = NULL }; -} - -static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) -{ - return (struct bkey_s_c) { .k = k, .v = NULL }; -} - -static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) -{ - return (struct bkey_s) { .k = &k->k, .v = &k->v }; -} - -static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) -{ - return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; -} - -/* - * For a given type of value (e.g. struct bch_extent), generates the types for - * bkey + bch_extent - inline, split, split const - and also all the conversion - * functions, which also check that the value is of the correct type. - * - * We use anonymous unions for upcasting - e.g. converting from e.g. a - * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion - * functions. - */ -#define x(name, ...) 
\ -struct bkey_i_##name { \ - union { \ - struct bkey k; \ - struct bkey_i k_i; \ - }; \ - struct bch_##name v; \ -}; \ - \ -struct bkey_s_c_##name { \ - union { \ - struct { \ - const struct bkey *k; \ - const struct bch_##name *v; \ - }; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -struct bkey_s_##name { \ - union { \ - struct { \ - struct bkey *k; \ - struct bch_##name *v; \ - }; \ - struct bkey_s_c_##name c; \ - struct bkey_s s; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline const struct bkey_i_##name * \ -bkey_i_to_##name##_c(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ -{ \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -name##_i_to_s_c(const struct bkey_i_##name *k) \ -{ \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -bkey_i_to_s_c_##name(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ -{ \ - struct bkey_i_##name *k = \ - container_of(&_k->k, struct bkey_i_##name, k); \ - \ - bkey_init(&k->k); \ - memset(&k->v, 0, sizeof(k->v)); \ - k->k.type = KEY_TYPE_##name; \ - set_bkey_val_bytes(&k->k, sizeof(k->v)); \ - \ - return k; \ -} - -BCH_BKEY_TYPES(); -#undef x - -enum bch_validate_flags { - BCH_VALIDATE_write = BIT(0), - BCH_VALIDATE_commit = BIT(1), - BCH_VALIDATE_silent = BIT(2), -}; - -#define BKEY_VALIDATE_CONTEXTS() \ - x(unknown) \ - x(superblock) \ - x(journal) \ - x(btree_root) \ - x(btree_node) \ - x(commit) - -struct bkey_validate_context { - enum { -#define x(n) BKEY_VALIDATE_##n, - BKEY_VALIDATE_CONTEXTS() -#undef x - } from:8; - enum bch_validate_flags flags:8; - u8 level; - enum btree_id btree; - bool root:1; - unsigned journal_offset; - u64 journal_seq; -}; - -#endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c deleted file mode 100644 index 32841f762eb2..000000000000 --- a/fs/bcachefs/bset.c +++ /dev/null @@ -1,1576 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Code for working with individual keys, and sorted sets of keys within a - * btree
node - * - * Copyright 2012 Google, Inc. - */ - -#include "bcachefs.h" -#include "btree_cache.h" -#include "bset.h" -#include "eytzinger.h" -#include "trace.h" -#include "util.h" - -#include <linux/unaligned.h> -#include <linux/console.h> -#include <linux/random.h> -#include <linux/prefetch.h> - -static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, - struct btree *); - -static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) -{ - unsigned n = ARRAY_SIZE(iter->data); - - while (n && __btree_node_iter_set_end(iter, n - 1)) - --n; - - return n; -} - -struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) -{ - return bch2_bkey_to_bset_inlined(b, k); -} - -/* - * There are never duplicate live keys in the btree - but including keys that - * have been flagged as deleted (and will be cleaned up later) we _will_ see - * duplicates. - * - * Thus the sort order is: usual key comparison first, but for keys that compare - * equal the deleted key(s) come first, and the (at most one) live version comes - * last. - * - * The main reason for this is insertion: to handle overwrites, we first iterate - * over keys that compare equal to our insert key, and then insert immediately - * prior to the first key greater than the key we're inserting - our insert - * position will be after all keys that compare equal to our insert key, which - * by the time we actually do the insert will all be deleted. - */ - -void bch2_dump_bset(struct bch_fs *c, struct btree *b, - struct bset *i, unsigned set) -{ - struct bkey_packed *_k, *_n; - struct bkey uk, n; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - - if (!i->u64s) - return; - - for (_k = i->start; - _k < vstruct_last(i); - _k = _n) { - _n = bkey_p_next(_k); - - if (!_k->u64s) { - printk(KERN_ERR "block %u key %5zu - u64s 0? 
aieee!\n", set, - _k->_data - i->_data); - break; - } - - k = bkey_disassemble(b, _k, &uk); - - printbuf_reset(&buf); - if (c) - bch2_bkey_val_to_text(&buf, c, k); - else - bch2_bkey_to_text(&buf, k.k); - printk(KERN_ERR "block %u key %5zu: %s\n", set, - _k->_data - i->_data, buf.buf); - - if (_n == vstruct_last(i)) - continue; - - n = bkey_unpack_key(b, _n); - - if (bpos_lt(n.p, k.k->p)) { - printk(KERN_ERR "Key skipped backwards\n"); - continue; - } - - if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p)) - printk(KERN_ERR "Duplicate keys\n"); - } - - printbuf_exit(&buf); -} - -void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) -{ - console_lock(); - for_each_bset(b, t) - bch2_dump_bset(c, b, bset(b, t), t - b->set); - console_unlock(); -} - -void bch2_dump_btree_node_iter(struct btree *b, - struct btree_node_iter *iter) -{ - struct btree_node_iter_set *set; - struct printbuf buf = PRINTBUF; - - printk(KERN_ERR "btree node iter with %u/%u sets:\n", - __btree_node_iter_used(iter), b->nsets); - - btree_node_iter_for_each(iter, set) { - struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); - struct bset_tree *t = bch2_bkey_to_bset(b, k); - struct bkey uk = bkey_unpack_key(b, k); - - printbuf_reset(&buf); - bch2_bkey_to_text(&buf, &uk); - printk(KERN_ERR "set %zu key %u: %s\n", - t - b->set, set->k, buf.buf); - } - - printbuf_exit(&buf); -} - -struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) -{ - struct bkey_packed *k; - struct btree_nr_keys nr = {}; - - for_each_bset(b, t) - bset_tree_for_each_key(b, t, k) - if (!bkey_deleted(k)) - btree_keys_account_key_add(&nr, t - b->set, k); - return nr; -} - -void __bch2_verify_btree_nr_keys(struct btree *b) -{ - struct btree_nr_keys nr = bch2_btree_node_count_keys(b); - - BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); -} - -static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, - struct btree *b) -{ - struct btree_node_iter iter = *_iter; - const struct bkey_packed *k, *n; - - k = bch2_btree_node_iter_peek_all(&iter, b); - __bch2_btree_node_iter_advance(&iter, b); - n = bch2_btree_node_iter_peek_all(&iter, b); - - bkey_unpack_key(b, k); - - if (n && - bkey_iter_cmp(b, k, n) > 0) { - struct btree_node_iter_set *set; - struct bkey ku = bkey_unpack_key(b, k); - struct bkey nu = bkey_unpack_key(b, n); - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&buf1, &ku); - bch2_bkey_to_text(&buf2, &nu); - printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", - buf1.buf, buf2.buf); - printk(KERN_ERR "iter was:"); - - btree_node_iter_for_each(_iter, set) { - struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k); - struct bset_tree *t = bch2_bkey_to_bset(b, k2); - printk(" [%zi %zi]", t - b->set, - k2->_data - bset(b, t)->_data); - } - panic("\n"); - } -} - -void __bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) -{ - struct btree_node_iter_set *set, *s2; - struct bkey_packed *k, *p; - - if (bch2_btree_node_iter_end(iter)) - return; - - /* Verify no duplicates: */ - btree_node_iter_for_each(iter, set) { - BUG_ON(set->k > set->end); - btree_node_iter_for_each(iter, s2) - BUG_ON(set != s2 && set->end == s2->end); - } - - /* Verify that set->end is correct: */ - btree_node_iter_for_each(iter, set) { - for_each_bset(b, t) - if (set->end == t->end_offset) { - BUG_ON(set->k < btree_bkey_first_offset(t) || - set->k >= t->end_offset); - goto found; - } - BUG(); -found: - do {} while (0); - } - - /* Verify iterator is 
sorted: */ - btree_node_iter_for_each(iter, set) - BUG_ON(set != iter->data && - btree_node_iter_cmp(b, set[-1], set[0]) > 0); - - k = bch2_btree_node_iter_peek_all(iter, b); - - for_each_bset(b, t) { - if (iter->data[0].end == t->end_offset) - continue; - - p = bch2_bkey_prev_all(b, t, - bch2_btree_node_iter_bset_pos(iter, b, t)); - - BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); - } -} - -static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, - struct bkey_packed *insert, unsigned clobber_u64s) -{ - struct bset_tree *t = bch2_bkey_to_bset(b, where); - struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); - struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s); - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; -#if 0 - BUG_ON(prev && - bkey_iter_cmp(b, prev, insert) > 0); -#else - if (prev && - bkey_iter_cmp(b, prev, insert) > 0) { - struct bkey k1 = bkey_unpack_key(b, prev); - struct bkey k2 = bkey_unpack_key(b, insert); - - bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&buf1, &k1); - bch2_bkey_to_text(&buf2, &k2); - - panic("prev > insert:\n" - "prev key %s\n" - "insert key %s\n", - buf1.buf, buf2.buf); - } -#endif -#if 0 - BUG_ON(next != btree_bkey_last(b, t) && - bkey_iter_cmp(b, insert, next) > 0); -#else - if (next != btree_bkey_last(b, t) && - bkey_iter_cmp(b, insert, next) > 0) { - struct bkey k1 = bkey_unpack_key(b, insert); - struct bkey k2 = bkey_unpack_key(b, next); - - bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&buf1, &k1); - bch2_bkey_to_text(&buf2, &k2); - - panic("insert > next:\n" - "insert key %s\n" - "next key %s\n", - buf1.buf, buf2.buf); - } -#endif -} - -static inline void bch2_verify_insert_pos(struct btree *b, - struct bkey_packed *where, - struct bkey_packed *insert, - unsigned clobber_u64s) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) - __bch2_verify_insert_pos(b, where, insert, clobber_u64s); -} - - -/* Auxiliary search trees */ - -#define BFLOAT_FAILED_UNPACKED U8_MAX -#define BFLOAT_FAILED U8_MAX - -struct bkey_float { - u8 exponent; - u8 key_offset; - u16 mantissa; -}; -#define BKEY_MANTISSA_BITS 16 - -struct ro_aux_tree { - u8 nothing[0]; - struct bkey_float f[]; -}; - -struct rw_aux_tree { - u16 offset; - struct bpos k; -}; - -static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) -{ - BUG_ON(t->aux_data_offset == U16_MAX); - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - return t->aux_data_offset; - case BSET_RO_AUX_TREE: - return t->aux_data_offset + - DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8); - case BSET_RW_AUX_TREE: - return t->aux_data_offset + - DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); - default: - BUG(); - } -} - -static unsigned bset_aux_tree_buf_start(const struct btree *b, - const struct bset_tree *t) -{ - return t == b->set - ? 
DIV_ROUND_UP(b->unpack_fn_len, 8) - : bset_aux_tree_buf_end(t - 1); -} - -static void *__aux_tree_base(const struct btree *b, - const struct bset_tree *t) -{ - return b->aux_data + t->aux_data_offset * 8; -} - -static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, - const struct bset_tree *t) -{ - EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); - - return __aux_tree_base(b, t); -} - -static struct bkey_float *bkey_float(const struct btree *b, - const struct bset_tree *t, - unsigned idx) -{ - return ro_aux_tree_base(b, t)->f + idx; -} - -static void __bset_aux_tree_verify(struct btree *b) -{ - for_each_bset(b, t) { - if (t->aux_data_offset == U16_MAX) - continue; - - BUG_ON(t != b->set && - t[-1].aux_data_offset == U16_MAX); - - BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); - BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); - BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); - } -} - -static inline void bset_aux_tree_verify(struct btree *b) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) - __bset_aux_tree_verify(b); -} - -void bch2_btree_keys_init(struct btree *b) -{ - unsigned i; - - b->nsets = 0; - memset(&b->nr, 0, sizeof(b->nr)); - - for (i = 0; i < MAX_BSETS; i++) - b->set[i].data_offset = U16_MAX; - - bch2_bset_set_no_aux_tree(b, b->set); -} - -/* Binary tree stuff for auxiliary search trees */ - -/* - * Cacheline/offset <-> bkey pointer arithmetic: - * - * The ro aux tree is a binary search tree in an array; each node corresponds - * to a key in one cacheline in the set (BSET_CACHELINE bytes). - * - * This means we don't have to store the full index of the key that a node in - * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and - * then bkey_float->key_offset gives us the offset within that cacheline, in - * units of 8 bytes. - * - * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to - * make this work. - * - * To construct the bfloat for an arbitrary key we need to know what the key - * immediately preceding it is: we have to check if the two keys differ in the - * bits we're going to store in bkey_float->mantissa; make_bfloat() finds the - * preceding key from the tree structure itself.
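- * - * As a rough sketch (using the helpers defined just below), the index - * arithmetic works out to: - * - * unsigned inorder = __eytzinger1_to_inorder(j, t->size - 1, t->extra); - * struct bkey_packed *k = cacheline_to_bkey(b, t, inorder, - * bkey_float(b, t, j)->key_offset); - * - * i.e. the eytzinger index j selects a cacheline, and key_offset then locates - * the key within that cacheline in units of 8 bytes - this is exactly what - * tree_to_bkey() computes.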
- */ - -static inline void *bset_cacheline(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline) -{ - return (void *) round_down((unsigned long) btree_bkey_first(b, t), - L1_CACHE_BYTES) + - cacheline * BSET_CACHELINE; -} - -static struct bkey_packed *cacheline_to_bkey(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline, - unsigned offset) -{ - return bset_cacheline(b, t, cacheline) + offset * 8; -} - -static unsigned bkey_to_cacheline(const struct btree *b, - const struct bset_tree *t, - const struct bkey_packed *k) -{ - return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; -} - -static ssize_t __bkey_to_cacheline_offset(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline, - const struct bkey_packed *k) -{ - return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); -} - -static unsigned bkey_to_cacheline_offset(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline, - const struct bkey_packed *k) -{ - size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); - - EBUG_ON(m > U8_MAX); - return m; -} - -static inline struct bkey_packed *tree_to_bkey(const struct btree *b, - const struct bset_tree *t, - unsigned j) -{ - return cacheline_to_bkey(b, t, - __eytzinger1_to_inorder(j, t->size - 1, t->extra), - bkey_float(b, t, j)->key_offset); -} - -static struct rw_aux_tree *rw_aux_tree(const struct btree *b, - const struct bset_tree *t) -{ - EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); - - return __aux_tree_base(b, t); -} - -/* - * For the write set - the one we're currently inserting keys into - we don't - * maintain a full search tree, we just keep a simple flat lookup table - the - * rw aux tree. - */ -static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, - struct bset_tree *t, - unsigned j) -{ - return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); -} - -static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, - unsigned j, struct bkey_packed *k) -{ - EBUG_ON(k >= btree_bkey_last(b, t)); - - rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { - .offset = __btree_node_key_to_offset(b, k), - .k = bkey_unpack_pos(b, k), - }; -} - -static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t) -{ - struct bkey_packed *k = btree_bkey_first(b, t); - unsigned j = 0; - - BUG_ON(bset_has_ro_aux_tree(t)); - - if (!bset_has_rw_aux_tree(t)) - return; - - BUG_ON(t->size < 1); - BUG_ON(rw_aux_to_bkey(b, t, j) != k); - - goto start; - while (1) { - if (rw_aux_to_bkey(b, t, j) == k) { - BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k, - bkey_unpack_pos(b, k))); -start: - if (++j == t->size) - break; - - BUG_ON(rw_aux_tree(b, t)[j].offset <= - rw_aux_tree(b, t)[j - 1].offset); - } - - k = bkey_p_next(k); - BUG_ON(k >= btree_bkey_last(b, t)); - } -} - -static inline void bch2_bset_verify_rw_aux_tree(struct btree *b, - struct bset_tree *t) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) - __bch2_bset_verify_rw_aux_tree(b, t); -} - -/* returns idx of first entry >= offset: */ -static unsigned rw_aux_tree_bsearch(struct btree *b, - struct bset_tree *t, - unsigned offset) -{ - unsigned bset_offs = offset - btree_bkey_first_offset(t); - unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); - unsigned idx = bset_u64s ?
bset_offs * t->size / bset_u64s : 0; - - EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); - EBUG_ON(!t->size); - EBUG_ON(idx > t->size); - - while (idx < t->size && - rw_aux_tree(b, t)[idx].offset < offset) - idx++; - - while (idx && - rw_aux_tree(b, t)[idx - 1].offset >= offset) - idx--; - - EBUG_ON(idx < t->size && - rw_aux_tree(b, t)[idx].offset < offset); - EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); - EBUG_ON(idx + 1 < t->size && - rw_aux_tree(b, t)[idx].offset == - rw_aux_tree(b, t)[idx + 1].offset); - - return idx; -} - -static inline unsigned bkey_mantissa(const struct bkey_packed *k, - const struct bkey_float *f) -{ - u64 v; - - EBUG_ON(!bkey_packed(k)); - - v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); - - /* - * In little endian, we're shifting off low bits (and then the bits we - * want are at the low end), in big endian we're shifting off high bits - * (and then the bits we want are at the high end, so we shift them - * back down): - */ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - v >>= f->exponent & 7; -#else - v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; -#endif - return (u16) v; -} - -static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) -{ - struct bkey_float *f = bkey_float(b, t, j); - struct bkey_packed *m = tree_to_bkey(b, t, j); - struct bkey_packed *l = is_power_of_2(j) - ? min_key - : tree_to_bkey(b, t, j >> ffs(j)); - struct bkey_packed *r = is_power_of_2(j + 1) - ? max_key - : tree_to_bkey(b, t, j >> (ffz(j) + 1)); - unsigned mantissa; - int shift, exponent, high_bit; - - /* - * for failed bfloats, the lookup code falls back to comparing against - * the original key. - */ - - if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || - !b->nr_key_bits) { - f->exponent = BFLOAT_FAILED_UNPACKED; - return; - } - - /* - * The greatest differing bit of l and r is the first bit we must - * include in the bfloat mantissa we're creating in order to do - * comparisons - that bit always becomes the high bit of - * bfloat->mantissa, and thus the exponent we're calculating here is - * the position of what will become the low bit in bfloat->mantissa: - * - * Note that this may be negative - we may be running off the low end - * of the key: we handle this later: - */ - high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), - min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); - exponent = high_bit - (BKEY_MANTISSA_BITS - 1); - - /* - * Then we calculate the actual shift value, from the start of the key - * (k->_data), to get the key bits starting at exponent: - */ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; - - EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); -#else - shift = high_bit_offset + - b->nr_key_bits - - exponent - - BKEY_MANTISSA_BITS; - - EBUG_ON(shift < KEY_PACKED_BITS_START); -#endif - EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); - - f->exponent = shift; - mantissa = bkey_mantissa(m, f); - - /* - * If we've got garbage bits, set them to all 1s - it's legal for the - * bfloat to compare larger than the original key, but not smaller: - */ - if (exponent < 0) - mantissa |= ~(~0U << -exponent); - - f->mantissa = mantissa; -} - -/* bytes remaining - only valid for last bset: */ -static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t) -{ - bset_aux_tree_verify(b); - - return btree_aux_data_bytes(b) 
- t->aux_data_offset * sizeof(u64); -} - -static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t) -{ - return __bset_tree_capacity(b, t) / sizeof(struct bkey_float); -} - -static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t) -{ - return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); -} - -static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) -{ - struct bkey_packed *k; - - t->size = 1; - t->extra = BSET_RW_AUX_TREE_VAL; - rw_aux_tree(b, t)[0].offset = - __btree_node_key_to_offset(b, btree_bkey_first(b, t)); - - bset_tree_for_each_key(b, t, k) { - if (t->size == bset_rw_tree_capacity(b, t)) - break; - - if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > - L1_CACHE_BYTES) - rw_aux_tree_set(b, t, t->size++, k); - } -} - -static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) -{ - struct bkey_packed *k = btree_bkey_first(b, t); - struct bkey_i min_key, max_key; - unsigned cacheline = 1; - - t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), - bset_ro_tree_capacity(b, t)); -retry: - if (t->size < 2) { - t->size = 0; - t->extra = BSET_NO_AUX_TREE_VAL; - return; - } - - t->extra = eytzinger1_extra(t->size - 1); - - /* First we figure out where the first key in each cacheline is */ - eytzinger1_for_each(j, t->size - 1) { - while (bkey_to_cacheline(b, t, k) < cacheline) - k = bkey_p_next(k); - - if (k >= btree_bkey_last(b, t)) { - /* XXX: this path sucks */ - t->size--; - goto retry; - } - - bkey_float(b, t, j)->key_offset = - bkey_to_cacheline_offset(b, t, cacheline++, k); - - EBUG_ON(tree_to_bkey(b, t, j) != k); - } - - if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { - bkey_init(&min_key.k); - min_key.k.p = b->data->min_key; - } - - if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { - bkey_init(&max_key.k); - max_key.k.p = b->data->max_key; - } - - /* Then we build the tree */ - eytzinger1_for_each(j, t->size - 1) - make_bfloat(b, t, j, - bkey_to_packed(&min_key), - bkey_to_packed(&max_key)); -} - -static void bset_alloc_tree(struct btree *b, struct bset_tree *t) -{ - struct bset_tree *i; - - for (i = b->set; i != t; i++) - BUG_ON(bset_has_rw_aux_tree(i)); - - bch2_bset_set_no_aux_tree(b, t); - - /* round up to next cacheline: */ - t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), - SMP_CACHE_BYTES / sizeof(u64)); - - bset_aux_tree_verify(b); -} - -void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, - bool writeable) -{ - if (writeable - ? 
bset_has_rw_aux_tree(t) - : bset_has_ro_aux_tree(t)) - return; - - bset_alloc_tree(b, t); - - if (!__bset_tree_capacity(b, t)) - return; - - if (writeable) - __build_rw_aux_tree(b, t); - else - __build_ro_aux_tree(b, t); - - bset_aux_tree_verify(b); -} - -void bch2_bset_init_first(struct btree *b, struct bset *i) -{ - struct bset_tree *t; - - BUG_ON(b->nsets); - - memset(i, 0, sizeof(*i)); - get_random_bytes(&i->seq, sizeof(i->seq)); - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - - t = &b->set[b->nsets++]; - set_btree_bset(b, t, i); -} - -void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne) -{ - struct bset *i = &bne->keys; - struct bset_tree *t; - - BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b)); - BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); - BUG_ON(b->nsets >= MAX_BSETS); - - memset(i, 0, sizeof(*i)); - i->seq = btree_bset_first(b)->seq; - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - - t = &b->set[b->nsets++]; - set_btree_bset(b, t, i); -} - -/* - * find _some_ key in the same bset as @k that precedes @k - not necessarily the - * immediate predecessor: - */ -static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) -{ - struct bkey_packed *p; - unsigned offset; - int j; - - EBUG_ON(k < btree_bkey_first(b, t) || - k > btree_bkey_last(b, t)); - - if (k == btree_bkey_first(b, t)) - return NULL; - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - p = btree_bkey_first(b, t); - break; - case BSET_RO_AUX_TREE: - j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); - - do { - p = j ? tree_to_bkey(b, t, - __inorder_to_eytzinger1(j--, - t->size - 1, t->extra)) - : btree_bkey_first(b, t); - } while (p >= k); - break; - case BSET_RW_AUX_TREE: - offset = __btree_node_key_to_offset(b, k); - j = rw_aux_tree_bsearch(b, t, offset); - p = j ? rw_aux_to_bkey(b, t, j - 1) - : btree_bkey_first(b, t); - break; - } - - return p; -} - -struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, - struct bset_tree *t, - struct bkey_packed *k, - unsigned min_key_type) -{ - struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; - - while ((p = __bkey_prev(b, t, k)) && !ret) { - for (i = p; i != k; i = bkey_p_next(i)) - if (i->type >= min_key_type) - ret = i; - - k = p; - } - - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { - BUG_ON(ret >= orig_k); - - for (i = ret - ? bkey_p_next(ret) - : btree_bkey_first(b, t); - i != orig_k; - i = bkey_p_next(i)) - BUG_ON(i->type >= min_key_type); - } - - return ret; -} - -/* Insert */ - -static void rw_aux_tree_insert_entry(struct btree *b, - struct bset_tree *t, - unsigned idx) -{ - EBUG_ON(!idx || idx > t->size); - struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1); - struct bkey_packed *end = idx < t->size - ? 
rw_aux_to_bkey(b, t, idx) - : btree_bkey_last(b, t); - - if (t->size < bset_rw_tree_capacity(b, t) && - (void *) end - (void *) start > L1_CACHE_BYTES) { - struct bkey_packed *k = start; - - while (1) { - k = bkey_p_next(k); - if (k == end) - break; - - if ((void *) k - (void *) start >= L1_CACHE_BYTES) { - memmove(&rw_aux_tree(b, t)[idx + 1], - &rw_aux_tree(b, t)[idx], - (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[idx]); - t->size++; - rw_aux_tree_set(b, t, idx, k); - break; - } - } - } -} - -static void bch2_bset_fix_lookup_table(struct btree *b, - struct bset_tree *t, - struct bkey_packed *_where, - unsigned clobber_u64s, - unsigned new_u64s) -{ - int shift = new_u64s - clobber_u64s; - unsigned idx, j, where = __btree_node_key_to_offset(b, _where); - - EBUG_ON(bset_has_ro_aux_tree(t)); - - if (!bset_has_rw_aux_tree(t)) - return; - - if (where > rw_aux_tree(b, t)[t->size - 1].offset) { - rw_aux_tree_insert_entry(b, t, t->size); - goto verify; - } - - /* returns first entry >= where */ - idx = rw_aux_tree_bsearch(b, t, where); - - if (rw_aux_tree(b, t)[idx].offset == where) { - if (!idx) { /* never delete first entry */ - idx++; - } else if (where < t->end_offset) { - rw_aux_tree_set(b, t, idx++, _where); - } else { - EBUG_ON(where != t->end_offset); - rw_aux_tree_insert_entry(b, t, --t->size); - goto verify; - } - } - - EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where); - if (idx < t->size && - rw_aux_tree(b, t)[idx].offset + shift == - rw_aux_tree(b, t)[idx - 1].offset) { - memmove(&rw_aux_tree(b, t)[idx], - &rw_aux_tree(b, t)[idx + 1], - (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[idx + 1]); - t->size -= 1; - } - - for (j = idx; j < t->size; j++) - rw_aux_tree(b, t)[j].offset += shift; - - EBUG_ON(idx < t->size && - rw_aux_tree(b, t)[idx].offset == - rw_aux_tree(b, t)[idx - 1].offset); - - rw_aux_tree_insert_entry(b, t, idx); - -verify: - bch2_bset_verify_rw_aux_tree(b, t); - bset_aux_tree_verify(b); -} - -void bch2_bset_insert(struct btree *b, - struct bkey_packed *where, - struct bkey_i *insert, - unsigned clobber_u64s) -{ - struct bkey_format *f = &b->format; - struct bset_tree *t = bset_tree_last(b); - struct bkey_packed packed, *src = bkey_to_packed(insert); - - bch2_bset_verify_rw_aux_tree(b, t); - bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); - - if (bch2_bkey_pack_key(&packed, &insert->k, f)) - src = &packed; - - if (!bkey_deleted(&insert->k)) - btree_keys_account_key_add(&b->nr, t - b->set, src); - - if (src->u64s != clobber_u64s) { - u64 *src_p = (u64 *) where->_data + clobber_u64s; - u64 *dst_p = (u64 *) where->_data + src->u64s; - - EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < - (int) clobber_u64s - src->u64s); - - memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); - le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); - set_btree_bset_end(b, t); - } - - memcpy_u64s_small(where, src, - bkeyp_key_u64s(f, src)); - memcpy_u64s(bkeyp_val(f, where), &insert->v, - bkeyp_val_u64s(f, src)); - - if (src->u64s != clobber_u64s) - bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); - - bch2_verify_btree_nr_keys(b); -} - -void bch2_bset_delete(struct btree *b, - struct bkey_packed *where, - unsigned clobber_u64s) -{ - struct bset_tree *t = bset_tree_last(b); - u64 *src_p = (u64 *) where->_data + clobber_u64s; - u64 *dst_p = where->_data; - - bch2_bset_verify_rw_aux_tree(b, t); - - EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); - - 
memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); - le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); - set_btree_bset_end(b, t); - - bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); -} - -/* Lookup */ - -__flatten -static struct bkey_packed *bset_search_write_set(const struct btree *b, - struct bset_tree *t, - struct bpos *search) -{ - unsigned l = 0, r = t->size; - - while (l + 1 != r) { - unsigned m = (l + r) >> 1; - - if (bpos_lt(rw_aux_tree(b, t)[m].k, *search)) - l = m; - else - r = m; - } - - return rw_aux_to_bkey(b, t, l); -} - -static inline void prefetch_four_cachelines(void *p) -{ -#ifdef CONFIG_X86_64 - asm("prefetcht0 (-127 + 64 * 0)(%0);" - "prefetcht0 (-127 + 64 * 1)(%0);" - "prefetcht0 (-127 + 64 * 2)(%0);" - "prefetcht0 (-127 + 64 * 3)(%0);" - : - : "r" (p + 127)); -#else - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - prefetch(p + L1_CACHE_BYTES * 3); -#endif -} - -static inline bool bkey_mantissa_bits_dropped(const struct btree *b, - const struct bkey_float *f) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; - - return f->exponent > key_bits_start; -#else - unsigned key_bits_end = high_bit_offset + b->nr_key_bits; - - return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; -#endif -} - -__flatten -static struct bkey_packed *bset_search_tree(const struct btree *b, - const struct bset_tree *t, - const struct bpos *search, - const struct bkey_packed *packed_search) -{ - struct ro_aux_tree *base = ro_aux_tree_base(b, t); - struct bkey_float *f; - struct bkey_packed *k; - unsigned inorder, n = 1, l, r; - int cmp; - - do { - if (likely(n << 4 < t->size)) - prefetch(&base->f[n << 4]); - - f = &base->f[n]; - if (unlikely(f->exponent >= BFLOAT_FAILED)) - goto slowpath; - - l = f->mantissa; - r = bkey_mantissa(packed_search, f); - - if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f)) - goto slowpath; - - n = n * 2 + (l < r); - continue; -slowpath: - k = tree_to_bkey(b, t, n); - cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); - if (!cmp) - return k; - - n = n * 2 + (cmp < 0); - } while (n < t->size); - - inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); - - /* - * n would have been the node we recursed to - the low bit tells us if - * we recursed left or recursed right. - */ - if (likely(!(n & 1))) { - --inorder; - if (unlikely(!inorder)) - return btree_bkey_first(b, t); - - f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; - } - - return cacheline_to_bkey(b, t, inorder, f->key_offset); -} - -static __always_inline __flatten -struct bkey_packed *__bch2_bset_search(struct btree *b, - struct bset_tree *t, - struct bpos *search, - const struct bkey_packed *lossy_packed_search) -{ - - /* - * First we search for a cacheline, then we do a linear search - * within that cacheline. - * - * To search for the cacheline, there are three different possibilities: - * * The set is too small to have a search tree, so we just do a linear - * search over the whole set. - * * The set is the one we're currently inserting into; keeping a full - * auxiliary search tree up to date would be too expensive, so we - * use a much simpler lookup table to do a binary search - - * bset_search_write_set().
- * * Or we use the auxiliary search tree we constructed earlier - - * bset_search_tree() - */ - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - return btree_bkey_first(b, t); - case BSET_RW_AUX_TREE: - return bset_search_write_set(b, t, search); - case BSET_RO_AUX_TREE: - return bset_search_tree(b, t, search, lossy_packed_search); - default: - BUG(); - } -} - -static __always_inline __flatten -struct bkey_packed *bch2_bset_search_linear(struct btree *b, - struct bset_tree *t, - struct bpos *search, - struct bkey_packed *packed_search, - const struct bkey_packed *lossy_packed_search, - struct bkey_packed *m) -{ - if (lossy_packed_search) - while (m != btree_bkey_last(b, t) && - bkey_iter_cmp_p_or_unp(b, m, - lossy_packed_search, search) < 0) - m = bkey_p_next(m); - - if (!packed_search) - while (m != btree_bkey_last(b, t) && - bkey_iter_pos_cmp(b, m, search) < 0) - m = bkey_p_next(m); - - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { - struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); - - BUG_ON(prev && - bkey_iter_cmp_p_or_unp(b, prev, - packed_search, search) >= 0); - } - - return m; -} - -/* Btree node iterator */ - -static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) -{ - if (k != end) { - struct btree_node_iter_set *pos; - - btree_node_iter_for_each(iter, pos) - ; - - BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); - *pos = (struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k), - __btree_node_key_to_offset(b, end) - }; - } -} - -void bch2_btree_node_iter_push(struct btree_node_iter *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) -{ - __bch2_btree_node_iter_push(iter, b, k, end); - bch2_btree_node_iter_sort(iter, b); -} - -noinline __flatten __cold -static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, - struct btree *b, struct bpos *search) -{ - struct bkey_packed *k; - - trace_bkey_pack_pos_fail(search); - - bch2_btree_node_iter_init_from_start(iter, b); - - while ((k = bch2_btree_node_iter_peek(iter, b)) && - bkey_iter_pos_cmp(b, k, search) < 0) - bch2_btree_node_iter_advance(iter, b); -} - -/** - * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a - * given position - * - * @iter: iterator to initialize - * @b: btree node to search - * @search: search key - * - * Main entry point to the lookup code for individual btree nodes: - * - * NOTE: - * - * When you don't filter out deleted keys, btree nodes _do_ contain duplicate - * keys. This doesn't matter for most code, but it does matter for lookups. - * - * Some adjacent keys with a string of equal keys: - * i j k k k k l m - * - * If you search for k, the lookup code isn't guaranteed to return you any - * specific k. The lookup code is conceptually doing a binary search and - * iterating backwards is very expensive so if the pivot happens to land at the - * last k that's what you'll get. - * - * This works out ok, but it's something to be aware of: - * - * - For non extents, we guarantee that the live key comes last - see - * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't - * see will only be deleted keys you don't care about. - * - * - For extents, deleted keys sort last (see the comment at the top of this - * file). 
But when you're searching for extents, you actually want the first - * key strictly greater than your search key - an extent that compares equal - * to the search key is going to have 0 sectors after the search key. - * - * But this does mean that we can't just search for - * bpos_successor(start_of_range) to get the first extent that overlaps with - * the range we want - if we're unlucky and there's an extent that ends - * exactly where we searched, then there could be a deleted key at the same - * position and we'd get that when we search instead of the preceding extent - * we needed. - * - * So we've got to search for start_of_range, then after the lookup iterate - * past any extents that compare equal to the position we searched for. - */ -__flatten -void bch2_btree_node_iter_init(struct btree_node_iter *iter, - struct btree *b, struct bpos *search) -{ - struct bkey_packed p, *packed_search = NULL; - struct btree_node_iter_set *pos = iter->data; - struct bkey_packed *k[MAX_BSETS]; - unsigned i; - - EBUG_ON(bpos_lt(*search, b->data->min_key)); - EBUG_ON(bpos_gt(*search, b->data->max_key)); - bset_aux_tree_verify(b); - - memset(iter, 0, sizeof(*iter)); - - switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { - case BKEY_PACK_POS_EXACT: - packed_search = &p; - break; - case BKEY_PACK_POS_SMALLER: - packed_search = NULL; - break; - case BKEY_PACK_POS_FAIL: - btree_node_iter_init_pack_failed(iter, b, search); - return; - } - - for (i = 0; i < b->nsets; i++) { - k[i] = __bch2_bset_search(b, b->set + i, search, &p); - prefetch_four_cachelines(k[i]); - } - - for (i = 0; i < b->nsets; i++) { - struct bset_tree *t = b->set + i; - struct bkey_packed *end = btree_bkey_last(b, t); - - k[i] = bch2_bset_search_linear(b, t, search, - packed_search, &p, k[i]); - if (k[i] != end) - *pos++ = (struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k[i]), - __btree_node_key_to_offset(b, end) - }; - } - - bch2_btree_node_iter_sort(iter, b); -} - -void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, - struct btree *b) -{ - memset(iter, 0, sizeof(*iter)); - - for_each_bset(b, t) - __bch2_btree_node_iter_push(iter, b, - btree_bkey_first(b, t), - btree_bkey_last(b, t)); - bch2_btree_node_iter_sort(iter, b); -} - -struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, - struct btree *b, - struct bset_tree *t) -{ - struct btree_node_iter_set *set; - - btree_node_iter_for_each(iter, set) - if (set->end == t->end_offset) - return __btree_node_offset_to_key(b, set->k); - - return btree_bkey_last(b, t); -} - -static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, - struct btree *b, - unsigned first) -{ - bool ret; - - if ((ret = (btree_node_iter_cmp(b, - iter->data[first], - iter->data[first + 1]) > 0))) - swap(iter->data[first], iter->data[first + 1]); - return ret; -} - -void bch2_btree_node_iter_sort(struct btree_node_iter *iter, - struct btree *b) -{ - /* unrolled bubble sort: */ - - if (!__btree_node_iter_set_end(iter, 2)) { - btree_node_iter_sort_two(iter, b, 0); - btree_node_iter_sort_two(iter, b, 1); - } - - if (!__btree_node_iter_set_end(iter, 1)) - btree_node_iter_sort_two(iter, b, 0); -} - -void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, - struct btree_node_iter_set *set) -{ - struct btree_node_iter_set *last = - iter->data + ARRAY_SIZE(iter->data) - 1; - - memmove(&set[0], &set[1], (void *) last - (void *) set); - *last = (struct btree_node_iter_set) { 0, 0 }; -} - -static inline void 
__bch2_btree_node_iter_advance(struct btree_node_iter *iter, - struct btree *b) -{ - iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; - - EBUG_ON(iter->data->k > iter->data->end); - - if (unlikely(__btree_node_iter_set_end(iter, 0))) { - /* avoid an expensive memmove call: */ - iter->data[0] = iter->data[1]; - iter->data[1] = iter->data[2]; - iter->data[2] = (struct btree_node_iter_set) { 0, 0 }; - return; - } - - if (__btree_node_iter_set_end(iter, 1)) - return; - - if (!btree_node_iter_sort_two(iter, b, 0)) - return; - - if (__btree_node_iter_set_end(iter, 2)) - return; - - btree_node_iter_sort_two(iter, b, 1); -} - -void bch2_btree_node_iter_advance(struct btree_node_iter *iter, - struct btree *b) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { - __bch2_btree_node_iter_verify(iter, b); - __bch2_btree_node_iter_next_check(iter, b); - } - - __bch2_btree_node_iter_advance(iter, b); -} - -/* - * Expensive: - */ -struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, - struct btree *b) -{ - struct bkey_packed *k, *prev = NULL; - struct btree_node_iter_set *set; - unsigned end = 0; - - bch2_btree_node_iter_verify(iter, b); - - for_each_bset(b, t) { - k = bch2_bkey_prev_all(b, t, - bch2_btree_node_iter_bset_pos(iter, b, t)); - if (k && - (!prev || bkey_iter_cmp(b, k, prev) > 0)) { - prev = k; - end = t->end_offset; - } - } - - if (!prev) - return NULL; - - /* - * We're manually memmoving instead of just calling sort() to ensure the - * prev we picked ends up in slot 0 - sort won't necessarily put it - * there because of duplicate deleted keys: - */ - btree_node_iter_for_each(iter, set) - if (set->end == end) - goto found; - - BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); -found: - BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); - - memmove(&iter->data[1], - &iter->data[0], - (void *) set - (void *) &iter->data[0]); - - iter->data[0].k = __btree_node_key_to_offset(b, prev); - iter->data[0].end = end; - - bch2_btree_node_iter_verify(iter, b); - return prev; -} - -struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, - struct btree *b) -{ - struct bkey_packed *prev; - - do { - prev = bch2_btree_node_iter_prev_all(iter, b); - } while (prev && bkey_deleted(prev)); - - return prev; -} - -struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, - struct btree *b, - struct bkey *u) -{ - struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); - - return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null; -} - -/* Mergesort */ - -void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats) -{ - for_each_bset_c(b, t) { - enum bset_aux_tree_type type = bset_aux_tree_type(t); - size_t j; - - stats->sets[type].nr++; - stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * - sizeof(u64); - - if (bset_has_ro_aux_tree(t)) { - stats->floats += t->size - 1; - - for (j = 1; j < t->size; j++) - stats->failed += - bkey_float(b, t, j)->exponent == - BFLOAT_FAILED; - } - } -} - -void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, - struct bkey_packed *k) -{ - struct bset_tree *t = bch2_bkey_to_bset(b, k); - struct bkey uk; - unsigned j, inorder; - - if (!bset_has_ro_aux_tree(t)) - return; - - inorder = bkey_to_cacheline(b, t, k); - if (!inorder || inorder >= t->size) - return; - - j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); - if (k != tree_to_bkey(b, t, j)) - return; - - switch (bkey_float(b, t, j)->exponent) { - case BFLOAT_FAILED: - uk = bkey_unpack_key(b, k); - prt_printf(out, - " failed unpacked at depth %u\n" - "\t", - ilog2(j)); - bch2_bpos_to_text(out, uk.p); - prt_printf(out, "\n"); - break; - } -} diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h deleted file mode 100644 index a15ecf9d006e..000000000000 --- a/fs/bcachefs/bset.h +++ /dev/null @@ -1,536 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BSET_H -#define _BCACHEFS_BSET_H - -#include <linux/kernel.h> -#include <linux/types.h> - -#include "bcachefs.h" -#include "bkey.h" -#include "bkey_methods.h" -#include "btree_types.h" -#include "util.h" /* for time_stats */ -#include "vstructs.h" - -/* - * BKEYS: - * - * A bkey contains a key, a size field, a variable number of pointers, and some - * ancillary flag bits. - * - * We use two different functions for validating bkeys, bkey_invalid() and - * bkey_deleted(). - * - * The one exception to the rule that bkey_invalid() filters out invalid keys is - * that it also filters out keys of size 0 - these are keys that have been - * completely overwritten. It'd be safe to delete these in memory while leaving - * them on disk, just unnecessary work - so we filter them out when resorting - * instead. - * - * We can't filter out stale keys when we're resorting, because garbage - * collection needs to find them to ensure bucket gens don't wrap around - - * unless we're rewriting the btree node those stale keys still exist on disk. - * - * We also implement functions here for removing some number of sectors from the - * front or the back of a bkey - this is mainly used for fixing overlapping - * extents, by removing the overlapping sectors from the older key. - * - * BSETS: - * - * A bset is an array of bkeys laid out contiguously in memory in sorted order, - * along with a header. A btree node is made up of a number of these, written at - * different times. - * - * There could be many of them on disk, but we never allow there to be more than - * 4 in memory - we lazily resort as needed. - * - * We implement code here for creating and maintaining auxiliary search trees - * (described below) for searching an individual bset, and on top of that we - * implement a btree iterator. - * - * BTREE ITERATOR: - * - * Most of the code in bcache doesn't care about an individual bset - it needs - * to search entire btree nodes and iterate over them in sorted order.
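- * - * For example, a whole-node traversal using the iterator helpers declared - * below looks roughly like this (it's what the for_each_btree_node_key() - * macro expands to): - * - * struct btree_node_iter iter; - * struct bkey_packed *k; - * - * for (bch2_btree_node_iter_init_from_start(&iter, b); - * (k = bch2_btree_node_iter_peek(&iter, b)); - * bch2_btree_node_iter_advance(&iter, b)) - * ; - * - * where each k is the next live (non deleted) key, in sorted order.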
- * - * The btree iterator code serves both functions; it iterates through the keys - * in a btree node in sorted order, starting from either keys after a specific - * point (if you pass it a search key) or the start of the btree node. - * - * AUXILIARY SEARCH TREES: - * - * Since keys are variable length, we can't use a binary search on a bset - we - * wouldn't be able to find the start of the next key. But binary searches are - * slow anyway, due to terrible cache behaviour; bcache originally used binary - * searches and that code topped out at under 50k lookups/second. - * - * So we need to construct some sort of lookup table. Since we only insert keys - * into the last (unwritten) set, most of the keys within a given btree node are - * usually in sets that are mostly constant. We use two different types of - * lookup tables to take advantage of this. - * - * Both lookup tables share in common that they don't index every key in the - * set; they index one key every BSET_CACHELINE bytes, and then a linear search - * is used for the rest. - * - * For sets that have been written to disk and are no longer being inserted - * into, we construct a binary search tree in an array - traversing a binary - * search tree in an array gives excellent locality of reference and is very - * fast, since both children of any node are adjacent to each other in memory - * (and their grandchildren, and great grandchildren...) - this means - * prefetching can be used to great effect. - * - * It's quite useful, performance-wise, to keep these nodes small - not just - * because they're more likely to be in L2, but also because we can prefetch - * more nodes on a single cacheline and thus prefetch more iterations in advance - * when traversing this tree. - * - * Nodes in the auxiliary search tree must contain both a key to compare against - * (we don't want to fetch the key from the set, that would defeat the purpose), - * and a pointer to the key. We use a few tricks to compress both of these. - * - * To compress the pointer, we take advantage of the fact that one node in the - * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have - * a function (eytzinger1_to_inorder()) that takes the index of a node in a - * binary tree and returns what its index would be in an inorder traversal, so - * we only have to store the low bits of the offset. - * - * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To - * compress that, we take advantage of the fact that when we're traversing the - * search tree at every iteration we know that both our search key and the key - * we're looking for lie within some range - bounded by our previous - * comparisons. (We special case the start of a search so that this is true even - * at the root of the tree). - * - * So if we know the key we're looking for is between a and b, and a and b - * don't differ above bit 50, we don't need to check anything higher than bit - * 50. - * - * We don't usually need the rest of the bits, either; we only need enough bits - * to partition the key range we're currently checking. Consider key n - the - * key our auxiliary search tree node corresponds to, and key p, the key - * immediately preceding n. The lowest bit we need to store in the auxiliary - * search tree is the highest bit that differs between n and p. - * - * Note that this could be bit 0 - we might sometimes need all 84 bits to do the - * comparison. But we'd really like our nodes in the auxiliary search tree to be - * of fixed size.
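- * - * (For reference, the fast path of a lookup then boils down to one 16 bit - * compare per level - roughly, as in bset_search_tree() in bset.c: - * - * l = f->mantissa; - * r = bkey_mantissa(packed_search, f); - * n = n * 2 + (l < r); - * - * which only works because every node stores the same, fixed number of - * bits.)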
- * - * The solution is to make them fixed size, and when we're constructing a node - * check if p and n differed in the bits we needed them to. If they didn't, we - * flag that node, and when doing lookups we fall back to comparing against the - * real key. As long as this doesn't happen too often (and it seems to reliably - * happen a bit less than 1% of the time), we win - even on failures, that key - * is then more likely to be in cache than if we were doing binary searches all - * the way, since we're touching so much less memory. - * - * The keys in the auxiliary search tree are stored in (software) floating - * point, with an exponent and a mantissa. The exponent needs to be big enough - * to address all the bits in the original key, but the number of bits in the - * mantissa is somewhat arbitrary; more bits just gets us fewer failures. - * - * Here (see struct bkey_float in bset.c) a node has an 8 bit exponent, an 8 - * bit key offset (keys are 8 byte aligned, so the offset is in units of 8 - * bytes) and a 16 bit mantissa, so a node is 4 bytes. We need one node per - * BSET_CACHELINE (256) bytes in the btree node, which means the auxiliary - * search trees take up about 1.5% as much memory as the btree itself. - * - * Constructing these auxiliary search trees is moderately expensive, and we - * don't want to be constantly rebuilding the search tree for the last set - * whenever we insert another key into it. For the unwritten set, we use a much - * simpler lookup table - it's just a flat array, so index i in the lookup table - * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing - * within each byte range works the same as with the auxiliary search trees. - * - * These are much easier to keep up to date when we insert a key - we do it - * somewhat lazily; when we shift a key up we usually just increment the pointer - * to it, only when it would overflow do we go to the trouble of finding the - * first key in that range of bytes again. - */ - -enum bset_aux_tree_type { - BSET_NO_AUX_TREE, - BSET_RO_AUX_TREE, - BSET_RW_AUX_TREE, -}; - -#define BSET_TREE_NR_TYPES 3 - -#define BSET_NO_AUX_TREE_VAL (U16_MAX) -#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) - -static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) -{ - switch (t->extra) { - case BSET_NO_AUX_TREE_VAL: - EBUG_ON(t->size); - return BSET_NO_AUX_TREE; - case BSET_RW_AUX_TREE_VAL: - EBUG_ON(!t->size); - return BSET_RW_AUX_TREE; - default: - EBUG_ON(!t->size); - return BSET_RO_AUX_TREE; - } -} - -/* - * BSET_CACHELINE was originally intended to match the hardware cacheline size - - * it used to be 64, but the lookup code touches slightly less memory if it's - * larger - it's now 256. - * - * It defines the number of bytes (in struct bset) per struct bkey_float in - * the auxiliary search tree - when we're done searching the bset_float tree we - * have this many bytes left that we do a linear search over. - * - * Since (after level 5) every level of the bset_tree is on a new cacheline, - * we're touching one fewer cacheline in the bset tree in exchange for one more - * cacheline in the linear search - but the linear search might stop before it - * gets to the second cacheline.
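- * - * The linear search over those remaining bytes is a plain forward scan, - * roughly as in bch2_bset_search_linear() in bset.c: - * - * while (m != btree_bkey_last(b, t) && - * bkey_iter_cmp_p_or_unp(b, m, lossy_packed_search, search) < 0) - * m = bkey_p_next(m);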
- */ - -#define BSET_CACHELINE 256 - -static inline size_t btree_keys_cachelines(const struct btree *b) -{ - return (1U << b->byte_order) / BSET_CACHELINE; -} - -static inline size_t btree_aux_data_bytes(const struct btree *b) -{ - return btree_keys_cachelines(b) * 8; -} - -static inline size_t btree_aux_data_u64s(const struct btree *b) -{ - return btree_aux_data_bytes(b) / sizeof(u64); -} - -#define for_each_bset(_b, _t) \ - for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) - -#define for_each_bset_c(_b, _t) \ - for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) - -#define bset_tree_for_each_key(_b, _t, _k) \ - for (_k = btree_bkey_first(_b, _t); \ - _k != btree_bkey_last(_b, _t); \ - _k = bkey_p_next(_k)) - -static inline bool bset_has_ro_aux_tree(const struct bset_tree *t) -{ - return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; -} - -static inline bool bset_has_rw_aux_tree(struct bset_tree *t) -{ - return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; -} - -static inline void bch2_bset_set_no_aux_tree(struct btree *b, - struct bset_tree *t) -{ - BUG_ON(t < b->set); - - for (; t < b->set + ARRAY_SIZE(b->set); t++) { - t->size = 0; - t->extra = BSET_NO_AUX_TREE_VAL; - t->aux_data_offset = U16_MAX; - } -} - -static inline void btree_node_set_format(struct btree *b, - struct bkey_format f) -{ - int len; - - b->format = f; - b->nr_key_bits = bkey_format_key_bits(&f); - - len = bch2_compile_bkey_format(&b->format, b->aux_data); - BUG_ON(len < 0 || len > U8_MAX); - - b->unpack_fn_len = len; - - bch2_bset_set_no_aux_tree(b, b->set); -} - -static inline struct bset *bset_next_set(struct btree *b, - unsigned block_bytes) -{ - struct bset *i = btree_bset_last(b); - - EBUG_ON(!is_power_of_2(block_bytes)); - - return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); -} - -void bch2_btree_keys_init(struct btree *); - -void bch2_bset_init_first(struct btree *, struct bset *); -void bch2_bset_init_next(struct btree *, struct btree_node_entry *); -void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); - -void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *, - unsigned); -void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); - -/* Bkey utility code */ - -/* packed or unpacked */ -static inline int bkey_cmp_p_or_unp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r_packed, - const struct bpos *r) -{ - EBUG_ON(r_packed && !bkey_packed(r_packed)); - - if (unlikely(!bkey_packed(l))) - return bpos_cmp(packed_to_bkey_c(l)->p, *r); - - if (likely(r_packed)) - return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); - - return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -} - -static inline struct bset_tree * -bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) -{ - unsigned offset = __btree_node_key_to_offset(b, k); - - for_each_bset(b, t) - if (offset <= t->end_offset) { - EBUG_ON(offset < btree_bkey_first_offset(t)); - return t; - } - - BUG(); -} - -struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); - -struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, - struct bkey_packed *, unsigned); - -static inline struct bkey_packed * -bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) -{ - return bch2_bkey_prev_filter(b, t, k, 0); -} - -static inline struct bkey_packed * -bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) -{ - return 
bch2_bkey_prev_filter(b, t, k, 1); -} - -/* Btree key iteration */ - -void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); -void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, - struct bpos *); -void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, - struct btree *); -struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, - struct btree *, - struct bset_tree *); - -void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); -void bch2_btree_node_iter_set_drop(struct btree_node_iter *, - struct btree_node_iter_set *); -void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); - -#define btree_node_iter_for_each(_iter, _set) \ - for (_set = (_iter)->data; \ - _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ - (_set)->k != (_set)->end; \ - _set++) - -static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, - unsigned i) -{ - return iter->data[i].k == iter->data[i].end; -} - -static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) -{ - return __btree_node_iter_set_end(iter, 0); -} - -/* - * When keys compare equal, deleted keys compare first: - * - * XXX: only need to compare pointers for keys that are both within a - * btree_node_iterator - we need to break ties for prev() to work correctly - */ -static inline int bkey_iter_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed(b, l, r) - ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) - ?: cmp_int(l, r); -} - -static inline int btree_node_iter_cmp(const struct btree *b, - struct btree_node_iter_set l, - struct btree_node_iter_set r) -{ - return bkey_iter_cmp(b, - __btree_node_offset_to_key(b, l.k), - __btree_node_offset_to_key(b, r.k)); -} - -/* These assume r (the search key) is not a deleted key: */ -static inline int bkey_iter_pos_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bpos *r) -{ - return bkey_cmp_left_packed(b, l, r) - ?: -((int) bkey_deleted(l)); -} - -static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r_packed, - const struct bpos *r) -{ - return bkey_cmp_p_or_unp(b, l, r_packed, r) - ?: -((int) bkey_deleted(l)); -} - -static inline struct bkey_packed * -__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, - struct btree *b) -{ - return __btree_node_offset_to_key(b, iter->data->k); -} - -static inline struct bkey_packed * -bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b) -{ - return !bch2_btree_node_iter_end(iter) - ? 
__btree_node_offset_to_key(b, iter->data->k) - : NULL; -} - -static inline struct bkey_packed * -bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) -{ - struct bkey_packed *k; - - while ((k = bch2_btree_node_iter_peek_all(iter, b)) && - bkey_deleted(k)) - bch2_btree_node_iter_advance(iter, b); - - return k; -} - -static inline struct bkey_packed * -bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) -{ - struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); - - if (ret) - bch2_btree_node_iter_advance(iter, b); - - return ret; -} - -struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, - struct btree *); -struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, - struct btree *); - -struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, - struct btree *, - struct bkey *); - -#define for_each_btree_node_key(b, k, iter) \ - for (bch2_btree_node_iter_init_from_start((iter), (b)); \ - (k = bch2_btree_node_iter_peek((iter), (b))); \ - bch2_btree_node_iter_advance(iter, b)) - -#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ - for (bch2_btree_node_iter_init_from_start((iter), (b)); \ - (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ - bch2_btree_node_iter_advance(iter, b)) - -/* Accounting: */ - -struct btree_nr_keys bch2_btree_node_count_keys(struct btree *); - -static inline void btree_keys_account_key(struct btree_nr_keys *n, - unsigned bset, - struct bkey_packed *k, - int sign) -{ - n->live_u64s += k->u64s * sign; - n->bset_u64s[bset] += k->u64s * sign; - - if (bkey_packed(k)) - n->packed_keys += sign; - else - n->unpacked_keys += sign; -} - -static inline void btree_keys_account_val_delta(struct btree *b, - struct bkey_packed *k, - int delta) -{ - struct bset_tree *t = bch2_bkey_to_bset(b, k); - - b->nr.live_u64s += delta; - b->nr.bset_u64s[t - b->set] += delta; -} - -#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ - btree_keys_account_key(_nr, _bset_idx, _k, 1) -#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ - btree_keys_account_key(_nr, _bset_idx, _k, -1) - -#define btree_account_key_add(_b, _k) \ - btree_keys_account_key(&(_b)->nr, \ - bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) -#define btree_account_key_drop(_b, _k) \ - btree_keys_account_key(&(_b)->nr, \ - bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) - -struct bset_stats { - struct { - size_t nr, bytes; - } sets[BSET_TREE_NR_TYPES]; - - size_t floats; - size_t failed; -}; - -void bch2_btree_keys_stats(const struct btree *, struct bset_stats *); -void bch2_bfloat_to_text(struct printbuf *, struct btree *, - struct bkey_packed *); - -/* Debug stuff */ - -void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); -void bch2_dump_btree_node(struct bch_fs *, struct btree *); -void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); - -void __bch2_verify_btree_nr_keys(struct btree *); -void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); - -static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) - __bch2_btree_node_iter_verify(iter, b); -} - -static inline void bch2_verify_btree_nr_keys(struct btree *b) -{ - if (static_branch_unlikely(&bch2_debug_check_btree_accounting)) - __bch2_verify_btree_nr_keys(b); -} - -#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c 
deleted file mode 100644 index 83c9860e6b82..000000000000 --- a/fs/bcachefs/btree_cache.c +++ /dev/null @@ -1,1516 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bbpos.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "debug.h" -#include "errcode.h" -#include "error.h" -#include "journal.h" -#include "trace.h" - -#include <linux/prefetch.h> -#include <linux/sched/mm.h> -#include <linux/swap.h> - -const char * const bch2_btree_node_flags[] = { - "typebit", - "typebit", - "typebit", -#define x(f) [BTREE_NODE_##f] = #f, - BTREE_FLAGS() -#undef x - NULL -}; - -void bch2_recalc_btree_reserve(struct bch_fs *c) -{ - unsigned reserve = 16; - - if (!c->btree_roots_known[0].b) - reserve += 8; - - for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (r->b) - reserve += min_t(unsigned, 1, r->b->c.level) * 8; - } - - c->btree_cache.nr_reserve = reserve; -} - -static inline size_t btree_cache_can_free(struct btree_cache_list *list) -{ - struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); - - size_t can_free = list->nr; - if (!list->idx) - can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve); - return can_free; -} - -static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) -{ - BUG_ON(!list_empty(&b->list)); - - if (b->c.lock.readers) - list_add(&b->list, &bc->freed_pcpu); - else - list_add(&b->list, &bc->freed_nonpcpu); -} - -static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b) -{ - BUG_ON(!list_empty(&b->list)); - BUG_ON(!b->data); - - bc->nr_freeable++; - list_add(&b->list, &bc->freeable); -} - -void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) -{ - struct btree_cache *bc = &c->btree_cache; - - mutex_lock(&bc->lock); - __bch2_btree_node_to_freelist(bc, b); - mutex_unlock(&bc->lock); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); -} - -void __btree_node_data_free(struct btree *b) -{ - BUG_ON(!list_empty(&b->list)); - BUG_ON(btree_node_hashed(b)); - - /* - * This should really be done in slub/vmalloc, but we're using the - * kmalloc_large() path, so we're working around a slub bug by doing - * this here: - */ - if (b->data) - mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE); - if (b->aux_data) - mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE); - - EBUG_ON(btree_node_write_in_flight(b)); - - clear_btree_node_just_written(b); - - kvfree(b->data); - b->data = NULL; -#ifdef __KERNEL__ - kvfree(b->aux_data); -#else - munmap(b->aux_data, btree_aux_data_bytes(b)); -#endif - b->aux_data = NULL; -} - -static void btree_node_data_free(struct btree_cache *bc, struct btree *b) -{ - BUG_ON(list_empty(&b->list)); - list_del_init(&b->list); - - __btree_node_data_free(b); - - --bc->nr_freeable; - btree_node_to_freedlist(bc, b); -} - -static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct btree *b = obj; - const u64 *v = arg->key; - - return b->hash_val == *v ? 
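
The compare callback above follows rhashtable's obj_cmpfn contract: return 0 when the stored object matches the 64-bit key, nonzero otherwise. A toy userspace model of lookup by hash_val; the fixed bucket count and all names are illustrative:

    #include <stdint.h>
    #include <stddef.h>

    struct demo_node { uint64_t hash_val; struct demo_node *next; };

    #define NR_BUCKETS 64
    static struct demo_node *buckets[NR_BUCKETS];

    static struct demo_node *demo_cache_find(uint64_t v)
    {
            for (struct demo_node *n = buckets[v % NR_BUCKETS]; n; n = n->next)
                    if (n->hash_val == v)   /* the cmp fn returning 0 */
                            return n;
            return NULL;
    }

    int main(void)
    {
            struct demo_node n = { .hash_val = 42 };

            buckets[42 % NR_BUCKETS] = &n;
            return demo_cache_find(42) == &n ? 0 : 1;
    }
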
0 : 1; -} - -static const struct rhashtable_params bch_btree_cache_params = { - .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, hash_val), - .key_len = sizeof(u64), - .obj_cmpfn = bch2_btree_cache_cmp_fn, - .automatic_shrinking = true, -}; - -static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) -{ - BUG_ON(b->data || b->aux_data); - - gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; - - b->data = kvmalloc(btree_buf_bytes(b), gfp); - if (!b->data) - return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); -#ifdef __KERNEL__ - b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); -#else - b->aux_data = mmap(NULL, btree_aux_data_bytes(b), - PROT_READ|PROT_WRITE|PROT_EXEC, - MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); - if (b->aux_data == MAP_FAILED) - b->aux_data = NULL; -#endif - if (!b->aux_data) { - kvfree(b->data); - b->data = NULL; - return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); - } - - return 0; -} - -static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) -{ - struct btree *b; - - b = kzalloc(sizeof(struct btree), gfp); - if (!b) - return NULL; - - bkey_btree_ptr_init(&b->key); - INIT_LIST_HEAD(&b->list); - INIT_LIST_HEAD(&b->write_blocked); - b->byte_order = ilog2(c->opts.btree_node_size); - return b; -} - -struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) -{ - struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); - if (!b) - return NULL; - - if (btree_node_data_alloc(c, b, GFP_KERNEL)) { - kfree(b); - return NULL; - } - - bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); - return b; -} - -static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b) -{ - struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); - - u64 mask = bc->pinned_nodes_mask[!!b->c.level]; - - return ((mask & BIT_ULL(b->c.btree_id)) && - bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && - bbpos_cmp(bc->pinned_nodes_end, pos) >= 0); -} - -void bch2_node_pin(struct bch_fs *c, struct btree *b) -{ - struct btree_cache *bc = &c->btree_cache; - - mutex_lock(&bc->lock); - if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { - set_btree_node_pinned(b); - list_move(&b->list, &bc->live[1].list); - bc->live[0].nr--; - bc->live[1].nr++; - } - mutex_unlock(&bc->lock); -} - -void bch2_btree_cache_unpin(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - struct btree *b, *n; - - mutex_lock(&bc->lock); - c->btree_cache.pinned_nodes_mask[0] = 0; - c->btree_cache.pinned_nodes_mask[1] = 0; - - list_for_each_entry_safe(b, n, &bc->live[1].list, list) { - clear_btree_node_pinned(b); - list_move(&b->list, &bc->live[0].list); - bc->live[0].nr++; - bc->live[1].nr--; - } - - mutex_unlock(&bc->lock); -} - -/* Btree in memory cache - hash table */ - -void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) -{ - lockdep_assert_held(&bc->lock); - - int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); - BUG_ON(ret); - - /* Cause future lookups for this node to fail: */ - b->hash_val = 0; - - if (b->c.btree_id < BTREE_ID_NR) - --bc->nr_by_btree[b->c.btree_id]; - --bc->live[btree_node_pinned(b)].nr; - list_del_init(&b->list); -} - -void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) -{ - __bch2_btree_node_hash_remove(bc, b); - __bch2_btree_node_to_freelist(bc, b); -} - -int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) -{ - BUG_ON(!list_empty(&b->list)); - BUG_ON(b->hash_val); - - b->hash_val = btree_ptr_hash_val(&b->key); - int ret = 
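
bch2_node_pin()/bch2_btree_cache_unpin() above move nodes between two LRU lists, live[0] for ordinary nodes and live[1] for pinned ones, each with its own count so the two shrinkers can weight them differently. A counter-only model of the bookkeeping (the list_move() is elided; names are illustrative):

    #include <stddef.h>

    struct cache_lists { size_t nr[2]; };   /* nr[0]: default, nr[1]: pinned */

    static void node_pin(struct cache_lists *c, int *pinned)
    {
            if (!*pinned) {          /* list_move() to live[1] in the original */
                    *pinned = 1;
                    c->nr[0]--;
                    c->nr[1]++;
            }
    }

    static void node_unpin(struct cache_lists *c, int *pinned)
    {
            if (*pinned) {
                    *pinned = 0;
                    c->nr[1]--;
                    c->nr[0]++;
            }
    }

    int main(void)
    {
            struct cache_lists c = { .nr = { 1, 0 } };
            int pinned = 0;

            node_pin(&c, &pinned);
            node_unpin(&c, &pinned);
            return (c.nr[0] == 1 && c.nr[1] == 0) ? 0 : 1;
    }
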
rhashtable_lookup_insert_fast(&bc->table, &b->hash, - bch_btree_cache_params); - if (ret) - return ret; - - if (b->c.btree_id < BTREE_ID_NR) - bc->nr_by_btree[b->c.btree_id]++; - - bool p = __btree_node_pinned(bc, b); - mod_bit(BTREE_NODE_pinned, &b->flags, p); - - list_add_tail(&b->list, &bc->live[p].list); - bc->live[p].nr++; - return 0; -} - -int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, - unsigned level, enum btree_id id) -{ - b->c.level = level; - b->c.btree_id = id; - - mutex_lock(&bc->lock); - int ret = __bch2_btree_node_hash_insert(bc, b); - mutex_unlock(&bc->lock); - - return ret; -} - -void bch2_btree_node_update_key_early(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_i *new) -{ - struct bch_fs *c = trans->c; - struct btree *b; - struct bkey_buf tmp; - int ret; - - bch2_bkey_buf_init(&tmp); - bch2_bkey_buf_reassemble(&tmp, c, old); - - b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); - if (!IS_ERR_OR_NULL(b)) { - mutex_lock(&c->btree_cache.lock); - - __bch2_btree_node_hash_remove(&c->btree_cache, b); - - bkey_copy(&b->key, new); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - - mutex_unlock(&c->btree_cache.lock); - six_unlock_read(&b->c.lock); - } - - bch2_bkey_buf_exit(&tmp, c); -} - -__flatten -static inline struct btree *btree_cache_find(struct btree_cache *bc, - const struct bkey_i *k) -{ - u64 v = btree_ptr_hash_val(k); - - return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); -} - -static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, - bool flush, bool locked) -{ - struct btree_cache *bc = &c->btree_cache; - - lockdep_assert_held(&bc->lock); - - if (btree_node_noevict(b)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - if (btree_node_write_blocked(b)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - if (btree_node_will_make_reachable(b)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - if (btree_node_dirty(b)) { - if (!flush) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - if (locked) { - /* - * Using the underscore version because we don't want to compact - * bsets after the write, since this node is about to be evicted - * - unless btree verify mode is enabled, since it runs out of - * the post write cleanup: - */ - if (static_branch_unlikely(&bch2_verify_btree_ondisk)) - bch2_btree_node_write(c, b, SIX_LOCK_intent, - BTREE_WRITE_cache_reclaim); - else - __bch2_btree_node_write(c, b, - BTREE_WRITE_cache_reclaim); - } - } - - if (b->flags & ((1U << BTREE_NODE_read_in_flight)| - (1U << BTREE_NODE_write_in_flight))) { - if (!flush) { - if (btree_node_read_in_flight(b)) - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; - else if (btree_node_write_in_flight(b)) - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - if (locked) - return -EINTR; - - /* XXX: waiting on IO with btree cache lock held */ - bch2_btree_node_wait_on_read(b); - bch2_btree_node_wait_on_write(b); - } - - return 0; -} - -/* - * this version is for btree nodes that have already been freed (we're not - * reaping a real btree node) - */ -static int __btree_node_reclaim(struct bch_fs 
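
The reclaim path, continued below, must never sleep on a node's lock: it takes the intent lock and then the write lock with trylock only, re-runs the eviction checks under the locks, and backs out on any failure. A userspace sketch of that shape using POSIX mutexes; checks() is a stand-in for the eviction checks, not a real function:

    #include <pthread.h>
    #include <errno.h>

    static pthread_mutex_t intent = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t write_ = PTHREAD_MUTEX_INITIALIZER;

    static int checks(int locked) { (void) locked; return 0; } /* stand-in */

    static int reclaim(void)
    {
            int ret = checks(0);
            if (ret)
                    return ret;

            if (pthread_mutex_trylock(&intent))
                    return -ENOMEM;            /* intent lock not free */
            if (pthread_mutex_trylock(&write_)) {
                    pthread_mutex_unlock(&intent);
                    return -ENOMEM;            /* write lock not free */
            }

            ret = checks(1);                   /* recheck under lock */
            if (ret) {
                    pthread_mutex_unlock(&write_);
                    pthread_mutex_unlock(&intent);
                    return ret;
            }
            return 0;                          /* caller now owns the node */
    }

    int main(void) { return reclaim() ? 1 : 0; }
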
*c, struct btree *b, bool flush) -{ - struct btree_cache *bc = &c->btree_cache; - int ret = 0; - - lockdep_assert_held(&bc->lock); -retry_unlocked: - ret = __btree_node_reclaim_checks(c, b, flush, false); - if (ret) - return ret; - - if (!six_trylock_intent(&b->c.lock)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - if (!six_trylock_write(&b->c.lock)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++; - six_unlock_intent(&b->c.lock); - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - /* recheck under lock */ - ret = __btree_node_reclaim_checks(c, b, flush, true); - if (ret) { - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - if (ret == -EINTR) - goto retry_unlocked; - return ret; - } - - if (b->hash_val && !ret) - trace_and_count(c, btree_cache_reap, c, b); - return 0; -} - -static int btree_node_reclaim(struct bch_fs *c, struct btree *b) -{ - return __btree_node_reclaim(c, b, false); -} - -static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) -{ - return __btree_node_reclaim(c, b, true); -} - -static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct btree_cache_list *list = shrink->private_data; - struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); - struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); - struct btree *b, *t; - unsigned long nr = sc->nr_to_scan; - unsigned long can_free = 0; - unsigned long freed = 0; - unsigned long touched = 0; - unsigned i, flags; - unsigned long ret = SHRINK_STOP; - bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4; - - if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) - return SHRINK_STOP; - - mutex_lock(&bc->lock); - flags = memalloc_nofs_save(); - - /* - * It's _really_ critical that we don't free too many btree nodes - we - * have to always leave ourselves a reserve. 
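
Continuing the comment: the clamp applied just below is what keeps the shrinker from ever dipping into that reserve. A one-function model of it (names illustrative):

    #include <stddef.h>

    static size_t clamp_to_reserve(size_t nr_to_scan, size_t nr, size_t reserve)
    {
            size_t can_free = nr > reserve ? nr - reserve : 0;

            return nr_to_scan < can_free ? nr_to_scan : can_free;
    }

    int main(void)
    {
            /* 100 cached nodes, 16 reserved: at most 84 may be scanned */
            return clamp_to_reserve(200, 100, 16) == 84 ? 0 : 1;
    }
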
The reserve is how we - * guarantee that allocating memory for a new btree node can always - * succeed, so that inserting keys into the btree can always succeed and - * IO can always make forward progress: - */ - can_free = btree_cache_can_free(list); - if (nr > can_free) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free; - nr = can_free; - } - - i = 0; - list_for_each_entry_safe(b, t, &bc->freeable, list) { - /* - * Leave a few nodes on the freeable list, so that a btree split - * won't have to hit the system allocator: - */ - if (++i <= 3) - continue; - - touched++; - - if (touched >= nr) - goto out; - - if (!btree_node_reclaim(c, b)) { - btree_node_data_free(bc, b); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - freed++; - bc->nr_freed++; - } - } -restart: - list_for_each_entry_safe(b, t, &list->list, list) { - touched++; - - if (btree_node_accessed(b)) { - clear_btree_node_accessed(b); - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; - --touched;; - } else if (!btree_node_reclaim(c, b)) { - __bch2_btree_node_hash_remove(bc, b); - __btree_node_data_free(b); - btree_node_to_freedlist(bc, b); - - freed++; - bc->nr_freed++; - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - if (freed == nr) - goto out_rotate; - } else if (trigger_writes && - btree_node_dirty(b) && - !btree_node_will_make_reachable(b) && - !btree_node_write_blocked(b) && - six_trylock_read(&b->c.lock)) { - list_move(&list->list, &b->list); - mutex_unlock(&bc->lock); - __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); - six_unlock_read(&b->c.lock); - if (touched >= nr) - goto out_nounlock; - mutex_lock(&bc->lock); - goto restart; - } - - if (touched >= nr) - break; - } -out_rotate: - if (&t->list != &list->list) - list_move_tail(&list->list, &t->list); -out: - mutex_unlock(&bc->lock); -out_nounlock: - ret = freed; - memalloc_nofs_restore(flags); - trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret); - return ret; -} - -static unsigned long bch2_btree_cache_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct btree_cache_list *list = shrink->private_data; - - if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) - return 0; - - return btree_cache_can_free(list); -} - -void bch2_fs_btree_cache_exit(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - struct btree *b, *t; - unsigned long flags; - - shrinker_free(bc->live[1].shrink); - shrinker_free(bc->live[0].shrink); - - /* vfree() can allocate memory: */ - flags = memalloc_nofs_save(); - mutex_lock(&bc->lock); - - if (c->verify_data) - list_move(&c->verify_data->list, &bc->live[0].list); - - kvfree(c->verify_ondisk); - - for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (r->b) - list_add(&r->b->list, &bc->live[0].list); - } - - list_for_each_entry_safe(b, t, &bc->live[1].list, list) - bch2_btree_node_hash_remove(bc, b); - list_for_each_entry_safe(b, t, &bc->live[0].list, list) - bch2_btree_node_hash_remove(bc, b); - - list_for_each_entry_safe(b, t, &bc->freeable, list) { - BUG_ON(btree_node_read_in_flight(b) || - btree_node_write_in_flight(b)); - - btree_node_data_free(bc, b); - cond_resched(); - } - - BUG_ON(!bch2_journal_error(&c->journal) && - atomic_long_read(&c->btree_cache.nr_dirty)); - - list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); - - list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) { - list_del(&b->list); - six_lock_exit(&b->c.lock); - kfree(b); - } - - 
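
The teardown a little further below brackets all the freeing in memalloc_nofs_save(), since vfree()/kvfree() can themselves allocate and must not recurse into filesystem reclaim mid-shutdown. A kernel-side sketch of the discipline; this assumes kernel context and is shown for shape only, not as a runnable userspace program:

    #include <linux/sched/mm.h>

    static void free_everything_nofs(void (*free_all)(void))
    {
            unsigned int flags = memalloc_nofs_save();

            free_all();    /* allocations in here are implicitly GFP_NOFS */

            memalloc_nofs_restore(flags);
    }
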
mutex_unlock(&bc->lock); - memalloc_nofs_restore(flags); - - for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) - BUG_ON(bc->nr_by_btree[i]); - BUG_ON(bc->live[0].nr); - BUG_ON(bc->live[1].nr); - BUG_ON(bc->nr_freeable); - - if (bc->table_init_done) - rhashtable_destroy(&bc->table); -} - -int bch2_fs_btree_cache_init(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - struct shrinker *shrink; - unsigned i; - int ret = 0; - - ret = rhashtable_init(&bc->table, &bch_btree_cache_params); - if (ret) - goto err; - - bc->table_init_done = true; - - bch2_recalc_btree_reserve(c); - - for (i = 0; i < bc->nr_reserve; i++) { - struct btree *b = __bch2_btree_node_mem_alloc(c); - if (!b) - goto err; - __bch2_btree_node_to_freelist(bc, b); - } - - list_splice_init(&bc->live[0].list, &bc->freeable); - - mutex_init(&c->verify_lock); - - shrink = shrinker_alloc(0, "%s-btree_cache", c->name); - if (!shrink) - goto err; - bc->live[0].shrink = shrink; - shrink->count_objects = bch2_btree_cache_count; - shrink->scan_objects = bch2_btree_cache_scan; - shrink->seeks = 2; - shrink->private_data = &bc->live[0]; - shrinker_register(shrink); - - shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name); - if (!shrink) - goto err; - bc->live[1].shrink = shrink; - shrink->count_objects = bch2_btree_cache_count; - shrink->scan_objects = bch2_btree_cache_scan; - shrink->seeks = 8; - shrink->private_data = &bc->live[1]; - shrinker_register(shrink); - - return 0; -err: - return bch_err_throw(c, ENOMEM_fs_btree_cache_init); -} - -void bch2_fs_btree_cache_init_early(struct btree_cache *bc) -{ - mutex_init(&bc->lock); - for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) { - bc->live[i].idx = i; - INIT_LIST_HEAD(&bc->live[i].list); - } - INIT_LIST_HEAD(&bc->freeable); - INIT_LIST_HEAD(&bc->freed_pcpu); - INIT_LIST_HEAD(&bc->freed_nonpcpu); -} - -/* - * We can only have one thread cannibalizing other cached btree nodes at a time, - * or we'll deadlock. We use an open coded mutex to ensure that, which a - * cannibalize_bucket() will take. This means every time we unlock the root of - * the btree, we need to release this lock if we have it held. 
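
The open-coded mutex described in the comment continuing below is claimed with try_cmpxchg on a task pointer, and re-acquisition by the current owner counts as success. A userspace model with C11 atomics; names are illustrative, and the closure-based waitlist of the original is reduced to a comment:

    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic(void *) alloc_lock;

    static bool cannibalize_trylock(void *current_task)
    {
            void *old = NULL;

            return atomic_compare_exchange_strong(&alloc_lock, &old,
                                                  current_task) ||
                   old == current_task;         /* recursive re-acquire */
    }

    static void cannibalize_unlock(void *current_task)
    {
            void *old = current_task;

            atomic_compare_exchange_strong(&alloc_lock, &old, NULL);
            /* the original also wakes waiters parked on a closure here */
    }

    int main(void)
    {
            int me;

            if (!cannibalize_trylock(&me) || !cannibalize_trylock(&me))
                    return 1;
            cannibalize_unlock(&me);
            return 0;
    }
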
- */ -void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - - if (bc->alloc_lock == current) { - trace_and_count(c, btree_cache_cannibalize_unlock, trans); - bc->alloc_lock = NULL; - closure_wake_up(&bc->alloc_wait); - } -} - -int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct task_struct *old; - - old = NULL; - if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) - goto success; - - if (!cl) { - trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock); - } - - closure_wait(&bc->alloc_wait, cl); - - /* Try again, after adding ourselves to waitlist */ - old = NULL; - if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) { - /* We raced */ - closure_wake_up(&bc->alloc_wait); - goto success; - } - - trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return bch_err_throw(c, btree_cache_cannibalize_lock_blocked); - -success: - trace_and_count(c, btree_cache_cannibalize_lock, trans); - return 0; -} - -static struct btree *btree_node_cannibalize(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - - for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) - list_for_each_entry_reverse(b, &bc->live[i].list, list) - if (!btree_node_reclaim(c, b)) - return b; - - while (1) { - for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) - list_for_each_entry_reverse(b, &bc->live[i].list, list) - if (!btree_node_write_and_reclaim(c, b)) - return b; - - /* - * Rare case: all nodes were intent-locked. - * Just busy-wait. - */ - WARN_ONCE(1, "btree cache cannibalize failed\n"); - cond_resched(); - } -} - -struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct list_head *freed = pcpu_read_locks - ? &bc->freed_pcpu - : &bc->freed_nonpcpu; - struct btree *b, *b2; - u64 start_time = local_clock(); - - mutex_lock(&bc->lock); - - /* - * We never free struct btree itself, just the memory that holds the on - * disk node. Check the freed list before allocating a new one: - */ - list_for_each_entry(b, freed, list) - if (!btree_node_reclaim(c, b)) { - list_del_init(&b->list); - goto got_node; - } - - b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); - if (b) { - bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT); - } else { - mutex_unlock(&bc->lock); - bch2_trans_unlock(trans); - b = __btree_node_mem_alloc(c, GFP_KERNEL); - if (!b) - goto err; - bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); - mutex_lock(&bc->lock); - } - - BUG_ON(!six_trylock_intent(&b->c.lock)); - BUG_ON(!six_trylock_write(&b->c.lock)); - -got_node: - /* - * btree_free() doesn't free memory; it sticks the node on the end of - * the list. 
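
The allocation strategy visible above: while bc->lock is held only GFP_NOWAIT is attempted, and if that fails every lock is dropped before retrying with GFP_KERNEL, which may sleep. A userspace model of the two-phase pattern, with malloc standing in for both allocators:

    #include <stdlib.h>
    #include <pthread.h>

    static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

    static void *alloc_nowait(size_t n)   { return malloc(n); } /* stand-in */
    static void *alloc_blocking(size_t n) { return malloc(n); } /* stand-in */

    static void *node_alloc(size_t n)
    {
            void *b;

            pthread_mutex_lock(&cache_lock);
            b = alloc_nowait(n);            /* GFP_NOWAIT in the original */
            pthread_mutex_unlock(&cache_lock);

            if (!b)
                    b = alloc_blocking(n);  /* GFP_KERNEL, locks dropped */
            return b;
    }

    int main(void)
    {
            free(node_alloc(64));
            return 0;
    }
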
Check if there's any freed nodes there: - */ - list_for_each_entry(b2, &bc->freeable, list) - if (!btree_node_reclaim(c, b2)) { - swap(b->data, b2->data); - swap(b->aux_data, b2->aux_data); - - list_del_init(&b2->list); - --bc->nr_freeable; - btree_node_to_freedlist(bc, b2); - mutex_unlock(&bc->lock); - - six_unlock_write(&b2->c.lock); - six_unlock_intent(&b2->c.lock); - goto got_mem; - } - - mutex_unlock(&bc->lock); - - if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) { - bch2_trans_unlock(trans); - if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN)) - goto err; - } - -got_mem: - BUG_ON(!list_empty(&b->list)); - BUG_ON(btree_node_hashed(b)); - BUG_ON(btree_node_dirty(b)); - BUG_ON(btree_node_write_in_flight(b)); -out: - b->flags = 0; - b->written = 0; - b->nsets = 0; - b->sib_u64s[0] = 0; - b->sib_u64s[1] = 0; - b->whiteout_u64s = 0; - bch2_btree_keys_init(b); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], - start_time); - - int ret = bch2_trans_relock(trans); - if (unlikely(ret)) { - bch2_btree_node_to_freelist(c, b); - return ERR_PTR(ret); - } - - return b; -err: - mutex_lock(&bc->lock); - - /* Try to cannibalize another cached btree node: */ - if (bc->alloc_lock == current) { - b2 = btree_node_cannibalize(c); - clear_btree_node_just_written(b2); - __bch2_btree_node_hash_remove(bc, b2); - - if (b) { - swap(b->data, b2->data); - swap(b->aux_data, b2->aux_data); - btree_node_to_freedlist(bc, b2); - six_unlock_write(&b2->c.lock); - six_unlock_intent(&b2->c.lock); - } else { - b = b2; - } - - BUG_ON(!list_empty(&b->list)); - mutex_unlock(&bc->lock); - - trace_and_count(c, btree_cache_cannibalize, trans); - goto out; - } - - mutex_unlock(&bc->lock); - return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); -} - -/* Slowpath, don't want it inlined into btree_iter_traverse() */ -static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, - struct btree_path *path, - const struct bkey_i *k, - enum btree_id btree_id, - unsigned level, - enum six_lock_type lock_type, - bool sync) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - - if (unlikely(level >= BTREE_MAX_DEPTH)) { - int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u", - level, BTREE_MAX_DEPTH); - return ERR_PTR(ret); - } - - if (unlikely(!bkey_is_btree_ptr(&k->k))) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - - int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf); - printbuf_exit(&buf); - return ERR_PTR(ret); - } - - if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - - int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf); - printbuf_exit(&buf); - return ERR_PTR(ret); - } - - /* - * Parent node must be locked, else we could read in a btree node that's - * been freed: - */ - if (path && !bch2_btree_node_relock(trans, path, level + 1)) { - trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); - } - - b = bch2_btree_node_mem_alloc(trans, level != 0); - - if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { - if (!path) - return b; - - trans->memory_allocation_failure = true; - trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); - 
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); - } - - if (IS_ERR(b)) - return b; - - bkey_copy(&b->key, k); - if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { - /* raced with another fill: */ - - /* mark as unhashed... */ - b->hash_val = 0; - - mutex_lock(&bc->lock); - __bch2_btree_node_to_freelist(bc, b); - mutex_unlock(&bc->lock); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - return NULL; - } - - set_btree_node_read_in_flight(b); - six_unlock_write(&b->c.lock); - - if (path) { - u32 seq = six_lock_seq(&b->c.lock); - - /* Unlock before doing IO: */ - six_unlock_intent(&b->c.lock); - bch2_trans_unlock(trans); - - bch2_btree_node_read(trans, b, sync); - - int ret = bch2_trans_relock(trans); - if (ret) - return ERR_PTR(ret); - - if (!sync) - return NULL; - - if (!six_relock_type(&b->c.lock, lock_type, seq)) - b = NULL; - } else { - bch2_btree_node_read(trans, b, sync); - if (lock_type == SIX_LOCK_read) - six_lock_downgrade(&b->c.lock); - } - - return b; -} - -static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) -{ - struct printbuf buf = PRINTBUF; - - if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) - return; - - prt_printf(&buf, - "btree node header doesn't match ptr: "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, "\nptr: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - prt_str(&buf, "\nheader: "); - bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data)); - prt_str(&buf, "\nmin "); - bch2_bpos_to_text(&buf, b->data->min_key); - - prt_printf(&buf, "\nmax "); - bch2_bpos_to_text(&buf, b->data->max_key); - - bch2_fs_topology_error(c, "%s", buf.buf); - - printbuf_exit(&buf); -} - -static inline void btree_check_header(struct bch_fs *c, struct btree *b) -{ - if (b->c.btree_id != BTREE_NODE_ID(b->data) || - b->c.level != BTREE_NODE_LEVEL(b->data) || - !bpos_eq(b->data->max_key, b->key.k.p) || - (b->key.k.type == KEY_TYPE_btree_ptr_v2 && - !bpos_eq(b->data->min_key, - bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) - btree_bad_header(c, b); -} - -static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, - const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - bool need_relock = false; - int ret; - - EBUG_ON(level >= BTREE_MAX_DEPTH); -retry: - b = btree_cache_find(bc, k); - if (unlikely(!b)) { - /* - * We must have the parent locked to call bch2_btree_node_fill(), - * else we could read in a btree node from disk that's been - * freed: - */ - b = bch2_btree_node_fill(trans, path, k, path->btree_id, - level, lock_type, true); - need_relock = true; - - /* We raced and found the btree node in the cache */ - if (!b) - goto retry; - - if (IS_ERR(b)) - return b; - } else { - if (btree_node_read_locked(path, level + 1)) - btree_node_unlock(trans, path, level + 1); - - ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ERR_PTR(ret); - - BUG_ON(ret); - - if (unlikely(b->hash_val != btree_ptr_hash_val(k) || - b->c.level != level || - race_fault())) { - six_unlock_type(&b->c.lock, lock_type); - if (bch2_btree_node_relock(trans, path, level + 1)) - goto retry; - - trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); - return 
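
The retry logic here leans on the six lock's sequence number: capture it, drop the lock across blocking I/O, then accept the relock only if no writer bumped the generation in between; otherwise go back to the top and revalidate. A condensed userspace model (the actual lock re-acquisition is elided; names illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct gen_lock { atomic_uint gen; };  /* bumped on every write unlock */

    static bool relock(struct gen_lock *l, unsigned gen_seen)
    {
            /* real code re-takes the lock first; condensed here */
            return atomic_load(&l->gen) == gen_seen;
    }

    int main(void)
    {
            struct gen_lock l = { .gen = 0 };
            unsigned seen = atomic_load(&l.gen);

            atomic_fetch_add(&l.gen, 1);     /* a writer intervened */
            return relock(&l, seen) ? 1 : 0; /* must force a retry */
    }
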
ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); - } - - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - } - - if (unlikely(btree_node_read_in_flight(b))) { - u32 seq = six_lock_seq(&b->c.lock); - - six_unlock_type(&b->c.lock, lock_type); - bch2_trans_unlock(trans); - need_relock = true; - - bch2_btree_node_wait_on_read(b); - - ret = bch2_trans_relock(trans); - if (ret) - return ERR_PTR(ret); - - /* - * should_be_locked is not set on this path yet, so we need to - * relock it specifically: - */ - if (!six_relock_type(&b->c.lock, lock_type, seq)) - goto retry; - } - - if (unlikely(need_relock)) { - ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(ret); - } - } - - prefetch(b->aux_data); - - for_each_bset(b, t) { - void *p = (u64 *) b->aux_data + t->aux_data_offset; - - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - } - - if (unlikely(btree_node_read_error(b))) { - six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); - } - - EBUG_ON(b->c.btree_id != path->btree_id); - EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - btree_check_header(c, b); - - return b; -} - -/** - * bch2_btree_node_get - find a btree node in the cache and lock it, reading it - * in from disk if necessary. - * - * @trans: btree transaction object - * @path: btree_path being traversed - * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2) - * @level: level of btree node being looked up (0 == leaf node) - * @lock_type: SIX_LOCK_read or SIX_LOCK_intent - * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek()) - * - * The btree node will have either a read or a write lock held, depending on - * the @write parameter. 
- * - * Returns: btree node or ERR_PTR() - */ -struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, - const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree *b; - int ret; - - EBUG_ON(level >= BTREE_MAX_DEPTH); - - b = btree_node_mem_ptr(k); - - /* - * Check b->hash_val _before_ calling btree_node_lock() - this might not - * be the node we want anymore, and trying to lock the wrong node could - * cause an unneccessary transaction restart: - */ - if (unlikely(!c->opts.btree_node_mem_ptr_optimization || - !b || - b->hash_val != btree_ptr_hash_val(k))) - return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); - - if (btree_node_read_locked(path, level + 1)) - btree_node_unlock(trans, path, level + 1); - - ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ERR_PTR(ret); - - BUG_ON(ret); - - if (unlikely(b->hash_val != btree_ptr_hash_val(k) || - b->c.level != level || - race_fault())) { - six_unlock_type(&b->c.lock, lock_type); - if (bch2_btree_node_relock(trans, path, level + 1)) - return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); - - trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); - } - - if (unlikely(btree_node_read_in_flight(b))) { - six_unlock_type(&b->c.lock, lock_type); - return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); - } - - prefetch(b->aux_data); - - for_each_bset(b, t) { - void *p = (u64 *) b->aux_data + t->aux_data_offset; - - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - } - - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - - if (unlikely(btree_node_read_error(b))) { - six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); - } - - EBUG_ON(b->c.btree_id != path->btree_id); - EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - btree_check_header(c, b); - - return b; -} - -struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, - const struct bkey_i *k, - enum btree_id btree_id, - unsigned level, - bool nofill) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - int ret; - - EBUG_ON(level >= BTREE_MAX_DEPTH); - - if (c->opts.btree_node_mem_ptr_optimization) { - b = btree_node_mem_ptr(k); - if (b) - goto lock_node; - } -retry: - b = btree_cache_find(bc, k); - if (unlikely(!b)) { - if (nofill) - goto out; - - b = bch2_btree_node_fill(trans, NULL, k, btree_id, - level, SIX_LOCK_read, true); - - /* We raced and found the btree node in the cache */ - if (!b) - goto retry; - - if (IS_ERR(b) && - !bch2_btree_cache_cannibalize_lock(trans, NULL)) - goto retry; - - if (IS_ERR(b)) - goto out; - } else { -lock_node: - ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ERR_PTR(ret); - - BUG_ON(ret); - - if (unlikely(b->hash_val != btree_ptr_hash_val(k) || - b->c.btree_id != btree_id || - b->c.level != level)) { - six_unlock_read(&b->c.lock); - goto retry; - } - - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - } - - /* XXX: waiting on IO with 
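
Both lookup paths prefetch the first few cachelines of a bset's aux search array before descending into it. A userspace equivalent using the compiler builtin; the 64-byte stride is an assumption standing in for L1_CACHE_BYTES:

    #define CACHELINE_GUESS 64   /* assumed; the kernel uses L1_CACHE_BYTES */

    static void prefetch_aux(const void *p)
    {
            __builtin_prefetch((const char *) p + CACHELINE_GUESS * 0);
            __builtin_prefetch((const char *) p + CACHELINE_GUESS * 1);
            __builtin_prefetch((const char *) p + CACHELINE_GUESS * 2);
    }

    int main(void)
    {
            char buf[256];

            prefetch_aux(buf);
            return 0;
    }
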
btree locks held: */ - __bch2_btree_node_wait_on_read(b); - - prefetch(b->aux_data); - - for_each_bset(b, t) { - void *p = (u64 *) b->aux_data + t->aux_data_offset; - - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - } - - if (unlikely(btree_node_read_error(b))) { - six_unlock_read(&b->c.lock); - b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); - goto out; - } - - EBUG_ON(b->c.btree_id != btree_id); - EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - btree_check_header(c, b); -out: - bch2_btree_cache_cannibalize_unlock(trans); - return b; -} - -int bch2_btree_node_prefetch(struct btree_trans *trans, - struct btree_path *path, - const struct bkey_i *k, - enum btree_id btree_id, unsigned level) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - - BUG_ON(path && !btree_node_locked(path, level + 1)); - BUG_ON(level >= BTREE_MAX_DEPTH); - - struct btree *b = btree_cache_find(bc, k); - if (b) - return 0; - - b = bch2_btree_node_fill(trans, path, k, btree_id, - level, SIX_LOCK_read, false); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - return ret; - if (b) - six_unlock_read(&b->c.lock); - return 0; -} - -void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - - b = btree_cache_find(bc, k); - if (!b) - return; - - BUG_ON(b == btree_node_root(trans->c, b)); -wait_on_io: - /* not allowed to wait on io with btree locks held: */ - - /* XXX we're called from btree_gc which will be holding other btree - * nodes locked - */ - __bch2_btree_node_wait_on_read(b); - __bch2_btree_node_wait_on_write(b); - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - if (unlikely(b->hash_val != btree_ptr_hash_val(k))) - goto out; - - if (btree_node_dirty(b)) { - __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - goto wait_on_io; - } - - BUG_ON(btree_node_dirty(b)); - - mutex_lock(&bc->lock); - bch2_btree_node_hash_remove(bc, b); - btree_node_data_free(bc, b); - mutex_unlock(&bc->lock); -out: - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); -} - -const char *bch2_btree_id_str(enum btree_id btree) -{ - return btree < BTREE_ID_NR ? 
__bch2_btree_ids[btree] : "(unknown)"; -} - -void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) -{ - if (btree < BTREE_ID_NR) - prt_str(out, __bch2_btree_ids[btree]); - else - prt_printf(out, "(unknown btree %u)", btree); -} - -void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level) -{ - prt_str(out, "btree="); - bch2_btree_id_to_text(out, btree); - prt_printf(out, " level=%u", level); -} - -void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, - enum btree_id btree, unsigned level, struct bkey_s_c k) -{ - bch2_btree_id_to_text(out, btree); - prt_printf(out, " level %u/", level); - struct btree_root *r = bch2_btree_id_root(c, btree); - if (r) - prt_printf(out, "%u", r->level); - else - prt_printf(out, "(unknown)"); - prt_newline(out); - - bch2_bkey_val_to_text(out, c, k); -} - -void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) -{ - __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key)); -} - -void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) -{ - struct bset_stats stats; - - memset(&stats, 0, sizeof(stats)); - - bch2_btree_keys_stats(b, &stats); - - prt_printf(out, "l %u ", b->c.level); - bch2_bpos_to_text(out, b->data->min_key); - prt_printf(out, " - "); - bch2_bpos_to_text(out, b->data->max_key); - prt_printf(out, ":\n" - " ptrs: "); - bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); - prt_newline(out); - - prt_printf(out, - " format: "); - bch2_bkey_format_to_text(out, &b->format); - - prt_printf(out, - " unpack fn len: %u\n" - " bytes used %zu/%zu (%zu%% full)\n" - " sib u64s: %u, %u (merge threshold %u)\n" - " nr packed keys %u\n" - " nr unpacked keys %u\n" - " floats %zu\n" - " failed unpacked %zu\n", - b->unpack_fn_len, - b->nr.live_u64s * sizeof(u64), - btree_buf_bytes(b) - sizeof(struct btree_node), - b->nr.live_u64s * 100 / btree_max_u64s(c), - b->sib_u64s[0], - b->sib_u64s[1], - c->btree_foreground_merge_threshold, - b->nr.packed_keys, - b->nr.unpacked_keys, - stats.floats, - stats.failed); -} - -static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c, - const char *label, size_t nr) -{ - prt_printf(out, "%s\t", label); - prt_human_readable_u64(out, nr * c->opts.btree_node_size); - prt_printf(out, " (%zu)\n", nr); -} - -static const char * const bch2_btree_cache_not_freed_reasons_strs[] = { -#define x(n) #n, - BCH_BTREE_CACHE_NOT_FREED_REASONS() -#undef x - NULL -}; - -void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) -{ - struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); - - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - prt_btree_cache_line(out, c, "live:", bc->live[0].nr); - prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); - prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve); - prt_btree_cache_line(out, c, "freed:", bc->nr_freeable); - prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); - prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? 
"held" : "not held"); - prt_newline(out); - - for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { - bch2_btree_id_to_text(out, i); - prt_printf(out, "\t"); - prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); - prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); - } - - prt_newline(out); - prt_printf(out, "counters since mount:\n"); - prt_printf(out, "freed:\t%zu\n", bc->nr_freed); - prt_printf(out, "not freed:\n"); - - for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++) - prt_printf(out, " %s\t%llu\n", - bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]); -} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h deleted file mode 100644 index be275f87a60e..000000000000 --- a/fs/bcachefs/btree_cache.h +++ /dev/null @@ -1,157 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_CACHE_H -#define _BCACHEFS_BTREE_CACHE_H - -#include "bcachefs.h" -#include "btree_types.h" -#include "bkey_methods.h" - -extern const char * const bch2_btree_node_flags[]; - -struct btree_iter; - -void bch2_recalc_btree_reserve(struct bch_fs *); - -void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *); - -void __bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); -void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); - -int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); -int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, - unsigned, enum btree_id); - -void bch2_node_pin(struct bch_fs *, struct btree *); -void bch2_btree_cache_unpin(struct bch_fs *); - -void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *); - -void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); -int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); - -void __btree_node_data_free(struct btree *); -struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); -struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); - -struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, - const struct bkey_i *, unsigned, - enum six_lock_type, unsigned long); - -struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *, - enum btree_id, unsigned, bool); - -int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *, - const struct bkey_i *, enum btree_id, unsigned); - -void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *); - -void bch2_fs_btree_cache_exit(struct bch_fs *); -int bch2_fs_btree_cache_init(struct bch_fs *); -void bch2_fs_btree_cache_init_early(struct btree_cache *); - -static inline u64 btree_ptr_hash_val(const struct bkey_i *k) -{ - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); - case KEY_TYPE_btree_ptr_v2: - /* - * The cast/deref is only necessary to avoid sparse endianness - * warnings: - */ - return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq); - default: - return 0; - } -} - -static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k) -{ - return k->k.type == KEY_TYPE_btree_ptr_v2 - ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr - : NULL; -} - -/* is btree node in hash table? 
*/ -static inline bool btree_node_hashed(struct btree *b) -{ - return b->hash_val != 0; -} - -#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ - for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ - &(_c)->btree_cache.table), \ - _iter = 0; _iter < (_tbl)->size; _iter++) \ - rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) - -static inline size_t btree_buf_bytes(const struct btree *b) -{ - return 1UL << b->byte_order; -} - -static inline size_t btree_buf_max_u64s(const struct btree *b) -{ - return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64); -} - -static inline size_t btree_max_u64s(const struct bch_fs *c) -{ - return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64); -} - -static inline size_t btree_sectors(const struct bch_fs *c) -{ - return c->opts.btree_node_size >> SECTOR_SHIFT; -} - -static inline unsigned btree_blocks(const struct bch_fs *c) -{ - return btree_sectors(c) >> c->block_bits; -} - -#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) - -#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) -#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ - (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ - (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) - -static inline unsigned btree_id_nr_alive(struct bch_fs *c) -{ - return BTREE_ID_NR + c->btree_roots_extra.nr; -} - -static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id) -{ - if (likely(id < BTREE_ID_NR)) { - return &c->btree_roots_known[id]; - } else { - unsigned idx = id - BTREE_ID_NR; - - /* This can happen when we're called from btree_node_scan */ - if (idx >= c->btree_roots_extra.nr) - return NULL; - - return &c->btree_roots_extra.data[idx]; - } -} - -static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) -{ - struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); - - return r ? r->b : NULL; -} - -const char *bch2_btree_id_str(enum btree_id); /* avoid */ -void bch2_btree_id_to_text(struct printbuf *, enum btree_id); -void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned); - -void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, - enum btree_id, unsigned, struct bkey_s_c); -void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); -void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); -void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); - -#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c deleted file mode 100644 index bac108e93823..000000000000 --- a/fs/bcachefs/btree_gc.c +++ /dev/null @@ -1,1308 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright (C) 2014 Datera Inc. 
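
A worked example of the split/merge thresholds defined above: split at two thirds of a node's key capacity, consider merging at one third, and add a quarter of the merge threshold as hysteresis so nodes near the boundary do not flip-flop between splitting and merging. The capacity used below is an assumed figure, not a bcachefs constant:

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
            size_t max_u64s   = 32000;                /* assumed capacity */
            size_t split      = max_u64s * 2 / 3;     /* 21333 */
            size_t merge      = max_u64s * 1 / 3;     /* 10666 */
            size_t hysteresis = merge + (merge >> 2); /* 13332 */

            printf("split=%zu merge=%zu hysteresis=%zu\n",
                   split, merge, hysteresis);
            return 0;
    }
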
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "bkey_methods.h" -#include "bkey_buf.h" -#include "btree_journal_iter.h" -#include "btree_key_cache.h" -#include "btree_locking.h" -#include "btree_node_scan.h" -#include "btree_update_interior.h" -#include "btree_io.h" -#include "btree_gc.h" -#include "buckets.h" -#include "clock.h" -#include "debug.h" -#include "disk_accounting.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "journal.h" -#include "keylist.h" -#include "move.h" -#include "progress.h" -#include "recovery_passes.h" -#include "reflink.h" -#include "recovery.h" -#include "replicas.h" -#include "super-io.h" -#include "trace.h" - -#include <linux/slab.h> -#include <linux/bitops.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/preempt.h> -#include <linux/rcupdate.h> -#include <linux/sched/task.h> - -#define DROP_THIS_NODE 10 -#define DROP_PREV_NODE 11 -#define DID_FILL_FROM_SCAN 12 - -/* - * Returns true if it's a btree we can easily reconstruct, or otherwise won't - * cause data loss if it's missing: - */ -static bool btree_id_important(enum btree_id btree) -{ - if (btree_id_is_alloc(btree)) - return false; - - switch (btree) { - case BTREE_ID_quotas: - case BTREE_ID_snapshot_trees: - case BTREE_ID_logged_ops: - case BTREE_ID_rebalance_work: - case BTREE_ID_subvolume_children: - return false; - default: - return true; - } -} - -static const char * const bch2_gc_phase_strs[] = { -#define x(n) #n, - GC_PHASES() -#undef x - NULL -}; - -void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p) -{ - prt_str(out, bch2_gc_phase_strs[p->phase]); - prt_char(out, ' '); - bch2_btree_id_level_to_text(out, p->btree, p->level); - prt_char(out, ' '); - bch2_bpos_to_text(out, p->pos); -} - -static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) -{ - return (struct bkey_s) {{{ - (struct bkey *) k.k, - (struct bch_val *) k.v - }}}; -} - -static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -{ - preempt_disable(); - write_seqcount_begin(&c->gc_pos_lock); - c->gc_pos = new_pos; - write_seqcount_end(&c->gc_pos_lock); - preempt_enable(); -} - -static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -{ - BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0); - __gc_pos_set(c, new_pos); -} - -static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) -{ - switch (b->key.k.type) { - case KEY_TYPE_btree_ptr: { - struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key); - - dst->k.p = src->k.p; - dst->v.mem_ptr = 0; - dst->v.seq = b->data->keys.seq; - dst->v.sectors_written = 0; - dst->v.flags = 0; - dst->v.min_key = b->data->min_key; - set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k)); - memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k)); - break; - } - case KEY_TYPE_btree_ptr_v2: - bkey_copy(&dst->k_i, &b->key); - break; - default: - BUG(); - } -} - -static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) -{ - struct bkey_i_btree_ptr_v2 *new; - int ret; - - if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, " -> "); - bch2_bpos_to_text(&buf, new_min); - - bch_info(c, "%s(): %s", __func__, buf.buf); - printbuf_exit(&buf); - } - - new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); - if (!new) - return bch_err_throw(c, 
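
__gc_pos_set() above publishes the GC position under a seqcount so readers can detect a torn update and retry. A userspace model of the write side using an even/odd generation counter (names illustrative):

    #include <stdatomic.h>

    struct gc_pos_demo {
            atomic_uint   seq;   /* odd while an update is in flight */
            unsigned long pos;
    };

    static void gc_pos_set_demo(struct gc_pos_demo *g, unsigned long new_pos)
    {
            atomic_fetch_add(&g->seq, 1);  /* seq odd: write begins */
            g->pos = new_pos;
            atomic_fetch_add(&g->seq, 1);  /* seq even again: write done */
    }

    /* readers: s0 = seq; read pos; s1 = seq; retry if s0 != s1 or s0 is odd */

    int main(void)
    {
            struct gc_pos_demo g = { .seq = 0, .pos = 0 };

            gc_pos_set_demo(&g, 42);
            return (atomic_load(&g.seq) == 2 && g.pos == 42) ? 0 : 1;
    }
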
ENOMEM_gc_repair_key); - - btree_ptr_to_v2(b, new); - b->data->min_key = new_min; - new->v.min_key = new_min; - SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); - - ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); - if (ret) { - kfree(new); - return ret; - } - - bch2_btree_node_drop_keys_outside_node(b); - bkey_copy(&b->key, &new->k_i); - return 0; -} - -static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) -{ - struct bkey_i_btree_ptr_v2 *new; - int ret; - - if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, " -> "); - bch2_bpos_to_text(&buf, new_max); - - bch_info(c, "%s(): %s", __func__, buf.buf); - printbuf_exit(&buf); - } - - ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); - if (ret) - return ret; - - new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); - if (!new) - return bch_err_throw(c, ENOMEM_gc_repair_key); - - btree_ptr_to_v2(b, new); - b->data->max_key = new_max; - new->k.p = new_max; - SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); - - ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); - if (ret) { - kfree(new); - return ret; - } - - bch2_btree_node_drop_keys_outside_node(b); - - mutex_lock(&c->btree_cache.lock); - __bch2_btree_node_hash_remove(&c->btree_cache, b); - - bkey_copy(&b->key, &new->k_i); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - mutex_unlock(&c->btree_cache.lock); - return 0; -} - -static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b, - struct btree *prev, struct btree *cur, - struct bpos *pulled_from_scan) -{ - struct bch_fs *c = trans->c; - struct bpos expected_start = !prev - ? b->data->min_key - : bpos_successor(prev->key.k.p); - struct printbuf buf = PRINTBUF; - int ret = 0; - - BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, - b->data->min_key)); - - if (bpos_eq(expected_start, cur->data->min_key)) - return 0; - - prt_printf(&buf, " at "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_printf(&buf, ":\nparent: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - if (prev) { - prt_printf(&buf, "\nprev: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key)); - } - - prt_str(&buf, "\nnext: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key)); - - if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */ - if (b->c.level == 1 && - bpos_lt(*pulled_from_scan, cur->data->min_key)) { - ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, - expected_start, - bpos_predecessor(cur->data->min_key)); - if (ret) - goto err; - - *pulled_from_scan = cur->data->min_key; - ret = DID_FILL_FROM_SCAN; - } else { - if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, - "btree node with incorrect min_key%s", buf.buf)) - ret = set_node_min(c, cur, expected_start); - } - } else { /* overlap */ - if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */ - if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? 
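
The decision tree spanning this point can be summarized as a pure function: each child must begin exactly one position past where its predecessor ended; a gap repairs the child's min_key (or refills the range from scanned nodes), while an overlap shrinks or drops whichever node is older. A condensed model with u64 positions standing in for struct bpos; the enum and names are illustrative:

    #include <stdint.h>

    enum fix { FIX_NONE, FIX_MIN_KEY, FIX_MAX_KEY, FIX_DROP_PREV, FIX_DROP_THIS };

    static enum fix check_boundary(uint64_t expected_start,
                                   uint64_t cur_min, uint64_t cur_max,
                                   int cur_is_newer, uint64_t prev_min)
    {
            if (expected_start == cur_min)
                    return FIX_NONE;
            if (expected_start < cur_min)    /* gap: repair cur's min_key */
                    return FIX_MIN_KEY;
            /* overlap */
            if (cur_is_newer)
                    return prev_min >= cur_min ? FIX_DROP_PREV : FIX_MAX_KEY;
            return expected_start >= cur_max ? FIX_DROP_THIS : FIX_MIN_KEY;
    }

    int main(void)
    {
            /* gap: child starts late, so its min_key gets repaired */
            return check_boundary(10, 20, 30, 0, 0) == FIX_MIN_KEY ? 0 : 1;
    }
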
*/ - if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node, - "btree node overwritten by next node%s", buf.buf)) - ret = DROP_PREV_NODE; - } else { - if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, - "btree node with incorrect max_key%s", buf.buf)) - ret = set_node_max(c, prev, - bpos_predecessor(cur->data->min_key)); - } - } else { - if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */ - if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node, - "btree node overwritten by prev node%s", buf.buf)) - ret = DROP_THIS_NODE; - } else { - if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, - "btree node with incorrect min_key%s", buf.buf)) - ret = set_node_min(c, cur, expected_start); - } - } - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, - struct btree *child, struct bpos *pulled_from_scan) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (bpos_eq(child->key.k.p, b->key.k.p)) - return 0; - - prt_printf(&buf, "\nat: "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_printf(&buf, "\nparent: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - prt_str(&buf, "\nchild: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key)); - - if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, - "btree node with incorrect max_key%s", buf.buf)) { - if (b->c.level == 1 && - bpos_lt(*pulled_from_scan, b->key.k.p)) { - ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, - bpos_successor(child->key.k.p), b->key.k.p); - if (ret) - goto err; - - *pulled_from_scan = b->key.k.p; - ret = DID_FILL_FROM_SCAN; - } else { - ret = set_node_max(c, child, b->key.k.p); - } - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b, - struct bpos *pulled_from_scan) -{ - struct bch_fs *c = trans->c; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct bkey_buf prev_k, cur_k; - struct btree *prev = NULL, *cur = NULL; - bool have_child, new_pass = false; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (!b->c.level) - return 0; - - bch2_bkey_buf_init(&prev_k); - bch2_bkey_buf_init(&cur_k); -again: - cur = prev = NULL; - have_child = new_pass = false; - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - iter.prefetch = true; - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bpos_lt(k.k->p, b->data->min_key)); - BUG_ON(bpos_gt(k.k->p, b->data->max_key)); - - bch2_btree_and_journal_iter_advance(&iter); - bch2_bkey_buf_reassemble(&cur_k, c, k); - - cur = bch2_btree_node_get_noiter(trans, cur_k.k, - b->c.btree_id, b->c.level - 1, - false); - ret = PTR_ERR_OR_ZERO(cur); - - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1); - prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - - if (bch2_err_matches(ret, EIO)) { - bch2_btree_node_evict(trans, cur_k.k); - cur = NULL; - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); - if (ret) - break; - continue; - } - - bch_err_msg(c, ret, "getting btree node"); - if (ret) - break; - - if (bch2_btree_node_is_stale(c, cur)) { - bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf); - six_unlock_read(&cur->c.lock); - bch2_btree_node_evict(trans, cur_k.k); - ret = bch2_journal_key_delete(c, 
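
Worth noting for the surrounding control flow: these helpers mix negative errnos with the small positive action codes defined at the top of this file (DROP_THIS_NODE, DROP_PREV_NODE, DID_FILL_FROM_SCAN), and each caller demultiplexes them in the same order. A sketch of that convention; the predicates are illustrative, not bcachefs API:

    #include <stdbool.h>

    static bool is_error(int ret)  { return ret < 0; }
    static bool is_action(int ret) { return ret > 0; }

    /* caller pattern, condensed:
     *      ret = check_node_boundaries(...);
     *      if (is_error(ret)) goto err;        propagate real failures
     *      if (ret == DID_FILL_FROM_SCAN) { new_pass = true; ret = 0; }
     *      if (ret == DROP_THIS_NODE) { evict node; delete journal key; }
     */

    int main(void)
    {
            return (is_error(-5) && is_action(12) && !is_action(0)) ? 0 : 1;
    }
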
b->c.btree_id, - b->c.level, cur_k.k->k.p); - cur = NULL; - if (ret) - break; - continue; - } - - ret = lockrestart_do(trans, - btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan)); - if (ret < 0) - goto err; - - if (ret == DID_FILL_FROM_SCAN) { - new_pass = true; - ret = 0; - } - - if (ret == DROP_THIS_NODE) { - six_unlock_read(&cur->c.lock); - bch2_btree_node_evict(trans, cur_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); - cur = NULL; - if (ret) - break; - continue; - } - - if (prev) - six_unlock_read(&prev->c.lock); - prev = NULL; - - if (ret == DROP_PREV_NODE) { - bch_info(c, "dropped prev node"); - bch2_btree_node_evict(trans, prev_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, prev_k.k->k.p); - if (ret) - break; - - bch2_btree_and_journal_iter_exit(&iter); - goto again; - } else if (ret) - break; - - prev = cur; - cur = NULL; - bch2_bkey_buf_copy(&prev_k, c, cur_k.k); - } - - if (!ret && !IS_ERR_OR_NULL(prev)) { - BUG_ON(cur); - ret = lockrestart_do(trans, - btree_repair_node_end(trans, b, prev, pulled_from_scan)); - if (ret == DID_FILL_FROM_SCAN) { - new_pass = true; - ret = 0; - } - } - - if (!IS_ERR_OR_NULL(prev)) - six_unlock_read(&prev->c.lock); - prev = NULL; - if (!IS_ERR_OR_NULL(cur)) - six_unlock_read(&cur->c.lock); - cur = NULL; - - if (ret) - goto err; - - bch2_btree_and_journal_iter_exit(&iter); - - if (new_pass) - goto again; - - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - iter.prefetch = true; - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - bch2_bkey_buf_reassemble(&cur_k, c, k); - bch2_btree_and_journal_iter_advance(&iter); - - cur = bch2_btree_node_get_noiter(trans, cur_k.k, - b->c.btree_id, b->c.level - 1, - false); - ret = PTR_ERR_OR_ZERO(cur); - - bch_err_msg(c, ret, "getting btree node"); - if (ret) - goto err; - - ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan); - six_unlock_read(&cur->c.lock); - cur = NULL; - - if (ret == DROP_THIS_NODE) { - bch2_btree_node_evict(trans, cur_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); - new_pass = true; - } - - if (ret) - goto err; - - have_child = true; - } - - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - /* - * XXX: we're not passing the trans object here because we're not set up - * to handle a transaction restart - this code needs to be rewritten - * when we start doing online topology repair - */ - bch2_trans_unlock_long(trans); - if (mustfix_fsck_err_on(!have_child, - c, btree_node_topology_interior_node_empty, - "empty interior btree node at %s", buf.buf)) - ret = DROP_THIS_NODE; -err: -fsck_err: - if (!IS_ERR_OR_NULL(prev)) - six_unlock_read(&prev->c.lock); - if (!IS_ERR_OR_NULL(cur)) - six_unlock_read(&cur->c.lock); - - bch2_btree_and_journal_iter_exit(&iter); - - if (!ret && new_pass) - goto again; - - BUG_ON(!ret && bch2_btree_node_check_topology(trans, b)); - - bch2_bkey_buf_exit(&prev_k, c); - bch2_bkey_buf_exit(&cur_k, c); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_check_root(struct btree_trans *trans, enum btree_id btree, - bool *reconstructed_root) -{ - struct bch_fs *c = trans->c; - struct btree_root *r = bch2_btree_id_root(c, btree); - struct printbuf buf = PRINTBUF; - int ret = 0; - - bch2_btree_id_to_text(&buf, btree); - - if (r->error) { - bch_info(c, "btree root %s unreadable, 
must recover from scan", buf.buf); - - ret = bch2_btree_has_scanned_nodes(c, btree); - if (ret < 0) - goto err; - - if (!ret) { - __fsck_err(trans, - FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0), - btree_root_unreadable_and_scan_found_nothing, - "no nodes found for btree %s, continue?", buf.buf); - - r->alive = false; - r->error = 0; - bch2_btree_root_alloc_fake_trans(trans, btree, 0); - } else { - r->alive = false; - r->error = 0; - bch2_btree_root_alloc_fake_trans(trans, btree, 1); - - bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX); - if (ret) - goto err; - } - - *reconstructed_root = true; - } -err: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -int bch2_check_topology(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bpos pulled_from_scan = POS_MIN; - int ret = 0; - - bch2_trans_srcu_unlock(trans); - - for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { - bool reconstructed_root = false; -recover: - ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root)); - if (ret) - break; - - struct btree_root *r = bch2_btree_id_root(c, i); - struct btree *b = r->b; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan); - six_unlock_read(&b->c.lock); - - if (ret == DROP_THIS_NODE) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - - r->b = NULL; - - if (!reconstructed_root) { - r->error = -EIO; - goto recover; - } - - struct printbuf buf = PRINTBUF; - bch2_btree_id_to_text(&buf, i); - bch_err(c, "empty btree root %s", buf.buf); - printbuf_exit(&buf); - bch2_btree_root_alloc_fake_trans(trans, i, 0); - r->alive = false; - ret = 0; - } - } - - bch2_trans_put(trans); - return ret; -} - -/* marking of btree keys/nodes: */ - -static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, - unsigned level, struct btree **prev, - struct btree_iter *iter, struct bkey_s_c k, - bool initial) -{ - struct bch_fs *c = trans->c; - - if (iter) { - struct btree_path *path = btree_iter_path(trans, iter); - struct btree *b = path_l(path)->b; - - if (*prev != b) { - int ret = bch2_btree_node_check_topology(trans, b); - if (ret) - return ret; - } - *prev = b; - } - - struct bkey deleted = KEY(0, 0, 0); - struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; - struct printbuf buf = PRINTBUF; - int ret = 0; - - deleted.p = k.k->p; - - if (initial) { - BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) && - k.k->bversion.lo > atomic64_read(&c->journal.seq)); - - if (fsck_err_on(btree_id != BTREE_ID_accounting && - k.k->bversion.lo > atomic64_read(&c->key_version), - trans, bkey_version_in_future, - "key version number higher than recorded %llu\n%s", - atomic64_read(&c->key_version), - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - atomic64_set(&c->key_version, k.k->bversion.lo); - } - - if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), - trans, btree_bitmap_not_marked, - "btree ptr not marked in member info btree allocated bitmap\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - mutex_lock(&c->sb_lock); - bch2_dev_btree_bitmap_mark(c, k); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - - /* - * We require a commit before key_trigger() because - * key_trigger(BTREE_TRIGGER_GC) is 
not idempotent; we'll calculate the - * wrong result if we run it multiple times. - */ - unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0; - - ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), - BTREE_TRIGGER_check_repair|flags); - if (ret) - goto out; - - if (trans->nr_updates) { - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } - - ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), - BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags); -out: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_gc_btree(struct btree_trans *trans, - struct progress_indicator_state *progress, - enum btree_id btree, bool initial) -{ - struct bch_fs *c = trans->c; - unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1; - int ret = 0; - - /* We need to make sure every leaf node is readable before going RW */ - if (initial) - target_depth = 0; - - for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) { - struct btree *prev = NULL; - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level, - BTREE_ITER_prefetch); - - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - bch2_progress_update_iter(trans, progress, &iter, "check_allocations"); - gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); - bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); - })); - if (ret) - goto err; - } - - /* root */ - do { -retry_root: - bch2_trans_begin(trans); - - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, - 0, bch2_btree_id_root(c, btree)->b->c.level, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err_root; - - if (b != btree_node_root(c, b)) { - bch2_trans_iter_exit(trans, &iter); - goto retry_root; - } - - gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX)); - struct bkey_s_c k = bkey_i_to_s_c(&b->key); - ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial); -err_root: - bch2_trans_iter_exit(trans, &iter); - } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); -err: - bch_err_fn(c, ret); - return ret; -} - -static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) -{ - return cmp_int(gc_btree_order(l), gc_btree_order(r)); -} - -static int bch2_gc_btrees(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct progress_indicator_state progress; - bch2_progress_init(&progress, c, ~0ULL); - - enum btree_id ids[BTREE_ID_NR]; - for (unsigned i = 0; i < BTREE_ID_NR; i++) - ids[i] = i; - bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); - - for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { - unsigned btree = i < BTREE_ID_NR ? 
ids[i] : i; - - if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) - continue; - - ret = bch2_gc_btree(trans, &progress, btree, true); - } - - printbuf_exit(&buf); - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_mark_superblocks(struct bch_fs *c) -{ - gc_pos_set(c, gc_phase(GC_PHASE_sb)); - - return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); -} - -static void bch2_gc_free(struct bch_fs *c) -{ - bch2_accounting_gc_free(c); - - genradix_free(&c->reflink_gc_table); - genradix_free(&c->gc_stripes); - - for_each_member_device(c, ca) - genradix_free(&ca->buckets_gc); -} - -static int bch2_gc_start(struct bch_fs *c) -{ - for_each_member_device(c, ca) { - int ret = bch2_dev_usage_init(ca, true); - if (ret) { - bch2_dev_put(ca); - return ret; - } - } - - return 0; -} - -/* returns true if not equal */ -static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, - struct bch_alloc_v4 r) -{ - return l.gen != r.gen || - l.oldest_gen != r.oldest_gen || - l.data_type != r.data_type || - l.dirty_sectors != r.dirty_sectors || - l.stripe_sectors != r.stripe_sectors || - l.cached_sectors != r.cached_sectors || - l.stripe_redundancy != r.stripe_redundancy || - l.stripe != r.stripe; -} - -static int bch2_alloc_write_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_dev *ca, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_i_alloc_v4 *a; - struct bch_alloc_v4 old_gc, gc, old_convert, new; - const struct bch_alloc_v4 *old; - int ret; - - if (!bucket_valid(ca, k.k->p.offset)) - return 0; - - old = bch2_alloc_to_v4(k, &old_convert); - gc = new = *old; - - __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset)); - - old_gc = gc; - - if ((old->data_type == BCH_DATA_sb || - old->data_type == BCH_DATA_journal) && - !bch2_dev_is_online(ca)) { - gc.data_type = old->data_type; - gc.dirty_sectors = old->dirty_sectors; - } - - /* - * gc.data_type doesn't yet include need_discard & need_gc_gen states - - * fix that here: - */ - alloc_data_type_set(&gc, gc.data_type); - if (gc.data_type != old_gc.data_type || - gc.dirty_sectors != old_gc.dirty_sectors) { - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc); - if (ret) - return ret; - - /* - * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not - * safe w.r.t. 
transaction restarts, so fixup the gc_bucket so - * we don't run it twice: - */ - struct bucket *gc_m = gc_bucket(ca, iter->pos.offset); - gc_m->data_type = gc.data_type; - gc_m->dirty_sectors = gc.dirty_sectors; - } - - if (fsck_err_on(new.data_type != gc.data_type, - trans, alloc_key_data_type_wrong, - "bucket %llu:%llu gen %u has wrong data_type" - ": got %s, should be %s", - iter->pos.inode, iter->pos.offset, - gc.gen, - bch2_data_type_str(new.data_type), - bch2_data_type_str(gc.data_type))) - new.data_type = gc.data_type; - -#define copy_bucket_field(_errtype, _f) \ - if (fsck_err_on(new._f != gc._f, \ - trans, _errtype, \ - "bucket %llu:%llu gen %u data type %s has wrong " #_f \ - ": got %llu, should be %llu", \ - iter->pos.inode, iter->pos.offset, \ - gc.gen, \ - bch2_data_type_str(gc.data_type), \ - (u64) new._f, (u64) gc._f)) \ - new._f = gc._f; \ - - copy_bucket_field(alloc_key_gen_wrong, gen); - copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors); - copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors); - copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors); - copy_bucket_field(alloc_key_stripe_wrong, stripe); - copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy); -#undef copy_bucket_field - - if (!bch2_alloc_v4_cmp(*old, new)) - return 0; - - a = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - return ret; - - a->v = new; - - /* - * The trigger normally makes sure these are set, but we're not running - * triggers: - */ - if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) - a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - - ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun); -fsck_err: - return ret; -} - -static int bch2_gc_alloc_done(struct bch_fs *c) -{ - int ret = 0; - - for_each_member_device(c, ca) { - ret = bch2_trans_run(c, - for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - POS(ca->dev_idx, ca->mi.nbuckets - 1), - BTREE_ITER_slots|BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_alloc_write_key(trans, &iter, ca, k))); - if (ret) { - bch2_dev_put(ca); - break; - } - } - - bch_err_fn(c, ret); - return ret; -} - -static int bch2_gc_alloc_start(struct bch_fs *c) -{ - int ret = 0; - - for_each_member_device(c, ca) { - ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL); - if (ret) { - bch2_dev_put(ca); - ret = bch_err_throw(c, ENOMEM_gc_alloc_start); - break; - } - } - - bch_err_fn(c, ret); - return ret; -} - -static int bch2_gc_write_stripes_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - const struct bch_stripe *s; - struct gc_stripe *m; - bool bad = false; - unsigned i; - int ret = 0; - - if (k.k->type != KEY_TYPE_stripe) - return 0; - - s = bkey_s_c_to_stripe(k).v; - m = genradix_ptr(&c->gc_stripes, k.k->p.offset); - - for (i = 0; i < s->nr_blocks; i++) { - u32 old = stripe_blockcount_get(s, i); - u32 new = (m ? 
m->block_sectors[i] : 0); - - if (old != new) { - prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n", - i, old, new); - bad = true; - } - } - - if (bad) - bch2_bkey_val_to_text(&buf, c, k); - - if (fsck_err_on(bad, - trans, stripe_sector_count_wrong, - "%s", buf.buf)) { - struct bkey_i_stripe *new; - - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bkey_reassemble(&new->k_i, k); - - for (i = 0; i < new->v.nr_blocks; i++) - stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); - - ret = bch2_trans_update(trans, iter, &new->k_i, 0); - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int bch2_gc_stripes_done(struct bch_fs *c) -{ - return bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_stripes, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_gc_write_stripes_key(trans, &iter, k))); -} - -/** - * bch2_check_allocations - walk all references to buckets, and recompute them: - * - * @c: filesystem object - * - * Returns: 0 on success, or standard errcode on failure - * - * Order matters here: - * - Concurrent GC relies on the fact that we have a total ordering for - * everything that GC walks - see gc_will_visit_node(), - * gc_will_visit_root() - * - * - also, references move around in the course of index updates and - * various other crap: everything needs to agree on the ordering - * references are allowed to move around in - e.g., we're allowed to - * start with a reference owned by an open_bucket (the allocator) and - * move it to the btree, but not the reverse. - * - * This is necessary to ensure that gc doesn't miss references that - * move around - if references move backwards in the ordering GC - * uses, GC could skip past them - */ -int bch2_check_allocations(struct bch_fs *c) -{ - int ret; - - down_read(&c->state_lock); - down_write(&c->gc_lock); - - bch2_btree_interior_updates_flush(c); - - ret = bch2_gc_accounting_start(c) ?: - bch2_gc_start(c) ?: - bch2_gc_alloc_start(c) ?: - bch2_gc_reflink_start(c); - if (ret) - goto out; - - gc_pos_set(c, gc_phase(GC_PHASE_start)); - - ret = bch2_mark_superblocks(c); - bch_err_msg(c, ret, "marking superblocks"); - if (ret) - goto out; - - ret = bch2_gc_btrees(c); - if (ret) - goto out; - - c->gc_count++; - - ret = bch2_gc_alloc_done(c) ?: - bch2_gc_accounting_done(c) ?: - bch2_gc_stripes_done(c) ?: - bch2_gc_reflink_done(c); -out: - percpu_down_write(&c->mark_lock); - /* Indicates that gc is no longer in progress: */ - __gc_pos_set(c, gc_phase(GC_PHASE_not_running)); - - bch2_gc_free(c); - percpu_up_write(&c->mark_lock); - - up_write(&c->gc_lock); - up_read(&c->state_lock); - - /* - * At startup, allocations can happen directly instead of via the - * allocator thread - issue wakeup in case they blocked on gc_lock: - */ - closure_wake_up(&c->freelist_wait); - - if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags)) - bch2_sb_members_clean_deleted(c); - - bch_err_fn(c, ret); - return ret; -} - -static int gc_btree_gens_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) - return -EROFS; - - bool too_stale = false; - scoped_guard(rcu) { - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; - - too_stale |= dev_ptr_stale(ca, ptr) > 16; - } - - if (!too_stale) - 
bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; - - u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; - if (gen_after(*gen, ptr->gen)) - *gen = ptr->gen; - } - } - - if (too_stale) { - struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0); - int ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - bch2_extent_normalize(c, bkey_i_to_s(u)); - } - - return 0; -} - -static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca, - struct btree_iter *iter, struct bkey_s_c k) -{ - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - struct bkey_i_alloc_v4 *a_mut; - int ret; - - if (a->oldest_gen == ca->oldest_gen[iter->pos.offset]) - return 0; - - a_mut = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a_mut); - if (ret) - return ret; - - a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; - - return bch2_trans_update(trans, iter, &a_mut->k_i, 0); -} - -int bch2_gc_gens(struct bch_fs *c) -{ - u64 b, start_time = local_clock(); - int ret; - - if (!mutex_trylock(&c->gc_gens_lock)) - return 0; - - trace_and_count(c, gc_gens_start, c); - - /* - * We have to use trylock here. Otherwise, we would - * introduce a deadlock in the RO path - we take the - * state lock at the start of going RO. - */ - if (!down_read_trylock(&c->state_lock)) { - mutex_unlock(&c->gc_gens_lock); - return 0; - } - - for_each_member_device(c, ca) { - struct bucket_gens *gens = bucket_gens(ca); - - BUG_ON(ca->oldest_gen); - - ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); - if (!ca->oldest_gen) { - bch2_dev_put(ca); - ret = bch_err_throw(c, ENOMEM_gc_gens); - goto err; - } - - for (b = gens->first_bucket; - b < gens->nbuckets; b++) - ca->oldest_gen[b] = gens->b[b]; - } - - for (unsigned i = 0; i < BTREE_ID_NR; i++) - if (btree_type_has_ptrs(i)) { - c->gc_gens_btree = i; - c->gc_gens_pos = POS_MIN; - - ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, i, - POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, - k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - gc_btree_gens_key(trans, &iter, k))); - if (ret) - goto err; - } - - struct bch_dev *ca = NULL; - ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, - BTREE_ITER_prefetch, - k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, ({ - ca = bch2_dev_iterate(c, ca, k.k->p.inode); - if (!ca) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - bch2_alloc_write_oldest_gen(trans, ca, &iter, k); - }))); - bch2_dev_put(ca); - - if (ret) - goto err; - - c->gc_gens_btree = 0; - c->gc_gens_pos = POS_MIN; - - c->gc_count++; - - bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); - trace_and_count(c, gc_gens_end, c); -err: - for_each_member_device(c, ca) { - kvfree(ca->oldest_gen); - ca->oldest_gen = NULL; - } - - up_read(&c->state_lock); - mutex_unlock(&c->gc_gens_lock); - if (!bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); - return ret; -} - -static void bch2_gc_gens_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work); - bch2_gc_gens(c); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); -} - -void bch2_gc_gens_async(struct bch_fs *c) -{ - if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) && - !queue_work(c->write_ref_wq, &c->gc_gens_work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); -} - -void bch2_fs_btree_gc_init_early(struct bch_fs *c) -{ - 
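/* Early, pre-rw setup: the seqlock guarding the current gc position, the async gc_gens worker, and the locks that serialize mark-and-sweep gc with other filesystem state changes: */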
seqcount_init(&c->gc_pos_lock); - INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); - - init_rwsem(&c->gc_lock); - mutex_init(&c->gc_gens_lock); -} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h deleted file mode 100644 index ec77662369a2..000000000000 --- a/fs/bcachefs/btree_gc.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_GC_H -#define _BCACHEFS_BTREE_GC_H - -#include "bkey.h" -#include "btree_gc_types.h" -#include "btree_types.h" - -int bch2_check_topology(struct bch_fs *); -int bch2_check_allocations(struct bch_fs *); - -/* - * For concurrent mark and sweep (with other index updates), we define a total - * ordering of _all_ references GC walks: - * - * Note that some references will have the same GC position as others - e.g. - * everything within the same btree node; in those cases we're relying on - * whatever locking exists for where those references live, i.e. the write lock - * on a btree node. - * - * That locking is also required to ensure GC doesn't pass the updater in - * between the updater adding/removing the reference and updating the GC marks; - * without that, we would at best double count sometimes. - * - * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ - * be held that prevents GC from passing the position the updater is at. - * - * (What about the start of gc, when we're clearing all the marks? GC clears the - * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc - * position inside its cmpxchg loop, so crap magically works). - */ - -/* Position of (the start of) a gc phase: */ -static inline struct gc_pos gc_phase(enum gc_phase phase) -{ - return (struct gc_pos) { .phase = phase, }; -} - -static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level, - struct bpos pos) -{ - return (struct gc_pos) { - .phase = GC_PHASE_btree, - .btree = btree, - .level = level, - .pos = pos, - }; -} - -static inline int gc_btree_order(enum btree_id btree) -{ - if (btree == BTREE_ID_alloc) - return -2; - if (btree == BTREE_ID_stripes) - return -1; - return btree; -} - -static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) -{ - return cmp_int(l.phase, r.phase) ?: - cmp_int(gc_btree_order(l.btree), - gc_btree_order(r.btree)) ?: - cmp_int(l.level, r.level) ?: - bpos_cmp(l.pos, r.pos); -} - -static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) -{ - unsigned seq; - bool ret; - - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - ret = gc_pos_cmp(pos, c->gc_pos) <= 0; - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - - return ret; -} - -void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); - -int bch2_gc_gens(struct bch_fs *); -void bch2_gc_gens_async(struct bch_fs *); - -void bch2_fs_btree_gc_init_early(struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h deleted file mode 100644 index c24dd6edf377..000000000000 --- a/fs/bcachefs/btree_gc_types.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_GC_TYPES_H -#define _BCACHEFS_BTREE_GC_TYPES_H - -#include <linux/generic-radix-tree.h> - -#define GC_PHASES() \ - x(not_running) \ - x(start) \ - x(sb) \ - x(btree) - -enum gc_phase { -#define x(n) GC_PHASE_##n, - GC_PHASES() -#undef x -}; - -struct gc_pos { - enum gc_phase phase:8; - enum btree_id btree:8; - u16 level; - struct bpos pos; -}; - -struct reflink_gc { - u64 offset; - u32 size; - u32 refcount; -}; 
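/*
 * Illustrative sketch, not part of this patch: gc_visited() in the deleted
 * btree_gc.h above relies on the stock seqcount read-retry pattern from
 * <linux/seqlock.h> to read the current gc position without taking a lock.
 * A minimal standalone version of that pattern - the helper name and the
 * plain u64 shared value are hypothetical stand-ins for gc_pos - would be:
 */
#include <linux/seqlock.h>

static u64 read_shared_pos(seqcount_t *sc, const u64 *shared_pos)
{
	unsigned seq;
	u64 pos;

	do {
		seq = read_seqcount_begin(sc);		/* snapshot the write generation */
		pos = READ_ONCE(*shared_pos);		/* read the protected value */
	} while (read_seqcount_retry(sc, seq));		/* a writer raced with us: retry */

	return pos;
}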
- -typedef GENRADIX(struct reflink_gc) reflink_gc_table; - -#endif /* _BCACHEFS_BTREE_GC_TYPES_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c deleted file mode 100644 index 590cd29f3e86..000000000000 --- a/fs/bcachefs/btree_io.c +++ /dev/null @@ -1,2742 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "async_objs.h" -#include "bkey_buf.h" -#include "bkey_methods.h" -#include "bkey_sort.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "buckets.h" -#include "checksum.h" -#include "debug.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "io_write.h" -#include "journal_reclaim.h" -#include "journal_seq_blacklist.h" -#include "recovery.h" -#include "super-io.h" -#include "trace.h" - -#include <linux/sched/mm.h> - -static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) -{ - bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); - prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn)); - prt_str(out, "min: "); - bch2_bpos_to_text(out, bn->min_key); - prt_newline(out); - prt_str(out, "max: "); - bch2_bpos_to_text(out, bn->max_key); -} - -void bch2_btree_node_io_unlock(struct btree *b) -{ - EBUG_ON(!btree_node_write_in_flight(b)); - - clear_btree_node_write_in_flight_inner(b); - clear_btree_node_write_in_flight(b); - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -} - -void bch2_btree_node_io_lock(struct btree *b) -{ - wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} - -void __bch2_btree_node_wait_on_read(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); -} - -void __bch2_btree_node_wait_on_write(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} - -void bch2_btree_node_wait_on_read(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); -} - -void bch2_btree_node_wait_on_write(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} - -static void verify_no_dups(struct btree *b, - struct bkey_packed *start, - struct bkey_packed *end) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct bkey_packed *k, *p; - - if (start == end) - return; - - for (p = start, k = bkey_p_next(start); - k != end; - p = k, k = bkey_p_next(k)) { - struct bkey l = bkey_unpack_key(b, p); - struct bkey r = bkey_unpack_key(b, k); - - BUG_ON(bpos_ge(l.p, bkey_start_pos(&r))); - } -#endif -} - -static void set_needs_whiteout(struct bset *i, int v) -{ - struct bkey_packed *k; - - for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) - k->needs_whiteout = v; -} - -static void btree_bounce_free(struct bch_fs *c, size_t size, - bool used_mempool, void *p) -{ - if (used_mempool) - mempool_free(p, &c->btree_bounce_pool); - else - kvfree(p); -} - -static void *btree_bounce_alloc(struct bch_fs *c, size_t size, - bool *used_mempool) -{ - unsigned flags = memalloc_nofs_save(); - void *p; - - BUG_ON(size > c->opts.btree_node_size); - - *used_mempool = false; - p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT); - if (!p) { - *used_mempool = true; - p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); - } - memalloc_nofs_restore(flags); - return p; -} - -static void sort_bkey_ptrs(const struct btree *bt, - struct 
bkey_packed **ptrs, unsigned nr) -{ - unsigned n = nr, a = nr / 2, b, c, d; - - if (!a) - return; - - /* Heap sort: see lib/sort.c: */ - while (1) { - if (a) - a--; - else if (--n) - swap(ptrs[0], ptrs[n]); - else - break; - - for (b = a; c = 2 * b + 1, (d = c + 1) < n;) - b = bch2_bkey_cmp_packed(bt, - ptrs[c], - ptrs[d]) >= 0 ? c : d; - if (d == n) - b = c; - - while (b != a && - bch2_bkey_cmp_packed(bt, - ptrs[a], - ptrs[b]) >= 0) - b = (b - 1) / 2; - c = b; - while (b != a) { - b = (b - 1) / 2; - swap(ptrs[b], ptrs[c]); - } - } -} - -static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) -{ - struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; - bool used_mempool = false; - size_t bytes = b->whiteout_u64s * sizeof(u64); - - if (!b->whiteout_u64s) - return; - - new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); - - ptrs = ptrs_end = ((void *) new_whiteouts + bytes); - - for (k = unwritten_whiteouts_start(b); - k != unwritten_whiteouts_end(b); - k = bkey_p_next(k)) - *--ptrs = k; - - sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); - - k = new_whiteouts; - - while (ptrs != ptrs_end) { - bkey_p_copy(k, *ptrs); - k = bkey_p_next(k); - ptrs++; - } - - verify_no_dups(b, new_whiteouts, - (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); - - memcpy_u64s(unwritten_whiteouts_start(b), - new_whiteouts, b->whiteout_u64s); - - btree_bounce_free(c, bytes, used_mempool, new_whiteouts); -} - -static bool should_compact_bset(struct btree *b, struct bset_tree *t, - bool compacting, enum compact_mode mode) -{ - if (!bset_dead_u64s(b, t)) - return false; - - switch (mode) { - case COMPACT_LAZY: - return should_compact_bset_lazy(b, t) || - (compacting && !bset_written(b, bset(b, t))); - case COMPACT_ALL: - return true; - default: - BUG(); - } -} - -static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) -{ - bool ret = false; - - for_each_bset(b, t) { - struct bset *i = bset(b, t); - struct bkey_packed *k, *n, *out, *start, *end; - struct btree_node_entry *src = NULL, *dst = NULL; - - if (t != b->set && !bset_written(b, i)) { - src = container_of(i, struct btree_node_entry, keys); - dst = max(write_block(b), - (void *) btree_bkey_last(b, t - 1)); - } - - if (src != dst) - ret = true; - - if (!should_compact_bset(b, t, ret, mode)) { - if (src != dst) { - memmove(dst, src, sizeof(*src) + - le16_to_cpu(src->keys.u64s) * - sizeof(u64)); - i = &dst->keys; - set_btree_bset(b, t, i); - } - continue; - } - - start = btree_bkey_first(b, t); - end = btree_bkey_last(b, t); - - if (src != dst) { - memmove(dst, src, sizeof(*src)); - i = &dst->keys; - set_btree_bset(b, t, i); - } - - out = i->start; - - for (k = start; k != end; k = n) { - n = bkey_p_next(k); - - if (!bkey_deleted(k)) { - bkey_p_copy(out, k); - out = bkey_p_next(out); - } else { - BUG_ON(k->needs_whiteout); - } - } - - i->u64s = cpu_to_le16((u64 *) out - i->_data); - set_btree_bset_end(b, t); - bch2_bset_set_no_aux_tree(b, t); - ret = true; - } - - bch2_verify_btree_nr_keys(b); - - bch2_btree_build_aux_trees(b); - - return ret; -} - -bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, - enum compact_mode mode) -{ - return bch2_drop_whiteouts(b, mode); -} - -static void btree_node_sort(struct bch_fs *c, struct btree *b, - unsigned start_idx, - unsigned end_idx) -{ - struct btree_node *out; - struct sort_iter_stack sort_iter; - struct bset_tree *t; - struct bset *start_bset = bset(b, &b->set[start_idx]); - bool used_mempool = false; - u64 start_time, seq = 0; - unsigned i, u64s = 0, bytes, shift = 
end_idx - start_idx - 1; - bool sorting_entire_node = start_idx == 0 && - end_idx == b->nsets; - - sort_iter_stack_init(&sort_iter, b); - - for (t = b->set + start_idx; - t < b->set + end_idx; - t++) { - u64s += le16_to_cpu(bset(b, t)->u64s); - sort_iter_add(&sort_iter.iter, - btree_bkey_first(b, t), - btree_bkey_last(b, t)); - } - - bytes = sorting_entire_node - ? btree_buf_bytes(b) - : __vstruct_bytes(struct btree_node, u64s); - - out = btree_bounce_alloc(c, bytes, &used_mempool); - - start_time = local_clock(); - - u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter); - - out->keys.u64s = cpu_to_le16(u64s); - - BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); - - if (sorting_entire_node) - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], - start_time); - - /* Make sure we preserve bset journal_seq: */ - for (t = b->set + start_idx; t < b->set + end_idx; t++) - seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); - start_bset->journal_seq = cpu_to_le64(seq); - - if (sorting_entire_node) { - u64s = le16_to_cpu(out->keys.u64s); - - BUG_ON(bytes != btree_buf_bytes(b)); - - /* - * Our temporary buffer is the same size as the btree node's - * buffer, we can just swap buffers instead of doing a big - * memcpy() - */ - *out = *b->data; - out->keys.u64s = cpu_to_le16(u64s); - swap(out, b->data); - set_btree_bset(b, b->set, &b->data->keys); - } else { - start_bset->u64s = out->keys.u64s; - memcpy_u64s(start_bset->start, - out->keys.start, - le16_to_cpu(out->keys.u64s)); - } - - for (i = start_idx + 1; i < end_idx; i++) - b->nr.bset_u64s[start_idx] += - b->nr.bset_u64s[i]; - - b->nsets -= shift; - - for (i = start_idx + 1; i < b->nsets; i++) { - b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; - b->set[i] = b->set[i + shift]; - } - - for (i = b->nsets; i < MAX_BSETS; i++) - b->nr.bset_u64s[i] = 0; - - set_btree_bset_end(b, &b->set[start_idx]); - bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); - - btree_bounce_free(c, bytes, used_mempool, out); - - bch2_verify_btree_nr_keys(b); -} - -void bch2_btree_sort_into(struct bch_fs *c, - struct btree *dst, - struct btree *src) -{ - struct btree_nr_keys nr; - struct btree_node_iter src_iter; - u64 start_time = local_clock(); - - BUG_ON(dst->nsets != 1); - - bch2_bset_set_no_aux_tree(dst, dst->set); - - bch2_btree_node_iter_init_from_start(&src_iter, src); - - nr = bch2_sort_repack(btree_bset_first(dst), - src, &src_iter, - &dst->format, - true); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], - start_time); - - set_btree_bset_end(dst, dst->set); - - dst->nr.live_u64s += nr.live_u64s; - dst->nr.bset_u64s[0] += nr.bset_u64s[0]; - dst->nr.packed_keys += nr.packed_keys; - dst->nr.unpacked_keys += nr.unpacked_keys; - - bch2_verify_btree_nr_keys(dst); -} - -/* - * We're about to add another bset to the btree node, so if there's currently - * too many bsets - sort some of them together: - */ -static bool btree_node_compact(struct bch_fs *c, struct btree *b) -{ - unsigned unwritten_idx; - bool ret = false; - - for (unwritten_idx = 0; - unwritten_idx < b->nsets; - unwritten_idx++) - if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) - break; - - if (b->nsets - unwritten_idx > 1) { - btree_node_sort(c, b, unwritten_idx, b->nsets); - ret = true; - } - - if (unwritten_idx > 1) { - btree_node_sort(c, b, 0, unwritten_idx); - ret = true; - } - - return ret; -} - -void bch2_btree_build_aux_trees(struct btree *b) -{ - for_each_bset(b, t) - bch2_bset_build_aux_tree(b, t, - !bset_written(b, bset(b, t)) && - t == bset_tree_last(b)); 
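/* Only the last, still-unwritten bset gets the read-write aux tree; bsets already written to disk get the read-only eytzinger search tree. */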
-} - -/* - * If we have MAX_BSETS (3) bsets, should we sort them all down to just one? - * - * The first bset is going to be of similar order to the size of the node, the - * last bset is bounded by btree_write_set_buffer(), which is set to keep the - * memmove on insert from being too expensive: the middle bset should, ideally, - * be the geometric mean of the first and the last. - * - * Returns true if the middle bset is greater than that geometric mean: - */ -static inline bool should_compact_all(struct bch_fs *c, struct btree *b) -{ - unsigned mid_u64s_bits = - (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2; - - return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits; -} - -/* - * @bch2_btree_init_next - initialize a new (unwritten) bset that can then be - * inserted into - * - * Safe to call if there already is an unwritten bset - will only add a new bset - * if @b doesn't already have one. - * - * Returns true if we sorted (i.e. invalidated iterators) - */ -void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) -{ - struct bch_fs *c = trans->c; - struct btree_node_entry *bne; - bool reinit_iter = false; - - EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]); - BUG_ON(bset_written(b, bset(b, &b->set[1]))); - BUG_ON(btree_node_just_written(b)); - - if (b->nsets == MAX_BSETS && - !btree_node_write_in_flight(b) && - should_compact_all(c, b)) { - bch2_btree_node_write_trans(trans, b, SIX_LOCK_write, - BTREE_WRITE_init_next_bset); - reinit_iter = true; - } - - if (b->nsets == MAX_BSETS && - btree_node_compact(c, b)) - reinit_iter = true; - - BUG_ON(b->nsets >= MAX_BSETS); - - bne = want_new_bset(c, b); - if (bne) - bch2_bset_init_next(b, bne); - - bch2_btree_build_aux_trees(b); - - if (reinit_iter) - bch2_trans_node_reinit_iter(trans, b); -} - -static void btree_err_msg(struct printbuf *out, struct bch_fs *c, - struct bch_dev *ca, - bool print_pos, - struct btree *b, struct bset *i, struct bkey_packed *k, - unsigned offset, int rw) -{ - if (print_pos) { - prt_str(out, rw == READ - ? "error validating btree node " - : "corrupt btree node before write "); - prt_printf(out, "at btree "); - bch2_btree_pos_to_text(out, c, b); - prt_newline(out); - } - - if (ca) - prt_printf(out, "%s ", ca->name); - - prt_printf(out, "node offset %u/%u", - b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key))); - if (i) - prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); - if (k) - prt_printf(out, " bset byte offset %lu", - (unsigned long)(void *)k - - ((unsigned long)(void *)i & ~511UL)); - prt_str(out, ": "); -} - -__printf(11, 12) -static int __btree_err(int ret, - struct bch_fs *c, - struct bch_dev *ca, - struct btree *b, - struct bset *i, - struct bkey_packed *k, - int rw, - enum bch_sb_error_id err_type, - struct bch_io_failures *failed, - struct printbuf *err_msg, - const char *fmt, ...) -{ - if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) - return ret == -BCH_ERR_btree_node_read_err_fixable - ? 
bch_err_throw(c, fsck_fix) - : ret; - - bool have_retry = false; - int ret2; - - if (ca) { - bch2_mark_btree_validate_failure(failed, ca->dev_idx); - - struct extent_ptr_decoded pick; - have_retry = bch2_bkey_pick_read_device(c, - bkey_i_to_s_c(&b->key), - failed, &pick, -1) == 1; - } - - if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) - ret = bch_err_throw(c, btree_node_read_err_fixable); - if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) - ret = bch_err_throw(c, btree_node_read_err_bad_node); - - bch2_sb_error_count(c, err_type); - - bool print_deferred = err_msg && - rw == READ && - !(test_bit(BCH_FS_in_fsck, &c->flags) && - c->opts.fix_errors == FSCK_FIX_ask); - - struct printbuf out = PRINTBUF; - bch2_log_msg_start(c, &out); - - if (!print_deferred) - err_msg = &out; - - btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw); - - va_list args; - va_start(args, fmt); - prt_vprintf(err_msg, fmt, args); - va_end(args); - - if (print_deferred) { - prt_newline(err_msg); - - switch (ret) { - case -BCH_ERR_btree_node_read_err_fixable: - ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type); - if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && - !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { - ret = ret2; - goto fsck_err; - } - - if (!have_retry) - ret = bch_err_throw(c, fsck_fix); - goto out; - case -BCH_ERR_btree_node_read_err_bad_node: - prt_str(&out, ", "); - break; - } - - goto out; - } - - if (rw == WRITE) { - prt_str(&out, ", "); - ret = __bch2_inconsistent_error(c, &out) - ? -BCH_ERR_fsck_errors_not_fixed - : 0; - goto print; - } - - switch (ret) { - case -BCH_ERR_btree_node_read_err_fixable: - ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf); - if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && - !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { - ret = ret2; - goto fsck_err; - } - - if (!have_retry) - ret = bch_err_throw(c, fsck_fix); - goto out; - case -BCH_ERR_btree_node_read_err_bad_node: - prt_str(&out, ", "); - break; - } -print: - bch2_print_str(c, KERN_ERR, out.buf); -out: -fsck_err: - printbuf_exit(&out); - return ret; -} - -#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \ -({ \ - int _ret = __btree_err(type, c, ca, b, i, k, write, \ - BCH_FSCK_ERR_##_err_type, \ - failed, err_msg, \ - msg, ##__VA_ARGS__); \ - \ - if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \ - ret = _ret; \ - goto fsck_err; \ - } \ - \ - true; \ -}) - -#define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) - -/* - * When btree topology repair changes the start or end of a node, that might - * mean we have to drop keys that are no longer inside the node: - */ -__cold -void bch2_btree_node_drop_keys_outside_node(struct btree *b) -{ - for_each_bset(b, t) { - struct bset *i = bset(b, t); - struct bkey_packed *k; - - for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) - if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) - break; - - if (k != i->start) { - unsigned shift = (u64 *) k - (u64 *) i->start; - - memmove_u64s_down(i->start, k, - (u64 *) vstruct_end(i) - (u64 *) k); - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift); - set_btree_bset_end(b, t); - } - - for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) - if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) - break; - - if (k != vstruct_last(i)) { - i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); - set_btree_bset_end(b, t); - } - } - - /* - * Always rebuild search trees: eytzinger search tree nodes directly - * depend on the values of min/max key: - */ - bch2_bset_set_no_aux_tree(b, b->set); - bch2_btree_build_aux_trees(b); - b->nr = bch2_btree_node_count_keys(b); - - struct bkey_s_c k; - struct bkey unpacked; - struct btree_node_iter iter; - for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { - BUG_ON(bpos_lt(k.k->p, b->data->min_key)); - BUG_ON(bpos_gt(k.k->p, b->data->max_key)); - } -} - -static int validate_bset(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, struct bset *i, - unsigned offset, int write, - struct bch_io_failures *failed, - struct printbuf *err_msg) -{ - unsigned version = le16_to_cpu(i->version); - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - int ret = 0; - - btree_err_on(!bch2_version_compatible(version), - -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, NULL, - btree_node_unsupported_version, - "unsupported bset version %u.%u", - BCH_VERSION_MAJOR(version), - BCH_VERSION_MINOR(version)); - - if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes && - btree_err_on(version < c->sb.version_min, - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, NULL, - btree_node_bset_older_than_sb_min, - "bset version %u older than superblock version_min %u", - version, c->sb.version_min)) { - if (bch2_version_compatible(version)) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->version_min = cpu_to_le16(version); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } else { - /* We have no idea what's going on: */ - i->version = cpu_to_le16(c->sb.version); - } - } - - if (btree_err_on(BCH_VERSION_MAJOR(version) > - BCH_VERSION_MAJOR(c->sb.version), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, NULL, - btree_node_bset_newer_than_sb, - "bset version %u newer than superblock version %u", - version, c->sb.version)) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->version = cpu_to_le16(version); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - - btree_err_on(BSET_SEPARATE_WHITEOUTS(i), - -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, NULL, - btree_node_unsupported_version, - "BSET_SEPARATE_WHITEOUTS no longer supported"); - - btree_err_on(offset && !i->u64s, - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_empty, - "empty bset"); - - btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_wrong_sector_offset, - "bset at wrong sector offset"); - - if (!offset) { - struct btree_node *bn = - 
container_of(i, struct btree_node, keys); - /* These indicate that we read the wrong btree node: */ - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - struct bch_btree_ptr_v2 *bp = - &bkey_i_to_btree_ptr_v2(&b->key)->v; - - /* XXX endianness */ - btree_err_on(bp->seq != bn->keys.seq, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - bset_bad_seq, - "incorrect sequence number (wrong btree node)"); - } - - btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, NULL, - btree_node_bad_btree, - "incorrect btree id"); - - btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, NULL, - btree_node_bad_level, - "incorrect level"); - - if (!write) - compat_btree_node(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, bn); - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - struct bch_btree_ptr_v2 *bp = - &bkey_i_to_btree_ptr_v2(&b->key)->v; - - if (BTREE_PTR_RANGE_UPDATED(bp)) { - b->data->min_key = bp->min_key; - b->data->max_key = b->key.k.p; - } - - btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_min_key, - "incorrect min_key: got %s should be %s", - (printbuf_reset(&buf1), - bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), - (printbuf_reset(&buf2), - bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); - } - - btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, NULL, - btree_node_bad_max_key, - "incorrect max key %s", - (printbuf_reset(&buf1), - bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); - - if (write) - compat_btree_node(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, bn); - - btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), - -BCH_ERR_btree_node_read_err_bad_node, - c, ca, b, i, NULL, - btree_node_bad_format, - "invalid bkey format: %s\n%s", buf1.buf, - (printbuf_reset(&buf2), - bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); - printbuf_reset(&buf1); - - compat_bformat(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, - &bn->format); - } -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - -static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - enum bch_validate_flags flags) -{ - return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level, - .btree = b->c.btree_id, - .flags = flags - }); -} - -static int bset_key_validate(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - bool updated_range, - enum bch_validate_flags flags) -{ - struct bkey_validate_context from = (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level, - .btree = b->c.btree_id, - .flags = flags, - }; - return __bch2_bkey_validate(c, k, from) ?: - (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?: - (flags & BCH_VALIDATE_write ? 
btree_node_bkey_val_validate(c, b, k, flags) : 0); -} - -static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, - struct bset *i, struct bkey_packed *k) -{ - if (bkey_p_next(k) > vstruct_last(i)) - return false; - - if (k->format > KEY_FORMAT_CURRENT) - return false; - - if (!bkeyp_u64s_valid(&b->format, k)) - return false; - - struct bkey tmp; - struct bkey_s u = __bkey_disassemble(b, k, &tmp); - return !__bch2_bkey_validate(c, u.s_c, - (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level, - .btree = b->c.btree_id, - .flags = BCH_VALIDATE_silent - }); -} - -static inline int btree_node_read_bkey_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed(b, l, r) - ?: (int) bkey_deleted(r) - (int) bkey_deleted(l); -} - -static int validate_bset_keys(struct bch_fs *c, struct btree *b, - struct bset *i, int write, - struct bch_io_failures *failed, - struct printbuf *err_msg) -{ - unsigned version = le16_to_cpu(i->version); - struct bkey_packed *k, *prev = NULL; - struct printbuf buf = PRINTBUF; - bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && - BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); - int ret = 0; - - for (k = i->start; - k != vstruct_last(i);) { - struct bkey_s u; - struct bkey tmp; - unsigned next_good_key; - - if (btree_err_on(bkey_p_next(k) > vstruct_last(i), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bkey_past_bset_end, - "key extends past end of bset")) { - i->u64s = cpu_to_le16((u64 *) k - i->_data); - break; - } - - if (btree_err_on(k->format > KEY_FORMAT_CURRENT, - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bkey_bad_format, - "invalid bkey format %u", k->format)) - goto drop_this_key; - - if (btree_err_on(!bkeyp_u64s_valid(&b->format, k), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bkey_bad_u64s, - "bad k->u64s %u (min %u max %zu)", k->u64s, - bkeyp_key_u64s(&b->format, k), - U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k))) - goto drop_this_key; - - if (!write) - bch2_bkey_compat(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, - &b->format, k); - - u = __bkey_disassemble(b, k, &tmp); - - ret = bset_key_validate(c, b, u.s_c, updated_range, write); - if (ret == -BCH_ERR_fsck_delete_bkey) - goto drop_this_key; - if (ret) - goto fsck_err; - - if (write) - bch2_bkey_compat(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, - &b->format, k); - - if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) { - struct bkey up = bkey_unpack_key(b, prev); - - printbuf_reset(&buf); - prt_printf(&buf, "keys out of order: "); - bch2_bkey_to_text(&buf, &up); - prt_printf(&buf, " > "); - bch2_bkey_to_text(&buf, u.k); - - if (btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bkey_out_of_order, - "%s", buf.buf)) - goto drop_this_key; - } - - prev = k; - k = bkey_p_next(k); - continue; -drop_this_key: - next_good_key = k->u64s; - - if (!next_good_key || - (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN && - version >= bcachefs_metadata_version_snapshot)) { - /* - * only do scanning if bch2_bkey_compat() has nothing to - * do - */ - - if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { - for (next_good_key = 1; - next_good_key < (u64 *) vstruct_last(i) - (u64 *) k; - next_good_key++) - if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) - goto got_good_key; - } - - /* - * didn't 
find a good key, have to truncate the rest of - * the bset - */ - next_good_key = (u64 *) vstruct_last(i) - (u64 *) k; - } -got_good_key: - le16_add_cpu(&i->u64s, -next_good_key); - memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k); - set_btree_node_need_rewrite(b); - set_btree_node_need_rewrite_error(b); - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, - struct bch_io_failures *failed, - struct printbuf *err_msg) -{ - struct btree_node_entry *bne; - struct sort_iter *iter; - struct btree_node *sorted; - struct bkey_packed *k; - struct bset *i; - bool used_mempool, blacklisted; - bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && - BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); - unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); - u64 max_journal_seq = 0; - struct printbuf buf = PRINTBUF; - int ret = 0, write = READ; - u64 start_time = local_clock(); - - b->version_ondisk = U16_MAX; - /* We might get called multiple times on read retry: */ - b->written = 0; - - iter = mempool_alloc(&c->fill_iter, GFP_NOFS); - sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2); - - if (bch2_meta_read_fault("btree")) - btree_err(-BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_fault_injected, - "dynamic fault"); - - btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_magic, - "bad magic: want %llx, got %llx", - bset_magic(c), le64_to_cpu(b->data->magic)); - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - struct bch_btree_ptr_v2 *bp = - &bkey_i_to_btree_ptr_v2(&b->key)->v; - - bch2_bpos_to_text(&buf, b->data->min_key); - prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, b->data->max_key); - - btree_err_on(b->data->keys.seq != bp->seq, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_seq, - "got wrong btree node: got\n%s", - (printbuf_reset(&buf), - bch2_btree_node_header_to_text(&buf, b->data), - buf.buf)); - } else { - btree_err_on(!b->data->keys.seq, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_seq, - "bad btree header: seq 0\n%s", - (printbuf_reset(&buf), - bch2_btree_node_header_to_text(&buf, b->data), - buf.buf)); - } - - while (b->written < (ptr_written ?: btree_sectors(c))) { - unsigned sectors; - bool first = !b->written; - - if (first) { - bne = NULL; - i = &b->data->keys; - } else { - bne = write_block(b); - i = &bne->keys; - - if (i->seq != b->data->keys.seq) - break; - } - - struct nonce nonce = btree_nonce(i, b->written << 9); - bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); - - btree_err_on(!good_csum_type, - bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) - ? 
-BCH_ERR_btree_node_read_err_must_retry - : -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_unknown_csum, - "unknown checksum type %llu", BSET_CSUM_TYPE(i)); - - if (first) { - sectors = vstruct_sectors(b->data, c->block_bits); - if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_past_end_of_btree_node, - "bset past end of btree node (offset %u len %u but written %zu)", - b->written, sectors, ptr_written ?: btree_sectors(c))) - i->u64s = 0; - if (good_csum_type) { - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); - bool csum_bad = bch2_crc_cmp(b->data->csum, csum); - if (csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; - } - - btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && - !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), - -BCH_ERR_btree_node_read_err_incompatible, - c, NULL, b, NULL, NULL, - btree_node_unsupported_version, - "btree node does not have NEW_EXTENT_OVERWRITE set"); - } else { - sectors = vstruct_sectors(bne, c->block_bits); - if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_past_end_of_btree_node, - "bset past end of btree node (offset %u len %u but written %zu)", - b->written, sectors, ptr_written ?: btree_sectors(c))) - i->u64s = 0; - if (good_csum_type) { - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - bool csum_bad = bch2_crc_cmp(bne->csum, csum); - if (ca && csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; - } - } - - b->version_ondisk = min(b->version_ondisk, - le16_to_cpu(i->version)); - - ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg); - if (ret) - goto fsck_err; - - if (!b->written) - btree_node_set_format(b, b->data->format); - - ret = validate_bset_keys(c, b, i, READ, failed, err_msg); - if (ret) - goto fsck_err; - - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - - blacklisted = bch2_journal_seq_is_blacklisted(c, - le64_to_cpu(i->journal_seq), - true); - - btree_err_on(blacklisted && first, - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_blacklisted_journal_seq, - "first btree node bset has blacklisted journal seq (%llu)", - le64_to_cpu(i->journal_seq)); - - btree_err_on(blacklisted && ptr_written, - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - first_bset_blacklisted_journal_seq, - "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", - le64_to_cpu(i->journal_seq), - b->written, b->written + sectors, ptr_written); - - b->written = min(b->written + sectors, btree_sectors(c)); - - if (blacklisted && !first) - continue; - - sort_iter_add(iter, - vstruct_idx(i, 0), - vstruct_last(i)); 
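/* This bset's keys are now queued for the merge sort; also track the newest journal_seq seen across all bsets, which is written back into the sorted node's header below. */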
-		max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq));
-	}
-
-	if (ptr_written) {
-		btree_err_on(b->written < ptr_written,
-			     -BCH_ERR_btree_node_read_err_want_retry,
-			     c, ca, b, NULL, NULL,
-			     btree_node_data_missing,
-			     "btree node data missing: expected %u sectors, found %u",
-			     ptr_written, b->written);
-	} else {
-		for (bne = write_block(b);
-		     bset_byte_offset(b, bne) < btree_buf_bytes(b);
-		     bne = (void *) bne + block_bytes(c))
-			btree_err_on(bne->keys.seq == b->data->keys.seq &&
-				     !bch2_journal_seq_is_blacklisted(c,
-						le64_to_cpu(bne->keys.journal_seq),
-						true),
-				     -BCH_ERR_btree_node_read_err_want_retry,
-				     c, ca, b, NULL, NULL,
-				     btree_node_bset_after_end,
-				     "found bset signature after last bset");
-	}
-
-	sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
-	sorted->keys.u64s = 0;
-
-	b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
-	memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
-	       btree_buf_bytes(b) -
-	       sizeof(struct btree_node) -
-	       b->nr.live_u64s * sizeof(u64));
-
-	b->data->keys.u64s = sorted->keys.u64s;
-	*sorted = *b->data;
-	swap(sorted, b->data);
-	set_btree_bset(b, b->set, &b->data->keys);
-	b->nsets = 1;
-	b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
-
-	BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s));
-
-	btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
-
-	i = &b->data->keys;
-	for (k = i->start; k != vstruct_last(i);) {
-		struct bkey tmp;
-		struct bkey_s u = __bkey_disassemble(b, k, &tmp);
-
-		ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
-		if (ret == -BCH_ERR_fsck_delete_bkey ||
-		    (static_branch_unlikely(&bch2_inject_invalid_keys) &&
-		     !bversion_cmp(u.k->bversion, MAX_VERSION))) {
-			btree_keys_account_key_drop(&b->nr, 0, k);
-
-			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
-			memmove_u64s_down(k, bkey_p_next(k),
-					  (u64 *) vstruct_end(i) - (u64 *) k);
-			set_btree_bset_end(b, b->set);
-			set_btree_node_need_rewrite(b);
-			set_btree_node_need_rewrite_error(b);
-			continue;
-		}
-		if (ret)
-			goto fsck_err;
-
-		if (u.k->type == KEY_TYPE_btree_ptr_v2) {
-			struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
-
-			bp.v->mem_ptr = 0;
-		}
-
-		k = bkey_p_next(k);
-	}
-
-	bch2_bset_build_aux_tree(b, b->set, false);
-
-	set_needs_whiteout(btree_bset_first(b), true);
-
-	btree_node_reset_sib_u64s(b);
-
-	if (updated_range)
-		bch2_btree_node_drop_keys_outside_node(b);
-
-	/*
-	 * XXX:
-	 *
-	 * We deadlock if too many btree updates require node rewrites while
-	 * we're still in journal replay.
-	 *
-	 * This is because btree node rewrites generate more updates for the
-	 * interior updates (alloc, backpointers), and if those updates touch
-	 * new nodes and generate more rewrites - well, you see the problem.
-	 *
-	 * The biggest cause is that we don't use the btree write buffer (for
-	 * the backpointer updates) - this needs some real thought on locking
-	 * in order to fix.
-	 *
-	 * The problem with this workaround (not doing the rewrite for degraded
-	 * nodes in journal replay) is that those degraded nodes persist, and we
-	 * don't want that (this is a real bug when a btree node write completes
-	 * with fewer replicas than we wanted and leaves a degraded node due to
-	 * device _removal_, i.e. the device went away mid write).
-	 *
-	 * It's less of a bug here, but still a problem because we don't yet
-	 * have a way of tracking degraded data - we need another index (all
-	 * extents/btree nodes, by replicas entry) in order to fix this
-	 * properly (re-replicate degraded data at the earliest possible time).
-	 */
-	if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
-		scoped_guard(rcu)
-			bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
-				struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
-				if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
-					set_btree_node_need_rewrite(b);
-					set_btree_node_need_rewrite_degraded(b);
-				}
-			}
-	}
-
-	if (!ptr_written) {
-		set_btree_node_need_rewrite(b);
-		set_btree_node_need_rewrite_ptr_written_zero(b);
-	}
-fsck_err:
-	mempool_free(iter, &c->fill_iter);
-	printbuf_exit(&buf);
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
-	return ret;
-}
-
-static void btree_node_read_work(struct work_struct *work)
-{
-	struct btree_read_bio *rb =
-		container_of(work, struct btree_read_bio, work);
-	struct bch_fs *c = rb->c;
-	struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
-	struct btree *b = rb->b;
-	struct bio *bio = &rb->bio;
-	struct bch_io_failures failed = { .nr = 0 };
-	int ret = 0;
-
-	struct printbuf buf = PRINTBUF;
-	bch2_log_msg_start(c, &buf);
-
-	prt_printf(&buf, "btree node read error at btree ");
-	bch2_btree_pos_to_text(&buf, c, b);
-	prt_newline(&buf);
-
-	goto start;
-	while (1) {
-		ret = bch2_bkey_pick_read_device(c,
-					bkey_i_to_s_c(&b->key),
-					&failed, &rb->pick, -1);
-		if (ret <= 0) {
-			set_btree_node_read_error(b);
-			break;
-		}
-
-		ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
-		rb->have_ioref = ca != NULL;
-		rb->start_time = local_clock();
-		bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
-		bio->bi_iter.bi_sector = rb->pick.ptr.offset;
-		bio->bi_iter.bi_size = btree_buf_bytes(b);
-
-		if (rb->have_ioref) {
-			bio_set_dev(bio, ca->disk_sb.bdev);
-			submit_bio_wait(bio);
-		} else {
-			bio->bi_status = BLK_STS_REMOVED;
-		}
-
-		bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
-					   rb->start_time, !bio->bi_status);
-start:
-		if (rb->have_ioref)
-			enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
-		rb->have_ioref = false;
-
-		if (bio->bi_status) {
-			bch2_mark_io_failure(&failed, &rb->pick, false);
-			continue;
-		}
-
-		ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
-		if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
-		    ret == -BCH_ERR_btree_node_read_err_must_retry)
-			continue;
-
-		if (ret)
-			set_btree_node_read_error(b);
-
-		break;
-	}
-
-	bch2_io_failures_to_text(&buf, c, &failed);
-
-	if (btree_node_read_error(b))
-		bch2_btree_lost_data(c, &buf, b->c.btree_id);
-
-	/*
-	 * only print retry success if we read from a replica with no errors
-	 */
-	if (btree_node_read_error(b))
-		prt_printf(&buf, "ret %s", bch2_err_str(ret));
-	else if (failed.nr) {
-		if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
-			prt_printf(&buf, "retry success");
-		else
-			prt_printf(&buf, "repair success");
-	}
-
-	if ((failed.nr ||
-	     btree_node_need_rewrite(b)) &&
-	    !btree_node_read_error(b) &&
-	    c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
-		prt_printf(&buf, " (rewriting node)");
-		bch2_btree_node_rewrite_async(c, b);
-	}
-	prt_newline(&buf);
-
-	if (failed.nr)
-		bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
-
-	async_object_list_del(c, btree_read_bio, rb->list_idx);
-
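-	/*
-	 * At this point the retry loop above has either read the node
-	 * successfully or exhausted every replica: each failing device was
-	 * recorded in 'failed', so bch2_bkey_pick_read_device() kept steering
-	 * us to the next replica.
-	 */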
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
-			       rb->start_time);
-	bio_put(&rb->bio);
-	printbuf_exit(&buf);
-	clear_btree_node_read_in_flight(b);
-	smp_mb__after_atomic();
-	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-}
-
-static void btree_node_read_endio(struct bio *bio)
-{
-	struct btree_read_bio *rb =
-		container_of(bio, struct btree_read_bio, bio);
-	struct bch_fs *c = rb->c;
-	struct bch_dev *ca = rb->have_ioref
-		? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
-
-	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
-				   rb->start_time, !bio->bi_status);
-
-	queue_work(c->btree_read_complete_wq, &rb->work);
-}
-
-void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio)
-{
-	bch2_bio_to_text(out, &rbio->bio);
-}
-
-struct btree_node_read_all {
-	struct closure cl;
-	struct bch_fs *c;
-	struct btree *b;
-	unsigned nr;
-	void *buf[BCH_REPLICAS_MAX];
-	struct bio *bio[BCH_REPLICAS_MAX];
-	blk_status_t err[BCH_REPLICAS_MAX];
-};
-
-static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
-{
-	struct btree_node *bn = data;
-	struct btree_node_entry *bne;
-	unsigned offset = 0;
-
-	if (le64_to_cpu(bn->magic) != bset_magic(c))
-		return 0;
-
-	while (offset < btree_sectors(c)) {
-		if (!offset) {
-			offset += vstruct_sectors(bn, c->block_bits);
-		} else {
-			bne = data + (offset << 9);
-			if (bne->keys.seq != bn->keys.seq)
-				break;
-			offset += vstruct_sectors(bne, c->block_bits);
-		}
-	}
-
-	return offset;
-}
-
-static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
-{
-	struct btree_node *bn = data;
-	struct btree_node_entry *bne;
-
-	if (!offset)
-		return false;
-
-	while (offset < btree_sectors(c)) {
-		bne = data + (offset << 9);
-		if (bne->keys.seq == bn->keys.seq)
-			return true;
-		offset++;
-	}
-
-	return false;
-}
-
-static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
-{
-	closure_type(ra, struct btree_node_read_all, cl);
-	struct bch_fs *c = ra->c;
-	struct btree *b = ra->b;
-	struct printbuf buf = PRINTBUF;
-	bool dump_bset_maps = false;
-	int ret = 0, best = -1, write = READ;
-	unsigned i, written = 0, written2 = 0;
-	__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
-		?
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; - bool _saw_error = false, *saw_error = &_saw_error; - struct printbuf *err_msg = NULL; - struct bch_io_failures *failed = NULL; - - for (i = 0; i < ra->nr; i++) { - struct btree_node *bn = ra->buf[i]; - - if (ra->err[i]) - continue; - - if (le64_to_cpu(bn->magic) != bset_magic(c) || - (seq && seq != bn->keys.seq)) - continue; - - if (best < 0) { - best = i; - written = btree_node_sectors_written(c, bn); - continue; - } - - written2 = btree_node_sectors_written(c, ra->buf[i]); - if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, NULL, - btree_node_replicas_sectors_written_mismatch, - "btree node sectors written mismatch: %u != %u", - written, written2) || - btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, NULL, - btree_node_bset_after_end, - "found bset signature after last bset") || - btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, NULL, - btree_node_replicas_data_mismatch, - "btree node replicas content mismatch")) - dump_bset_maps = true; - - if (written2 > written) { - written = written2; - best = i; - } - } -fsck_err: - if (dump_bset_maps) { - for (i = 0; i < ra->nr; i++) { - struct btree_node *bn = ra->buf[i]; - struct btree_node_entry *bne = NULL; - unsigned offset = 0, sectors; - bool gap = false; - - if (ra->err[i]) - continue; - - printbuf_reset(&buf); - - while (offset < btree_sectors(c)) { - if (!offset) { - sectors = vstruct_sectors(bn, c->block_bits); - } else { - bne = ra->buf[i] + (offset << 9); - if (bne->keys.seq != bn->keys.seq) - break; - sectors = vstruct_sectors(bne, c->block_bits); - } - - prt_printf(&buf, " %u-%u", offset, offset + sectors); - if (bne && bch2_journal_seq_is_blacklisted(c, - le64_to_cpu(bne->keys.journal_seq), false)) - prt_printf(&buf, "*"); - offset += sectors; - } - - while (offset < btree_sectors(c)) { - bne = ra->buf[i] + (offset << 9); - if (bne->keys.seq == bn->keys.seq) { - if (!gap) - prt_printf(&buf, " GAP"); - gap = true; - - sectors = vstruct_sectors(bne, c->block_bits); - prt_printf(&buf, " %u-%u", offset, offset + sectors); - if (bch2_journal_seq_is_blacklisted(c, - le64_to_cpu(bne->keys.journal_seq), false)) - prt_printf(&buf, "*"); - } - offset++; - } - - bch_err(c, "replica %u:%s", i, buf.buf); - } - } - - if (best >= 0) { - memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); - ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL); - } else { - ret = -1; - } - - if (ret) { - set_btree_node_read_error(b); - - struct printbuf buf = PRINTBUF; - bch2_btree_lost_data(c, &buf, b->c.btree_id); - if (buf.pos) - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - } else if (*saw_error) - bch2_btree_node_rewrite_async(c, b); - - for (i = 0; i < ra->nr; i++) { - mempool_free(ra->buf[i], &c->btree_bounce_pool); - bio_put(ra->bio[i]); - } - - closure_debug_destroy(&ra->cl); - kfree(ra); - printbuf_exit(&buf); - - clear_btree_node_read_in_flight(b); - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); -} - -static void btree_node_read_all_replicas_endio(struct bio *bio) -{ - struct btree_read_bio *rb = - container_of(bio, struct btree_read_bio, bio); - struct bch_fs *c = rb->c; - struct btree_node_read_all *ra = rb->ra; - - if (rb->have_ioref) { - struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); - - bch2_latency_acct(ca, rb->start_time, READ); - 
enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_btree_node_read_all_replicas); - } - - ra->err[rb->idx] = bio->bi_status; - closure_put(&ra->cl); -} - -/* - * XXX This allocates multiple times from the same mempools, and can deadlock - * under sufficient memory pressure (but is only a debug path) - */ -static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) -{ - struct bkey_s_c k = bkey_i_to_s_c(&b->key); - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded pick; - struct btree_node_read_all *ra; - unsigned i; - - ra = kzalloc(sizeof(*ra), GFP_NOFS); - if (!ra) - return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas); - - closure_init(&ra->cl, NULL); - ra->c = c; - ra->b = b; - ra->nr = bch2_bkey_nr_ptrs(k); - - for (i = 0; i < ra->nr; i++) { - ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); - ra->bio[i] = bio_alloc_bioset(NULL, - buf_pages(ra->buf[i], btree_buf_bytes(b)), - REQ_OP_READ|REQ_SYNC|REQ_META, - GFP_NOFS, - &c->btree_bio); - } - - i = 0; - bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, - BCH_DEV_READ_REF_btree_node_read_all_replicas); - struct btree_read_bio *rb = - container_of(ra->bio[i], struct btree_read_bio, bio); - rb->c = c; - rb->b = b; - rb->ra = ra; - rb->start_time = local_clock(); - rb->have_ioref = ca != NULL; - rb->idx = i; - rb->pick = pick; - rb->bio.bi_iter.bi_sector = pick.ptr.offset; - rb->bio.bi_end_io = btree_node_read_all_replicas_endio; - bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b)); - - if (rb->have_ioref) { - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], - bio_sectors(&rb->bio)); - bio_set_dev(&rb->bio, ca->disk_sb.bdev); - - closure_get(&ra->cl); - submit_bio(&rb->bio); - } else { - ra->err[i] = BLK_STS_REMOVED; - } - - i++; - } - - if (sync) { - closure_sync(&ra->cl); - btree_node_read_all_replicas_done(&ra->cl.work); - } else { - continue_at(&ra->cl, btree_node_read_all_replicas_done, - c->btree_read_complete_wq); - } - - return 0; -} - -void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, - bool sync) -{ - struct bch_fs *c = trans->c; - struct extent_ptr_decoded pick; - struct btree_read_bio *rb; - struct bch_dev *ca; - struct bio *bio; - int ret; - - trace_and_count(c, btree_node_read, trans, b); - - if (static_branch_unlikely(&bch2_verify_all_btree_replicas) && - !btree_node_read_all_replicas(c, b, sync)) - return; - - ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - NULL, &pick, -1); - - if (ret <= 0) { - bool ratelimit = true; - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_str(&buf, "btree node read error: no device to read from\n at "); - bch2_btree_pos_to_text(&buf, c, b); - prt_newline(&buf); - bch2_btree_lost_data(c, &buf, b->c.btree_id); - - if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && - bch2_fs_emergency_read_only2(c, &buf)) - ratelimit = false; - - static DEFINE_RATELIMIT_STATE(rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - if (!ratelimit || __ratelimit(&rs)) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - set_btree_node_read_error(b); - clear_btree_node_read_in_flight(b); - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); - return; - } - - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); - - bio = bio_alloc_bioset(NULL, - buf_pages(b->data, 
btree_buf_bytes(b)), - REQ_OP_READ|REQ_SYNC|REQ_META, - GFP_NOFS, - &c->btree_bio); - rb = container_of(bio, struct btree_read_bio, bio); - rb->c = c; - rb->b = b; - rb->ra = NULL; - rb->start_time = local_clock(); - rb->have_ioref = ca != NULL; - rb->pick = pick; - INIT_WORK(&rb->work, btree_node_read_work); - bio->bi_iter.bi_sector = pick.ptr.offset; - bio->bi_end_io = btree_node_read_endio; - bch2_bio_map(bio, b->data, btree_buf_bytes(b)); - - async_object_list_add(c, btree_read_bio, rb, &rb->list_idx); - - if (rb->have_ioref) { - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], - bio_sectors(bio)); - bio_set_dev(bio, ca->disk_sb.bdev); - - if (sync) { - submit_bio_wait(bio); - bch2_latency_acct(ca, rb->start_time, READ); - btree_node_read_work(&rb->work); - } else { - submit_bio(bio); - } - } else { - bio->bi_status = BLK_STS_REMOVED; - - if (sync) - btree_node_read_work(&rb->work); - else - queue_work(c->btree_read_complete_wq, &rb->work); - } -} - -static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, - const struct bkey_i *k, unsigned level) -{ - struct bch_fs *c = trans->c; - struct closure cl; - struct btree *b; - int ret; - - closure_init_stack(&cl); - - do { - ret = bch2_btree_cache_cannibalize_lock(trans, &cl); - closure_sync(&cl); - } while (ret); - - b = bch2_btree_node_mem_alloc(trans, level != 0); - bch2_btree_cache_cannibalize_unlock(trans); - - BUG_ON(IS_ERR(b)); - - bkey_copy(&b->key, k); - BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); - - set_btree_node_read_in_flight(b); - - /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */ - bch2_trans_unlock(trans); - bch2_btree_node_read(trans, b, true); - - if (btree_node_read_error(b)) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - - ret = bch_err_throw(c, btree_node_read_error); - goto err; - } - - bch2_btree_set_root_for_read(c, b); -err: - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - return ret; -} - -int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, - const struct bkey_i *k, unsigned level) -{ - return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); -} - -struct btree_node_scrub { - struct bch_fs *c; - struct bch_dev *ca; - void *buf; - bool used_mempool; - unsigned written; - - enum btree_id btree; - unsigned level; - struct bkey_buf key; - __le64 seq; - - struct work_struct work; - struct bio bio; -}; - -static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written, - struct printbuf *err) -{ - unsigned written = 0; - - if (le64_to_cpu(data->magic) != bset_magic(c)) { - prt_printf(err, "bad magic: want %llx, got %llx", - bset_magic(c), le64_to_cpu(data->magic)); - return false; - } - - while (written < (ptr_written ?: btree_sectors(c))) { - struct btree_node_entry *bne; - struct bset *i; - bool first = !written; - - if (first) { - bne = NULL; - i = &data->keys; - } else { - bne = (void *) data + (written << 9); - i = &bne->keys; - - if (!ptr_written && i->seq != data->keys.seq) - break; - } - - struct nonce nonce = btree_nonce(i, written << 9); - bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); - - if (first) { - if (good_csum_type) { - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data); - if (bch2_crc_cmp(data->csum, csum)) { - bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum); - return false; - } - } - - written += 
vstruct_sectors(data, c->block_bits);
-		} else {
-			if (good_csum_type) {
-				struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-				if (bch2_crc_cmp(bne->csum, csum)) {
-					bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
-					return false;
-				}
-			}
-
-			written += vstruct_sectors(bne, c->block_bits);
-		}
-	}
-
-	return true;
-}
-
-static void btree_node_scrub_work(struct work_struct *work)
-{
-	struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
-	struct bch_fs *c = scrub->c;
-	struct printbuf err = PRINTBUF;
-
-	__bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
-				 bkey_i_to_s_c(scrub->key.k));
-	prt_newline(&err);
-
-	if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
-		int ret = bch2_trans_do(c,
-			bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1,
-						    scrub->key.k, 0));
-		if (!bch2_err_matches(ret, ENOENT) &&
-		    !bch2_err_matches(ret, EROFS))
-			bch_err_fn_ratelimited(c, ret);
-	}
-
-	printbuf_exit(&err);
-	bch2_bkey_buf_exit(&scrub->key, c);
-	btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
-	enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
-	kfree(scrub);
-	enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
-}
-
-static void btree_node_scrub_endio(struct bio *bio)
-{
-	struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);
-
-	queue_work(scrub->c->btree_read_complete_wq, &scrub->work);
-}
-
-int bch2_btree_node_scrub(struct btree_trans *trans,
-			  enum btree_id btree, unsigned level,
-			  struct bkey_s_c k, unsigned dev)
-{
-	if (k.k->type != KEY_TYPE_btree_ptr_v2)
-		return 0;
-
-	struct bch_fs *c = trans->c;
-
-	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
-		return bch_err_throw(c, erofs_no_writes);
-
-	struct extent_ptr_decoded pick;
-	int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
-	if (ret <= 0)
-		goto err;
-
-	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
-						BCH_DEV_READ_REF_btree_node_scrub);
-	if (!ca) {
-		ret = bch_err_throw(c, device_offline);
-		goto err;
-	}
-
-	bool used_mempool = false;
-	void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);
-
-	unsigned vecs = buf_pages(buf, c->opts.btree_node_size);
-
-	struct btree_node_scrub *scrub =
-		kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
-	if (!scrub) {
-		ret = -ENOMEM;
-		goto err_free;
-	}
-
-	scrub->c = c;
-	scrub->ca = ca;
-	scrub->buf = buf;
-	scrub->used_mempool = used_mempool;
-	scrub->written = btree_ptr_sectors_written(k);
-
-	scrub->btree = btree;
-	scrub->level = level;
-	bch2_bkey_buf_init(&scrub->key);
-	bch2_bkey_buf_reassemble(&scrub->key, c, k);
-	scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq;
-
-	INIT_WORK(&scrub->work, btree_node_scrub_work);
-
-	bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ);
-	bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
-	scrub->bio.bi_iter.bi_sector = pick.ptr.offset;
-	scrub->bio.bi_end_io = btree_node_scrub_endio;
-	submit_bio(&scrub->bio);
-	return 0;
-err_free:
-	btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
-	enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
-err:
-	enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
-	return ret;
-}
-
-static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
-				      struct btree_write *w)
-{
-	unsigned long old, new;
-
-	old =
READ_ONCE(b->will_make_reachable); - do { - new = old; - if (!(old & 1)) - break; - - new &= ~1UL; - } while (!try_cmpxchg(&b->will_make_reachable, &old, new)); - - if (old & 1) - closure_put(&((struct btree_update *) new)->cl); - - bch2_journal_pin_drop(&c->journal, &w->journal); -} - -static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) -{ - struct btree_write *w = btree_prev_write(b); - unsigned long old, new; - unsigned type = 0; - - bch2_btree_complete_write(c, b, w); - - if (start_time) - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time); - - old = READ_ONCE(b->flags); - do { - new = old; - - if ((old & (1U << BTREE_NODE_dirty)) && - (old & (1U << BTREE_NODE_need_write)) && - !(old & (1U << BTREE_NODE_never_write)) && - !(old & (1U << BTREE_NODE_write_blocked)) && - !(old & (1U << BTREE_NODE_will_make_reachable))) { - new &= ~(1U << BTREE_NODE_dirty); - new &= ~(1U << BTREE_NODE_need_write); - new |= (1U << BTREE_NODE_write_in_flight); - new |= (1U << BTREE_NODE_write_in_flight_inner); - new |= (1U << BTREE_NODE_just_written); - new ^= (1U << BTREE_NODE_write_idx); - - type = new & BTREE_WRITE_TYPE_MASK; - new &= ~BTREE_WRITE_TYPE_MASK; - } else { - new &= ~(1U << BTREE_NODE_write_in_flight); - new &= ~(1U << BTREE_NODE_write_in_flight_inner); - } - } while (!try_cmpxchg(&b->flags, &old, new)); - - if (new & (1U << BTREE_NODE_write_in_flight)) - __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); - else { - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); - } -} - -static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) -{ - struct btree_trans *trans = bch2_trans_get(c); - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - - /* we don't need transaction context anymore after we got the lock. */ - bch2_trans_put(trans); - __btree_node_write_done(c, b, start_time); - six_unlock_read(&b->c.lock); -} - -static void btree_node_write_work(struct work_struct *work) -{ - struct btree_write_bio *wbio = - container_of(work, struct btree_write_bio, work); - struct bch_fs *c = wbio->wbio.c; - struct btree *b = wbio->wbio.bio.bi_private; - u64 start_time = wbio->start_time; - int ret = 0; - - btree_bounce_free(c, - wbio->data_bytes, - wbio->wbio.used_mempool, - wbio->data); - - bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, - bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = bch_err_throw(c, btree_node_write_all_failed); - goto err; - } - - if (wbio->wbio.first_btree_write) { - if (wbio->wbio.failed.nr) { - - } - } else { - ret = bch2_trans_do(c, - bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, - BCH_WATERMARK_interior_updates| - BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_check_rw, - !wbio->wbio.failed.nr)); - if (ret) - goto err; - } -out: - async_object_list_del(c, btree_write_bio, wbio->list_idx); - bio_put(&wbio->wbio.bio); - btree_node_write_done(c, b, start_time); - return; -err: - set_btree_node_noevict(b); - - if (!bch2_err_matches(ret, EROFS)) { - struct printbuf buf = PRINTBUF; - prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret)); - bch2_btree_pos_to_text(&buf, c, b); - bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); - } - goto out; -} - -static void btree_node_write_endio(struct bio *bio) -{ - struct bch_write_bio *wbio = to_wbio(bio); - struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; - struct bch_write_bio *orig = parent ?: wbio; - struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); - struct bch_fs *c = wbio->c; - struct btree *b = wbio->bio.bi_private; - struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL; - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, - wbio->submit_time, !bio->bi_status); - - if (ca && bio->bi_status) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_printf(&buf, "btree write error: %s\n ", - bch2_blk_status_to_str(bio->bi_status)); - bch2_btree_pos_to_text(&buf, c, b); - bch_err_dev_ratelimited(ca, "%s", buf.buf); - printbuf_exit(&buf); - } - - if (bio->bi_status) { - unsigned long flags; - spin_lock_irqsave(&c->btree_write_error_lock, flags); - bch2_dev_list_add_dev(&orig->failed, wbio->dev); - spin_unlock_irqrestore(&c->btree_write_error_lock, flags); - } - - /* - * XXX: we should be using io_ref[WRITE], but we aren't retrying failed - * btree writes yet (due to device removal/ro): - */ - if (wbio->have_ioref) - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_btree_node_write); - - if (parent) { - bio_put(bio); - bio_endio(&parent->bio); - return; - } - - clear_btree_node_write_in_flight_inner(b); - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); - INIT_WORK(&wb->work, btree_node_write_work); - queue_work(c->btree_write_complete_wq, &wb->work); -} - -static int validate_bset_for_write(struct bch_fs *c, struct btree *b, - struct bset *i) -{ - int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), - (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level + 1, - .btree = b->c.btree_id, - .flags = BCH_VALIDATE_write, - }); - if (ret) { - bch2_fs_inconsistent(c, "invalid btree node key before write"); - return ret; - } - - ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?: - validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL); - if (ret) { - bch2_inconsistent_error(c); - dump_stack(); - } - - return ret; -} - -static void btree_write_submit(struct work_struct *work) -{ - struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); - BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - - bkey_copy(&tmp.k, &wbio->key); - - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) - ptr->offset += wbio->sector_offset; - - bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, - &tmp.k, false); -} - -void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) -{ - struct btree_write_bio *wbio; - struct bset *i; - struct btree_node *bn = NULL; - struct btree_node_entry *bne = NULL; - struct sort_iter_stack sort_iter; - struct nonce nonce; - unsigned bytes_to_write, sectors_to_write, bytes, u64s; - u64 seq = 0; - bool used_mempool; - unsigned long old, new; - bool validate_before_checksum = false; - enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; - void *data; - u64 start_time = local_clock(); - int ret; - - if (flags & BTREE_WRITE_ALREADY_STARTED) - goto do_write; - - /* - * We may only have a read lock on the btree node - the dirty bit is our - * "lock" against racing with other threads that may be trying to start - * a write, we do a write iff we clear the dirty bit. 
Since setting the - * dirty bit requires a write lock, we can't race with other threads - * redirtying it: - */ - old = READ_ONCE(b->flags); - do { - new = old; - - if (!(old & (1 << BTREE_NODE_dirty))) - return; - - if ((flags & BTREE_WRITE_ONLY_IF_NEED) && - !(old & (1 << BTREE_NODE_need_write))) - return; - - if (old & - ((1 << BTREE_NODE_never_write)| - (1 << BTREE_NODE_write_blocked))) - return; - - if (b->written && - (old & (1 << BTREE_NODE_will_make_reachable))) - return; - - if (old & (1 << BTREE_NODE_write_in_flight)) - return; - - if (flags & BTREE_WRITE_ONLY_IF_NEED) - type = new & BTREE_WRITE_TYPE_MASK; - new &= ~BTREE_WRITE_TYPE_MASK; - - new &= ~(1 << BTREE_NODE_dirty); - new &= ~(1 << BTREE_NODE_need_write); - new |= (1 << BTREE_NODE_write_in_flight); - new |= (1 << BTREE_NODE_write_in_flight_inner); - new |= (1 << BTREE_NODE_just_written); - new ^= (1 << BTREE_NODE_write_idx); - } while (!try_cmpxchg_acquire(&b->flags, &old, new)); - - if (new & (1U << BTREE_NODE_need_write)) - return; -do_write: - BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); - - atomic_long_dec(&c->btree_cache.nr_dirty); - - BUG_ON(btree_node_fake(b)); - BUG_ON((b->will_make_reachable != 0) != !b->written); - - BUG_ON(b->written >= btree_sectors(c)); - BUG_ON(b->written & (block_sectors(c) - 1)); - BUG_ON(bset_written(b, btree_bset_last(b))); - BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); - BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); - - bch2_sort_whiteouts(c, b); - - sort_iter_stack_init(&sort_iter, b); - - bytes = !b->written - ? sizeof(struct btree_node) - : sizeof(struct btree_node_entry); - - bytes += b->whiteout_u64s * sizeof(u64); - - for_each_bset(b, t) { - i = bset(b, t); - - if (bset_written(b, i)) - continue; - - bytes += le16_to_cpu(i->u64s) * sizeof(u64); - sort_iter_add(&sort_iter.iter, - btree_bkey_first(b, t), - btree_bkey_last(b, t)); - seq = max(seq, le64_to_cpu(i->journal_seq)); - } - - BUG_ON(b->written && !seq); - - /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ - bytes += 8; - - /* buffer must be a multiple of the block size */ - bytes = round_up(bytes, block_bytes(c)); - - data = btree_bounce_alloc(c, bytes, &used_mempool); - - if (!b->written) { - bn = data; - *bn = *b->data; - i = &bn->keys; - } else { - bne = data; - bne->keys = b->data->keys; - i = &bne->keys; - } - - i->journal_seq = cpu_to_le64(seq); - i->u64s = 0; - - sort_iter_add(&sort_iter.iter, - unwritten_whiteouts_start(b), - unwritten_whiteouts_end(b)); - SET_BSET_SEPARATE_WHITEOUTS(i, false); - - u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter); - le16_add_cpu(&i->u64s, u64s); - - b->whiteout_u64s = 0; - - BUG_ON(!b->written && i->u64s != b->data->keys.u64s); - - set_needs_whiteout(i, false); - - /* do we have data to write? 
*/ - if (b->written && !i->u64s) - goto nowrite; - - bytes_to_write = vstruct_end(i) - data; - sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; - - if (!b->written && - b->key.k.type == KEY_TYPE_btree_ptr_v2) - BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write); - - memset(data + bytes_to_write, 0, - (sectors_to_write << 9) - bytes_to_write); - - BUG_ON(b->written + sectors_to_write > btree_sectors(c)); - BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); - BUG_ON(i->seq != b->data->keys.seq); - - i->version = cpu_to_le16(c->sb.version); - SET_BSET_OFFSET(i, b->written); - SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); - - if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) - validate_before_checksum = true; - - /* validate_bset will be modifying: */ - if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) - validate_before_checksum = true; - - /* if we're going to be encrypting, check metadata validity first: */ - if (validate_before_checksum && - validate_bset_for_write(c, b, i)) - goto err; - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "encrypting btree node: %s", bch2_err_str(ret))) - goto err; - - nonce = btree_nonce(i, b->written << 9); - - if (bn) - bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); - else - bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - - /* if we're not encrypting, check metadata after checksumming: */ - if (!validate_before_checksum && - validate_bset_for_write(c, b, i)) - goto err; - - /* - * We handle btree write errors by immediately halting the journal - - * after we've done that, we can't issue any subsequent btree writes - * because they might have pointers to new nodes that failed to write. - * - * Furthermore, there's no point in doing any more btree writes because - * with the journal stopped, we're never going to update the journal to - * reflect that those writes were done and the data flushed from the - * journal: - * - * Also on journal error, the pending write may have updates that were - * never journalled (interior nodes, see btree_update_nodes_written()) - - * it's critical that we don't do the write in that case otherwise we - * will have updates visible that weren't in the journal: - * - * Make sure to update b->written so bch2_btree_init_next() doesn't - * break: - */ - if (bch2_journal_error(&c->journal) || - c->opts.nochanges) - goto err; - - trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write); - - wbio = container_of(bio_alloc_bioset(NULL, - buf_pages(data, sectors_to_write << 9), - REQ_OP_WRITE|REQ_META, - GFP_NOFS, - &c->btree_bio), - struct btree_write_bio, wbio.bio); - wbio_init(&wbio->wbio.bio); - wbio->data = data; - wbio->data_bytes = bytes; - wbio->sector_offset = b->written; - wbio->start_time = start_time; - wbio->wbio.c = c; - wbio->wbio.used_mempool = used_mempool; - wbio->wbio.first_btree_write = !b->written; - wbio->wbio.bio.bi_end_io = btree_node_write_endio; - wbio->wbio.bio.bi_private = b; - - bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); - - bkey_copy(&wbio->key, &b->key); - - b->written += sectors_to_write; - - if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = - cpu_to_le16(b->written); - - atomic64_inc(&c->btree_write_stats[type].nr); - atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); - - async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx); - - INIT_WORK(&wbio->work, btree_write_submit); - 
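-	/*
-	 * Submission happens from btree_write_submit() on its own workqueue:
-	 * it copies the key, adds sector_offset to each pointer, then issues
-	 * one bio per replica via bch2_submit_wbio_replicas().
-	 */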
queue_work(c->btree_write_submit_wq, &wbio->work); - return; -err: - set_btree_node_noevict(b); - b->written += sectors_to_write; -nowrite: - btree_bounce_free(c, bytes, used_mempool, data); - __btree_node_write_done(c, b, 0); -} - -/* - * Work that must be done with write lock held: - */ -bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) -{ - bool invalidated_iter = false; - struct btree_node_entry *bne; - - if (!btree_node_just_written(b)) - return false; - - BUG_ON(b->whiteout_u64s); - - clear_btree_node_just_written(b); - - /* - * Note: immediately after write, bset_written() doesn't work - the - * amount of data we had to write after compaction might have been - * smaller than the offset of the last bset. - * - * However, we know that all bsets have been written here, as long as - * we're still holding the write lock: - */ - - /* - * XXX: decide if we really want to unconditionally sort down to a - * single bset: - */ - if (b->nsets > 1) { - btree_node_sort(c, b, 0, b->nsets); - invalidated_iter = true; - } else { - invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); - } - - for_each_bset(b, t) - set_needs_whiteout(bset(b, t), true); - - bch2_btree_verify(c, b); - - /* - * If later we don't unconditionally sort down to a single bset, we have - * to ensure this is still true: - */ - BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); - - bne = want_new_bset(c, b); - if (bne) - bch2_bset_init_next(b, bne); - - bch2_btree_build_aux_trees(b); - - return invalidated_iter; -} - -/* - * Use this one if the node is intent locked: - */ -void bch2_btree_node_write(struct bch_fs *c, struct btree *b, - enum six_lock_type lock_type_held, - unsigned flags) -{ - if (lock_type_held == SIX_LOCK_intent || - (lock_type_held == SIX_LOCK_read && - six_lock_tryupgrade(&b->c.lock))) { - __bch2_btree_node_write(c, b, flags); - - /* don't cycle lock unnecessarily: */ - if (btree_node_just_written(b) && - six_trylock_write(&b->c.lock)) { - bch2_btree_post_write_cleanup(c, b); - six_unlock_write(&b->c.lock); - } - - if (lock_type_held == SIX_LOCK_read) - six_lock_downgrade(&b->c.lock); - } else { - __bch2_btree_node_write(c, b, flags); - if (lock_type_held == SIX_LOCK_write && - btree_node_just_written(b)) - bch2_btree_post_write_cleanup(c, b); - } -} - -void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b, - enum six_lock_type lock_type_held, - unsigned flags) -{ - struct bch_fs *c = trans->c; - - if (lock_type_held == SIX_LOCK_intent || - (lock_type_held == SIX_LOCK_read && - six_lock_tryupgrade(&b->c.lock))) { - __bch2_btree_node_write(c, b, flags); - - /* don't cycle lock unnecessarily: */ - if (btree_node_just_written(b) && - six_trylock_write(&b->c.lock)) { - bch2_btree_post_write_cleanup(c, b); - __bch2_btree_node_unlock_write(trans, b); - } - - if (lock_type_held == SIX_LOCK_read) - six_lock_downgrade(&b->c.lock); - } else { - __bch2_btree_node_write(c, b, flags); - if (lock_type_held == SIX_LOCK_write && - btree_node_just_written(b)) - bch2_btree_post_write_cleanup(c, b); - } -} - -static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) -{ - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - unsigned i; - bool ret = false; -restart: - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) - if (test_bit(flag, &b->flags)) { - rcu_read_unlock(); - wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); - ret = true; - goto restart; - } - rcu_read_unlock(); - - return ret; -} - -bool 
bch2_btree_flush_all_reads(struct bch_fs *c) -{ - return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); -} - -bool bch2_btree_flush_all_writes(struct bch_fs *c) -{ - return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); -} - -static const char * const bch2_btree_write_types[] = { -#define x(t, n) [n] = #t, - BCH_BTREE_WRITE_TYPES() - NULL -}; - -void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) -{ - printbuf_tabstop_push(out, 20); - printbuf_tabstop_push(out, 10); - - prt_printf(out, "\tnr\tsize\n"); - - for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { - u64 nr = atomic64_read(&c->btree_write_stats[i].nr); - u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); - - prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr); - prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); - prt_newline(out); - } -} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h deleted file mode 100644 index 30a5180532c8..000000000000 --- a/fs/bcachefs/btree_io.h +++ /dev/null @@ -1,239 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_IO_H -#define _BCACHEFS_BTREE_IO_H - -#include "bkey_methods.h" -#include "bset.h" -#include "btree_locking.h" -#include "checksum.h" -#include "extents.h" -#include "io_write_types.h" - -struct bch_fs; -struct btree_write; -struct btree; -struct btree_iter; -struct btree_node_read_all; - -static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) -{ - if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) - atomic_long_inc(&c->btree_cache.nr_dirty); -} - -static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) -{ - if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) - atomic_long_dec(&c->btree_cache.nr_dirty); -} - -static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k) -{ - return k.k->type == KEY_TYPE_btree_ptr_v2 - ? 
le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written) - : 0; -} - -struct btree_read_bio { - struct bch_fs *c; - struct btree *b; - struct btree_node_read_all *ra; - u64 start_time; - unsigned have_ioref:1; - unsigned idx:7; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - struct extent_ptr_decoded pick; - struct work_struct work; - struct bio bio; -}; - -struct btree_write_bio { - struct work_struct work; - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); - void *data; - unsigned data_bytes; - unsigned sector_offset; - u64 start_time; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - struct bch_write_bio wbio; -}; - -void bch2_btree_node_io_unlock(struct btree *); -void bch2_btree_node_io_lock(struct btree *); -void __bch2_btree_node_wait_on_read(struct btree *); -void __bch2_btree_node_wait_on_write(struct btree *); -void bch2_btree_node_wait_on_read(struct btree *); -void bch2_btree_node_wait_on_write(struct btree *); - -enum compact_mode { - COMPACT_LAZY, - COMPACT_ALL, -}; - -bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, - enum compact_mode); - -static inline bool should_compact_bset_lazy(struct btree *b, - struct bset_tree *t) -{ - unsigned total_u64s = bset_u64s(t); - unsigned dead_u64s = bset_dead_u64s(b, t); - - return dead_u64s > 64 && dead_u64s * 3 > total_u64s; -} - -static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) -{ - for_each_bset(b, t) - if (should_compact_bset_lazy(b, t)) - return bch2_compact_whiteouts(c, b, COMPACT_LAZY); - - return false; -} - -static inline struct nonce btree_nonce(struct bset *i, unsigned offset) -{ - return (struct nonce) {{ - [0] = cpu_to_le32(offset), - [1] = ((__le32 *) &i->seq)[0], - [2] = ((__le32 *) &i->seq)[1], - [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, - }}; -} - -static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) -{ - struct nonce nonce = btree_nonce(i, offset); - int ret; - - if (!offset) { - struct btree_node *bn = container_of(i, struct btree_node, keys); - unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; - - ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, - &bn->flags, bytes); - if (ret) - return ret; - - nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); - } - - return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, - vstruct_end(i) - (void *) i->_data); -} - -void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); - -void bch2_btree_node_drop_keys_outside_node(struct btree *); - -void bch2_btree_build_aux_trees(struct btree *); -void bch2_btree_init_next(struct btree_trans *, struct btree *); - -int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, - struct btree *, - struct bch_io_failures *, - struct printbuf *); -void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); -int bch2_btree_root_read(struct bch_fs *, enum btree_id, - const struct bkey_i *, unsigned); - -void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *); - -int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, unsigned); - -bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); - -enum btree_write_flags { - __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, - __BTREE_WRITE_ALREADY_STARTED, -}; -#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED) -#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED) - -void __bch2_btree_node_write(struct bch_fs *, 
struct btree *, unsigned); -void bch2_btree_node_write(struct bch_fs *, struct btree *, - enum six_lock_type, unsigned); -void bch2_btree_node_write_trans(struct btree_trans *, struct btree *, - enum six_lock_type, unsigned); - -static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b, - enum six_lock_type lock_held) -{ - bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); -} - -bool bch2_btree_flush_all_reads(struct bch_fs *); -bool bch2_btree_flush_all_writes(struct bch_fs *); - -static inline void compat_bformat(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, struct bkey_format *f) -{ - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_inodes) { - swap(f->bits_per_field[BKEY_FIELD_INODE], - f->bits_per_field[BKEY_FIELD_OFFSET]); - swap(f->field_offset[BKEY_FIELD_INODE], - f->field_offset[BKEY_FIELD_OFFSET]); - } - - if (version < bcachefs_metadata_version_snapshot && - (level || btree_type_has_snapshots(btree_id))) { - u64 max_packed = - ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); - - f->field_offset[BKEY_FIELD_SNAPSHOT] = write - ? 0 - : cpu_to_le64(U32_MAX - max_packed); - } -} - -static inline void compat_bpos(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, struct bpos *p) -{ - if (big_endian != CPU_BIG_ENDIAN) - bch2_bpos_swab(p); - - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_inodes) - swap(p->inode, p->offset); -} - -static inline void compat_btree_node(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, - struct btree_node *bn) -{ - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id_is_extents(btree_id) && - !bpos_eq(bn->min_key, POS_MIN) && - write) - bn->min_key = bpos_nosnap_predecessor(bn->min_key); - - if (version < bcachefs_metadata_version_snapshot && - write) - bn->max_key.snapshot = 0; - - compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); - compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); - - if (version < bcachefs_metadata_version_snapshot && - !write) - bn->max_key.snapshot = U32_MAX; - - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id_is_extents(btree_id) && - !bpos_eq(bn->min_key, POS_MIN) && - !write) - bn->min_key = bpos_nosnap_successor(bn->min_key); -} - -void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c deleted file mode 100644 index f8829b667ad3..000000000000 --- a/fs/bcachefs/btree_iter.c +++ /dev/null @@ -1,3804 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_methods.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_iter.h" -#include "btree_journal_iter.h" -#include "btree_key_cache.h" -#include "btree_locking.h" -#include "btree_update.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "journal.h" -#include "journal_io.h" -#include "replicas.h" -#include "snapshot.h" -#include "super.h" -#include "trace.h" - -#include <linux/random.h> -#include <linux/prefetch.h> - -static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); -static inline void btree_path_list_add(struct btree_trans *, - btree_path_idx_t, btree_path_idx_t); - -static inline unsigned long 
btree_iter_ip_allocated(struct btree_iter *iter) -{ -#ifdef TRACK_PATH_ALLOCATED - return iter->ip_allocated; -#else - return 0; -#endif -} - -static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t); -static void bch2_trans_srcu_lock(struct btree_trans *); - -static inline int __btree_path_cmp(const struct btree_path *l, - enum btree_id r_btree_id, - bool r_cached, - struct bpos r_pos, - unsigned r_level) -{ - /* - * Must match lock ordering as defined by __bch2_btree_node_lock: - */ - return cmp_int(l->btree_id, r_btree_id) ?: - cmp_int((int) l->cached, (int) r_cached) ?: - bpos_cmp(l->pos, r_pos) ?: - -cmp_int(l->level, r_level); -} - -static inline int btree_path_cmp(const struct btree_path *l, - const struct btree_path *r) -{ - return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level); -} - -static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) -{ - /* Are we iterating over keys in all snapshots? */ - if (iter->flags & BTREE_ITER_all_snapshots) { - p = bpos_successor(p); - } else { - p = bpos_nosnap_successor(p); - p.snapshot = iter->snapshot; - } - - return p; -} - -static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) -{ - /* Are we iterating over keys in all snapshots? */ - if (iter->flags & BTREE_ITER_all_snapshots) { - p = bpos_predecessor(p); - } else { - p = bpos_nosnap_predecessor(p); - p.snapshot = iter->snapshot; - } - - return p; -} - -static inline struct bpos btree_iter_search_key(struct btree_iter *iter) -{ - struct bpos pos = iter->pos; - - if ((iter->flags & BTREE_ITER_is_extents) && - !bkey_eq(pos, POS_MAX)) - pos = bkey_successor(iter, pos); - return pos; -} - -static inline bool btree_path_pos_before_node(struct btree_path *path, - struct btree *b) -{ - return bpos_lt(path->pos, b->data->min_key); -} - -static inline bool btree_path_pos_after_node(struct btree_path *path, - struct btree *b) -{ - return bpos_gt(path->pos, b->key.k.p); -} - -static inline bool btree_path_pos_in_node(struct btree_path *path, - struct btree *b) -{ - return path->btree_id == b->c.btree_id && - !btree_path_pos_before_node(path, b) && - !btree_path_pos_after_node(path, b); -} - -/* Debug: */ - -static void __bch2_btree_path_verify_cached(struct btree_trans *trans, - struct btree_path *path) -{ - struct bkey_cached *ck; - bool locked = btree_node_locked(path, 0); - - if (!bch2_btree_node_relock(trans, path, 0)) - return; - - ck = (void *) path->l[0].b; - BUG_ON(ck->key.btree_id != path->btree_id || - !bkey_eq(ck->key.pos, path->pos)); - - if (!locked) - btree_node_unlock(trans, path, 0); -} - -static void __bch2_btree_path_verify_level(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree_path_level *l; - struct btree_node_iter tmp; - bool locked; - struct bkey_packed *p, *k; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - struct printbuf buf3 = PRINTBUF; - const char *msg; - - l = &path->l[level]; - tmp = l->iter; - locked = btree_node_locked(path, level); - - if (path->cached) { - if (!level) - __bch2_btree_path_verify_cached(trans, path); - return; - } - - if (!btree_path_node(path, level)) - return; - - if (!bch2_btree_node_relock_notrace(trans, path, level)) - return; - - BUG_ON(!btree_path_pos_in_node(path, l->b)); - - bch2_btree_node_iter_verify(&l->iter, l->b); - - /* - * For interior nodes, the iterator will have skipped past deleted keys: - */ - p = level - ? 
bch2_btree_node_iter_prev(&tmp, l->b) - : bch2_btree_node_iter_prev_all(&tmp, l->b); - k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - - if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { - msg = "before"; - goto err; - } - - if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { - msg = "after"; - goto err; - } - - if (!locked) - btree_node_unlock(trans, path, level); - return; -err: - bch2_bpos_to_text(&buf1, path->pos); - - if (p) { - struct bkey uk = bkey_unpack_key(l->b, p); - - bch2_bkey_to_text(&buf2, &uk); - } else { - prt_printf(&buf2, "(none)"); - } - - if (k) { - struct bkey uk = bkey_unpack_key(l->b, k); - - bch2_bkey_to_text(&buf3, &uk); - } else { - prt_printf(&buf3, "(none)"); - } - - panic("path should be %s key at level %u:\n" - "path pos %s\n" - "prev key %s\n" - "cur key %s\n", - msg, level, buf1.buf, buf2.buf, buf3.buf); -} - -static void __bch2_btree_path_verify(struct btree_trans *trans, - struct btree_path *path) -{ - struct bch_fs *c = trans->c; - - for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { - if (!path->l[i].b) { - BUG_ON(!path->cached && - bch2_btree_id_root(c, path->btree_id)->b->c.level > i); - break; - } - - __bch2_btree_path_verify_level(trans, path, i); - } - - bch2_btree_path_verify_locks(trans, path); -} - -void __bch2_trans_verify_paths(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned iter; - - trans_for_each_path(trans, path, iter) - __bch2_btree_path_verify(trans, path); -} - -static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter) -{ - BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); - - BUG_ON((iter->flags & BTREE_ITER_is_extents) && - (iter->flags & BTREE_ITER_all_snapshots)); - - BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) && - (iter->flags & BTREE_ITER_all_snapshots) && - !btree_type_has_snapshot_field(iter->btree_id)); - - if (iter->update_path) - __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); - __bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); -} - -static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) -{ - BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && - !iter->pos.snapshot); - - BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) && - iter->pos.snapshot != iter->snapshot); - - BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) : - !(iter->flags & BTREE_ITER_is_extents) ? 
!bkey_eq(iter->pos, iter->k.p) :
-	       (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
-		bkey_gt(iter->pos, iter->k.p)));
-}
-
-static int __bch2_btree_iter_verify_ret(struct btree_trans *trans,
-					struct btree_iter *iter, struct bkey_s_c k)
-{
-	struct btree_iter copy;
-	struct bkey_s_c prev;
-	int ret = 0;
-
-	if (!(iter->flags & BTREE_ITER_filter_snapshots))
-		return 0;
-
-	if (bkey_err(k) || !k.k)
-		return 0;
-
-	BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
-					  iter->snapshot,
-					  k.k->p.snapshot));
-
-	bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
-			     BTREE_ITER_nopreserve|
-			     BTREE_ITER_all_snapshots);
-	prev = bch2_btree_iter_prev(trans, &copy);
-	if (!prev.k)
-		goto out;
-
-	ret = bkey_err(prev);
-	if (ret)
-		goto out;
-
-	if (bkey_eq(prev.k->p, k.k->p) &&
-	    bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
-				      prev.k->p.snapshot) > 0) {
-		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
-		bch2_bkey_to_text(&buf1, k.k);
-		bch2_bkey_to_text(&buf2, prev.k);
-
-		panic("iter snap %u\n"
-		      "k    %s\n"
-		      "prev %s\n",
-		      iter->snapshot,
-		      buf1.buf, buf2.buf);
-	}
-out:
-	bch2_trans_iter_exit(trans, &copy);
-	return ret;
-}
-
-void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
-			      struct bpos pos)
-{
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-	struct btree_path *path;
-	struct trans_for_each_path_inorder_iter iter;
-	struct printbuf buf = PRINTBUF;
-
-	btree_trans_sort_paths(trans);
-
-	trans_for_each_path_inorder(trans, path, iter) {
-		if (path->btree_id != id ||
-		    !btree_node_locked(path, 0) ||
-		    !path->should_be_locked)
-			continue;
-
-		if (!path->cached) {
-			if (bkey_ge(pos, path->l[0].b->data->min_key) &&
-			    bkey_le(pos, path->l[0].b->key.k.p))
-				return;
-		} else {
-			if (bkey_eq(pos, path->pos))
-				return;
-		}
-	}
-
-	bch2_dump_trans_paths_updates(trans);
-	bch2_bpos_to_text(&buf, pos);
-
-	panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
-}
-
-static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
-						struct btree_path *path, unsigned l)
-{
-	if (static_branch_unlikely(&bch2_debug_check_iterators))
-		__bch2_btree_path_verify_level(trans, path, l);
-}
-
-static inline void bch2_btree_path_verify(struct btree_trans *trans,
-					  struct btree_path *path)
-{
-	if (static_branch_unlikely(&bch2_debug_check_iterators))
-		__bch2_btree_path_verify(trans, path);
-}
-
-static inline void bch2_btree_iter_verify(struct btree_trans *trans,
-					  struct btree_iter *iter)
-{
-	if (static_branch_unlikely(&bch2_debug_check_iterators))
-		__bch2_btree_iter_verify(trans, iter);
-}
-
-static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
-{
-	if (static_branch_unlikely(&bch2_debug_check_iterators))
-		__bch2_btree_iter_verify_entry_exit(iter);
-}
-
-static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
-					     struct bkey_s_c k)
-{
-	return static_branch_unlikely(&bch2_debug_check_iterators)
-		?
__bch2_btree_iter_verify_ret(trans, iter, k) - : 0; -} - -/* Btree path: fixups after btree updates */ - -static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, - struct btree *b, - struct bset_tree *t, - struct bkey_packed *k) -{ - struct btree_node_iter_set *set; - - btree_node_iter_for_each(iter, set) - if (set->end == t->end_offset) { - set->k = __btree_node_key_to_offset(b, k); - bch2_btree_node_iter_sort(iter, b); - return; - } - - bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); -} - -static void __bch2_btree_path_fix_key_modified(struct btree_path *path, - struct btree *b, - struct bkey_packed *where) -{ - struct btree_path_level *l = &path->l[b->c.level]; - - if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) - return; - - if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0) - bch2_btree_node_iter_advance(&l->iter, l->b); -} - -void bch2_btree_path_fix_key_modified(struct btree_trans *trans, - struct btree *b, - struct bkey_packed *where) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path_with_node(trans, b, path, i) { - __bch2_btree_path_fix_key_modified(path, b, where); - bch2_btree_path_verify_level(trans, path, b->c.level); - } -} - -static void __bch2_btree_node_iter_fix(struct btree_path *path, - struct btree *b, - struct btree_node_iter *node_iter, - struct bset_tree *t, - struct bkey_packed *where, - unsigned clobber_u64s, - unsigned new_u64s) -{ - const struct bkey_packed *end = btree_bkey_last(b, t); - struct btree_node_iter_set *set; - unsigned offset = __btree_node_key_to_offset(b, where); - int shift = new_u64s - clobber_u64s; - unsigned old_end = t->end_offset - shift; - unsigned orig_iter_pos = node_iter->data[0].k; - bool iter_current_key_modified = - orig_iter_pos >= offset && - orig_iter_pos <= offset + clobber_u64s; - - btree_node_iter_for_each(node_iter, set) - if (set->end == old_end) - goto found; - - /* didn't find the bset in the iterator - might have to readd it: */ - if (new_u64s && - bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { - bch2_btree_node_iter_push(node_iter, b, where, end); - goto fixup_done; - } else { - /* Iterator is after key that changed */ - return; - } -found: - set->end = t->end_offset; - - /* Iterator hasn't gotten to the key that changed yet: */ - if (set->k < offset) - return; - - if (new_u64s && - bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { - set->k = offset; - } else if (set->k < offset + clobber_u64s) { - set->k = offset + new_u64s; - if (set->k == set->end) - bch2_btree_node_iter_set_drop(node_iter, set); - } else { - /* Iterator is after key that changed */ - set->k = (int) set->k + shift; - return; - } - - bch2_btree_node_iter_sort(node_iter, b); -fixup_done: - if (node_iter->data[0].k != orig_iter_pos) - iter_current_key_modified = true; - - /* - * When a new key is added, and the node iterator now points to that - * key, the iterator might have skipped past deleted keys that should - * come after the key the iterator now points to. 
We have to rewind to - * before those deleted keys - otherwise - * bch2_btree_node_iter_prev_all() breaks: - */ - if (!bch2_btree_node_iter_end(node_iter) && - iter_current_key_modified && - b->c.level) { - struct bkey_packed *k, *k2, *p; - - k = bch2_btree_node_iter_peek_all(node_iter, b); - - for_each_bset(b, t) { - bool set_pos = false; - - if (node_iter->data[0].end == t->end_offset) - continue; - - k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); - - while ((p = bch2_bkey_prev_all(b, t, k2)) && - bkey_iter_cmp(b, k, p) < 0) { - k2 = p; - set_pos = true; - } - - if (set_pos) - btree_node_iter_set_set_pos(node_iter, - b, t, k2); - } - } -} - -void bch2_btree_node_iter_fix(struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter *node_iter, - struct bkey_packed *where, - unsigned clobber_u64s, - unsigned new_u64s) -{ - struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); - struct btree_path *linked; - unsigned i; - - if (node_iter != &path->l[b->c.level].iter) { - __bch2_btree_node_iter_fix(path, b, node_iter, t, - where, clobber_u64s, new_u64s); - - if (static_branch_unlikely(&bch2_debug_check_iterators)) - bch2_btree_node_iter_verify(node_iter, b); - } - - trans_for_each_path_with_node(trans, b, linked, i) { - __bch2_btree_node_iter_fix(linked, b, - &linked->l[b->c.level].iter, t, - where, clobber_u64s, new_u64s); - bch2_btree_path_verify_level(trans, linked, b->c.level); - } -} - -/* Btree path level: pointer to a particular btree node and node iter */ - -static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, - struct btree_path_level *l, - struct bkey *u, - struct bkey_packed *k) -{ - if (unlikely(!k)) { - /* - * signal to bch2_btree_iter_peek_slot() that we're currently at - * a hole - */ - u->type = KEY_TYPE_deleted; - return bkey_s_c_null; - } - - return bkey_disassemble(l->b, k, u); -} - -static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, - struct btree_path_level *l, - struct bkey *u) -{ - return __btree_iter_unpack(c, l, u, - bch2_btree_node_iter_peek_all(&l->iter, l->b)); -} - -static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, - struct btree_path *path, - struct btree_path_level *l, - struct bkey *u) -{ - struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, - bch2_btree_node_iter_prev(&l->iter, l->b)); - - path->pos = k.k ? 
k.k->p : l->b->data->min_key; - trans->paths_sorted = false; - bch2_btree_path_verify_level(trans, path, l - path->l); - return k; -} - -static inline bool btree_path_advance_to_pos(struct btree_path *path, - struct btree_path_level *l, - int max_advance) -{ - struct bkey_packed *k; - int nr_advanced = 0; - - while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && - bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { - if (max_advance > 0 && nr_advanced >= max_advance) - return false; - - bch2_btree_node_iter_advance(&l->iter, l->b); - nr_advanced++; - } - - return true; -} - -static inline void __btree_path_level_init(struct btree_path *path, - unsigned level) -{ - struct btree_path_level *l = &path->l[level]; - - bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); - - /* - * Iterators to interior nodes should always be pointed at the first non - * whiteout: - */ - if (level) - bch2_btree_node_iter_peek(&l->iter, l->b); -} - -void bch2_btree_path_level_init(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - BUG_ON(path->cached); - - EBUG_ON(!btree_path_pos_in_node(path, b)); - - path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); - path->l[b->c.level].b = b; - __btree_path_level_init(path, b->c.level); -} - -/* Btree path: fixups after btree node updates: */ - -static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b) -{ - struct bch_fs *c = trans->c; - - trans_for_each_update(trans, i) - if (!i->cached && - i->level == b->c.level && - i->btree_id == b->c.btree_id && - bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && - bpos_cmp(i->k->k.p, b->data->max_key) <= 0) { - i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v; - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, i->btree_id, i->level, - i->k->k.p); - - if (j_k) { - i->old_k = j_k->k; - i->old_v = &j_k->v; - } - } - } -} - -/* - * A btree node is being replaced - update the iterator to point to the new - * node: - */ -void bch2_trans_node_add(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - struct btree_path *prev; - - BUG_ON(!btree_path_pos_in_node(path, b)); - - while ((prev = prev_btree_path(trans, path)) && - btree_path_pos_in_node(prev, b)) - path = prev; - - for (; - path && btree_path_pos_in_node(path, b); - path = next_btree_path(trans, path)) - if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) { - enum btree_node_locked_type t = - btree_lock_want(path, b->c.level); - - if (t != BTREE_NODE_UNLOCKED) { - btree_node_unlock(trans, path, b->c.level); - six_lock_increment(&b->c.lock, (enum six_lock_type) t); - mark_btree_node_locked(trans, path, b->c.level, t); - } - - bch2_btree_path_level_init(trans, path, b); - } - - bch2_trans_revalidate_updates_in_node(trans, b); -} - -void bch2_trans_node_drop(struct btree_trans *trans, - struct btree *b) -{ - struct btree_path *path; - unsigned i, level = b->c.level; - - trans_for_each_path(trans, path, i) - if (path->l[level].b == b) { - btree_node_unlock(trans, path, level); - path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); - } -} - -/* - * A btree node has been modified in such a way as to invalidate iterators - fix - * them: - */ -void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path_with_node(trans, b, path, i) - __btree_path_level_init(path, b->c.level); - - 
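/* trans updates cache the old key/value for keys in this node - re-read them: */ -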
bch2_trans_revalidate_updates_in_node(trans, b); -} - -/* Btree path: traverse, set_pos: */ - -static inline int btree_path_lock_root(struct btree_trans *trans, - struct btree_path *path, - unsigned depth_want, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_root *r = bch2_btree_id_root(c, path->btree_id); - enum six_lock_type lock_type; - unsigned i; - int ret; - - EBUG_ON(path->nodes_locked); - - while (1) { - struct btree *b = READ_ONCE(r->b); - if (unlikely(!b)) { - BUG_ON(!r->error); - return r->error; - } - - path->level = READ_ONCE(b->c.level); - - if (unlikely(path->level < depth_want)) { - /* - * the root is at a lower depth than the depth we want: - * got to the end of the btree, or we're walking nodes - * greater than some depth and there are no nodes >= - * that depth - */ - path->level = depth_want; - for (i = path->level; i < BTREE_MAX_DEPTH; i++) - path->l[i].b = NULL; - return 1; - } - - lock_type = __btree_lock_want(path, path->level); - ret = btree_node_lock(trans, path, &b->c, - path->level, lock_type, trace_ip); - if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - BUG(); - } - - if (likely(b == READ_ONCE(r->b) && - b->c.level == path->level && - !race_fault())) { - for (i = 0; i < path->level; i++) - path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root); - path->l[path->level].b = b; - for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) - path->l[i].b = NULL; - - mark_btree_node_locked(trans, path, path->level, - (enum btree_node_locked_type) lock_type); - bch2_btree_path_level_init(trans, path, b); - return 0; - } - - six_unlock_type(&b->c.lock, lock_type); - } -} - -noinline -static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path) -{ - struct bch_fs *c = trans->c; - struct btree_path_level *l = path_l(path); - struct btree_node_iter node_iter = l->iter; - struct bkey_packed *k; - struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_started, &c->flags) - ? (path->level > 1 ? 0 : 2) - : (path->level > 1 ? 1 : 16); - bool was_locked = btree_node_locked(path, path->level); - int ret = 0; - - bch2_bkey_buf_init(&tmp); - - while (nr-- && !ret) { - if (!bch2_btree_node_relock(trans, path, path->level)) - break; - - bch2_btree_node_iter_advance(&node_iter, l->b); - k = bch2_btree_node_iter_peek(&node_iter, l->b); - if (!k) - break; - - bch2_bkey_buf_unpack(&tmp, c, l->b, k); - ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, - path->level - 1); - } - - if (!was_locked) - btree_node_unlock(trans, path, path->level); - - bch2_bkey_buf_exit(&tmp, c); - return ret; -} - -static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, - struct btree_and_journal_iter *jiter) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c k; - struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_started, &c->flags) - ? (path->level > 1 ? 0 : 2) - : (path->level > 1 ? 
1 : 16); - bool was_locked = btree_node_locked(path, path->level); - int ret = 0; - - bch2_bkey_buf_init(&tmp); - - jiter->fail_if_too_many_whiteouts = true; - - while (nr-- && !ret) { - if (!bch2_btree_node_relock(trans, path, path->level)) - break; - - bch2_btree_and_journal_iter_advance(jiter); - k = bch2_btree_and_journal_iter_peek(jiter); - if (!k.k) - break; - - bch2_bkey_buf_reassemble(&tmp, c, k); - ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, - path->level - 1); - } - - if (!was_locked) - btree_node_unlock(trans, path, path->level); - - bch2_bkey_buf_exit(&tmp, c); - return ret; -} - -static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, - struct btree_path *path, - unsigned plevel, struct btree *b) -{ - struct btree_path_level *l = &path->l[plevel]; - bool locked = btree_node_locked(path, plevel); - struct bkey_packed *k; - struct bch_btree_ptr_v2 *bp; - - if (!bch2_btree_node_relock(trans, path, plevel)) - return; - - k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); - - bp = (void *) bkeyp_val(&l->b->format, k); - bp->mem_ptr = (unsigned long)b; - - if (!locked) - btree_node_unlock(trans, path, plevel); -} - -static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, - struct btree_path *path, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_path_level *l = path_l(path); - struct btree_and_journal_iter jiter; - struct bkey_s_c k; - int ret = 0; - - __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); - - k = bch2_btree_and_journal_iter_peek(&jiter); - if (!k.k) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "node not found at pos "); - bch2_bpos_to_text(&buf, path->pos); - prt_str(&buf, " at btree "); - bch2_btree_pos_to_text(&buf, c, l->b); - - ret = bch2_fs_topology_error(c, "%s", buf.buf); - printbuf_exit(&buf); - goto err; - } - - bkey_reassemble(&trans->btree_path_down, k); - - if ((flags & BTREE_ITER_prefetch) && - c->opts.btree_node_prefetch) - ret = btree_path_prefetch_j(trans, path, &jiter); - -err: - bch2_btree_and_journal_iter_exit(&jiter); - return ret; -} - -static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans, - struct btree_path *path) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "node not found at pos "); - bch2_bpos_to_text(&buf, path->pos); - prt_str(&buf, " within parent node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key)); - - bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); - return bch_err_throw(c, btree_need_topology_repair); -} - -static __always_inline int btree_path_down(struct btree_trans *trans, - struct btree_path *path, - unsigned flags, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_path_level *l = path_l(path); - struct btree *b; - unsigned level = path->level - 1; - enum six_lock_type lock_type = __btree_lock_want(path, level); - int ret; - - EBUG_ON(!btree_node_locked(path, path->level)); - - if (unlikely(trans->journal_replay_not_finished)) { - ret = btree_node_iter_and_journal_peek(trans, path, flags); - if (ret) - return ret; - } else { - struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b); - if (unlikely(!k)) - return btree_node_missing_err(trans, path); - - bch2_bkey_unpack(l->b, &trans->btree_path_down, k); - - if (unlikely((flags & BTREE_ITER_prefetch)) && - c->opts.btree_node_prefetch) { - ret = btree_path_prefetch(trans, path); 
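- /* readahead errors are returned to the caller: */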
- if (ret) - return ret; - } - } - - b = bch2_btree_node_get(trans, path, &trans->btree_path_down, - level, lock_type, trace_ip); - ret = PTR_ERR_OR_ZERO(b); - if (unlikely(ret)) - return ret; - - if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) && - likely(!trans->journal_replay_not_finished && - trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2)) - btree_node_mem_ptr_set(trans, path, level + 1, b); - - if (btree_node_read_locked(path, level + 1)) - btree_node_unlock(trans, path, level + 1); - - mark_btree_node_locked(trans, path, level, - (enum btree_node_locked_type) lock_type); - path->level = level; - bch2_btree_path_level_init(trans, path, b); - - bch2_btree_path_verify_locks(trans, path); - return 0; -} - -static int bch2_btree_path_traverse_all(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_path *path; - unsigned long trace_ip = _RET_IP_; - unsigned i; - int ret = 0; - - if (trans->in_traverse_all) - return bch_err_throw(c, transaction_restart_in_traverse_all); - - trans->in_traverse_all = true; -retry_all: - trans->restarted = 0; - trans->last_restarted_ip = 0; - - trans_for_each_path(trans, path, i) - path->should_be_locked = false; - - btree_trans_sort_paths(trans); - - bch2_trans_unlock(trans); - cond_resched(); - trans_set_locked(trans, false); - - if (unlikely(trans->memory_allocation_failure)) { - struct closure cl; - - closure_init_stack(&cl); - - do { - ret = bch2_btree_cache_cannibalize_lock(trans, &cl); - closure_sync(&cl); - } while (ret); - } - - /* Now, redo traversals in correct order: */ - i = 0; - while (i < trans->nr_sorted) { - btree_path_idx_t idx = trans->sorted[i]; - - /* - * Traversing a path can cause another path to be added at about - * the same position: - */ - if (trans->paths[idx].uptodate) { - __btree_path_get(trans, &trans->paths[idx], false); - ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_); - __btree_path_put(trans, &trans->paths[idx], false); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, ENOMEM)) - goto retry_all; - if (ret) - goto err; - } else { - i++; - } - } - - /* - * We used to assert that all paths had been traversed here - * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since - * path->should_be_locked is not set yet, we might have unlocked and - * then failed to relock a path - that's fine. 
- */ -err: - bch2_btree_cache_cannibalize_unlock(trans); - - trans->in_traverse_all = false; - - trace_and_count(c, trans_traverse_all, trans, trace_ip); - return ret; -} - -static inline bool btree_path_check_pos_in_node(struct btree_path *path, - unsigned l, int check_pos) -{ - if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) - return false; - if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) - return false; - return true; -} - -static inline bool btree_path_good_node(struct btree_trans *trans, - struct btree_path *path, - unsigned l, int check_pos) -{ - return is_btree_node(path, l) && - bch2_btree_node_relock(trans, path, l) && - btree_path_check_pos_in_node(path, l, check_pos); -} - -static void btree_path_set_level_down(struct btree_trans *trans, - struct btree_path *path, - unsigned new_level) -{ - unsigned l; - - path->level = new_level; - - for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) - if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) - btree_node_unlock(trans, path, l); - - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - bch2_btree_path_verify(trans, path); -} - -static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans, - struct btree_path *path, - int check_pos) -{ - unsigned i, l = path->level; -again: - while (btree_path_node(path, l) && - !btree_path_good_node(trans, path, l, check_pos)) - __btree_path_set_level_up(trans, path, l++); - - /* If we need intent locks, take them too: */ - for (i = l + 1; - i < path->locks_want && btree_path_node(path, i); - i++) - if (!bch2_btree_node_relock(trans, path, i)) { - while (l <= i) - __btree_path_set_level_up(trans, path, l++); - goto again; - } - - return l; -} - -static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, - struct btree_path *path, - int check_pos) -{ - return likely(btree_node_locked(path, path->level) && - btree_path_check_pos_in_node(path, path->level, check_pos)) - ? path->level - : __btree_path_up_until_good_node(trans, path, check_pos); -} - -/* - * This is the main state machine for walking down the btree - walks down to a - * specified depth - * - * Returns 0 on success, -EIO on error (error reading in a btree node). - * - * On error, caller (peek_node()/peek_key()) must return NULL; the error is - * stashed in the iterator and returned from bch2_trans_exit(). 
- */ -int bch2_btree_path_traverse_one(struct btree_trans *trans, - btree_path_idx_t path_idx, - unsigned flags, - unsigned long trace_ip) -{ - struct btree_path *path = &trans->paths[path_idx]; - unsigned depth_want = path->level; - int ret = -((int) trans->restarted); - - if (unlikely(ret)) - goto out; - - if (unlikely(!trans->srcu_held)) - bch2_trans_srcu_lock(trans); - - trace_btree_path_traverse_start(trans, path); - - /* - * Ensure we obey path->should_be_locked: if it's set, we can't unlock - * and re-traverse the path without a transaction restart: - */ - if (path->should_be_locked) { - ret = bch2_btree_path_relock(trans, path, trace_ip); - goto out; - } - - if (path->cached) { - ret = bch2_btree_path_traverse_cached(trans, path_idx, flags); - goto out; - } - - path = &trans->paths[path_idx]; - - if (unlikely(path->level >= BTREE_MAX_DEPTH)) - goto out_uptodate; - - path->level = btree_path_up_until_good_node(trans, path, 0); - unsigned max_level = path->level; - - EBUG_ON(btree_path_node(path, path->level) && - !btree_node_locked(path, path->level)); - - /* - * Note: path->nodes[path->level] may be temporarily NULL here - that - * would indicate to other code that we got to the end of the btree, - * here it indicates that relocking the root failed - it's critical that - * btree_path_lock_root() comes next and that it can't fail - */ - while (path->level > depth_want) { - ret = btree_path_node(path, path->level) - ? btree_path_down(trans, path, flags, trace_ip) - : btree_path_lock_root(trans, path, depth_want, trace_ip); - if (unlikely(ret)) { - if (ret == 1) { - /* - * No nodes at this level - got to the end of - * the btree: - */ - ret = 0; - goto out; - } - - __bch2_btree_path_unlock(trans, path); - path->level = depth_want; - path->l[path->level].b = ERR_PTR(ret); - goto out; - } - } - - if (unlikely(max_level > path->level)) { - struct btree_path *linked; - unsigned iter; - - trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter) - for (unsigned j = path->level + 1; j < max_level; j++) - linked->l[j] = path->l[j]; - } - -out_uptodate: - path->uptodate = BTREE_ITER_UPTODATE; - trace_btree_path_traverse_end(trans, path); -out: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) - panic("ret %s (%i) trans->restarted %s (%i)\n", - bch2_err_str(ret), ret, - bch2_err_str(trans->restarted), trans->restarted); - bch2_btree_path_verify(trans, path); - return ret; -} - -static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, - struct btree_path *src) -{ - unsigned i, offset = offsetof(struct btree_path, pos); - - memcpy((void *) dst + offset, - (void *) src + offset, - sizeof(struct btree_path) - offset); - - for (i = 0; i < BTREE_MAX_DEPTH; i++) { - unsigned t = btree_node_locked_type(dst, i); - - if (t != BTREE_NODE_UNLOCKED) - six_lock_increment(&dst->l[i].b->c.lock, t); - } -} - -static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src, - bool intent, unsigned long ip) -{ - btree_path_idx_t new = btree_path_alloc(trans, src); - btree_path_copy(trans, trans->paths + new, trans->paths + src); - __btree_path_get(trans, trans->paths + new, intent); -#ifdef TRACK_PATH_ALLOCATED - trans->paths[new].ip_allocated = ip; -#endif - return new; -} - -__flatten -btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans, - btree_path_idx_t path, bool intent, unsigned long ip) -{ - struct btree_path *old = trans->paths + path; - __btree_path_put(trans, trans->paths + path, intent); - 
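/* drop our ref on the shared path, then clone it to get a private copy: */ -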
path = btree_path_clone(trans, path, intent, ip); - trace_btree_path_clone(trans, old, trans->paths + path); - trans->paths[path].preserve = false; - return path; -} - -btree_path_idx_t __must_check -__bch2_btree_path_set_pos(struct btree_trans *trans, - btree_path_idx_t path_idx, struct bpos new_pos, - bool intent, unsigned long ip) -{ - int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos); - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - EBUG_ON(!trans->paths[path_idx].ref); - - trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos); - - path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip); - - struct btree_path *path = trans->paths + path_idx; - path->pos = new_pos; - trans->paths_sorted = false; - - if (unlikely(path->cached)) { - btree_node_unlock(trans, path, 0); - path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - goto out; - } - - unsigned level = btree_path_up_until_good_node(trans, path, cmp); - - if (btree_path_node(path, level)) { - struct btree_path_level *l = &path->l[level]; - - BUG_ON(!btree_node_locked(path, level)); - /* - * We might have to skip over many keys, or just a few: try - * advancing the node iterator, and if we have to skip over too - * many keys just reinit it (or if we're rewinding, since that - * is expensive). - */ - if (cmp < 0 || - !btree_path_advance_to_pos(path, l, 8)) - bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); - - /* - * Iterators to interior nodes should always be pointed at the first non - * whiteout: - */ - if (unlikely(level)) - bch2_btree_node_iter_peek(&l->iter, l->b); - } - - if (unlikely(level != path->level)) { - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - __bch2_btree_path_unlock(trans, path); - } -out: - bch2_btree_path_verify(trans, path); - return path_idx; -} - -/* Btree path: main interface: */ - -static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) -{ - struct btree_path *sib; - - sib = prev_btree_path(trans, path); - if (sib && !btree_path_cmp(sib, path)) - return sib; - - sib = next_btree_path(trans, path); - if (sib && !btree_path_cmp(sib, path)) - return sib; - - return NULL; -} - -static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) -{ - struct btree_path *sib; - - sib = prev_btree_path(trans, path); - if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) - return sib; - - sib = next_btree_path(trans, path); - if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) - return sib; - - return NULL; -} - -static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path) -{ - __bch2_btree_path_unlock(trans, trans->paths + path); - btree_path_list_remove(trans, trans->paths + path); - __clear_bit(path, trans->paths_allocated); -} - -static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path) -{ - unsigned l = path->level; - - do { - if (!btree_path_node(path, l)) - break; - - if (!is_btree_node(path, l)) - return false; - - if (path->l[l].lock_seq != path->l[l].b->c.lock.seq) - return false; - - l++; - } while (l < path->locks_want); - - return true; -} - -void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) -{ - struct btree_path *path = trans->paths + path_idx, *dup = NULL; - - if (!__btree_path_put(trans, path, intent)) - return; - - if (!path->preserve && !path->should_be_locked) - goto free; - - 
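/* see if another path at the same position can take over this one's state: */ -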
dup = path->preserve - ? have_path_at_pos(trans, path) - : have_node_at_pos(trans, path); - if (!dup) - return; - - /* - * If we need this path locked, the duplicate also has to be locked - * before we free this one: - */ - if (path->should_be_locked && - !dup->should_be_locked && - !trans->restarted) { - if (!(trans->locked - ? bch2_btree_path_relock_norestart(trans, dup) - : bch2_btree_path_can_relock(trans, dup))) - return; - - dup->should_be_locked = true; - } - - BUG_ON(path->should_be_locked && - !trans->restarted && - trans->locked && - !btree_node_locked(dup, dup->level)); - - path->should_be_locked = false; - dup->preserve |= path->preserve; -free: - trace_btree_path_free(trans, path_idx, dup); - __bch2_path_free(trans, path_idx); -} - -void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) -{ - panic("trans->restart_count %u, should be %u, last restarted by %pS\n", - trans->restart_count, restart_count, - (void *) trans->last_begin_ip); -} - -static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct printbuf buf = PRINTBUF; - bch2_prt_backtrace(&buf, &trans->last_restarted_trace); - panic("in transaction restart: %s, last restarted by\n%s", - bch2_err_str(trans->restarted), - buf.buf); -#else - panic("in transaction restart: %s, last restarted by %pS\n", - bch2_err_str(trans->restarted), - (void *) trans->last_restarted_ip); -#endif -} - -void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans) -{ - if (trans->restarted) - bch2_trans_in_restart_error(trans); - - if (!trans->locked) - panic("trans should be locked, unlocked by %pS\n", - (void *) trans->last_unlock_ip); - - BUG(); -} - -noinline __cold -void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) -{ - prt_printf(buf, "%u transaction updates for %s journal seq %llu\n", - trans->nr_updates, trans->fn, trans->journal_res.seq); - printbuf_indent_add(buf, 2); - - trans_for_each_update(trans, i) { - struct bkey_s_c old = { &i->old_k, i->old_v }; - - prt_str(buf, "update: btree="); - bch2_btree_id_to_text(buf, i->btree_id); - prt_printf(buf, " cached=%u %pS\n", - i->cached, - (void *) i->ip_allocated); - - prt_printf(buf, " old "); - bch2_bkey_val_to_text(buf, trans->c, old); - prt_newline(buf); - - prt_printf(buf, " new "); - bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); - prt_newline(buf); - } - - for (struct jset_entry *e = btree_trans_journal_entries_start(trans); - e != btree_trans_journal_entries_top(trans); - e = vstruct_next(e)) { - bch2_journal_entry_to_text(buf, trans->c, e); - prt_newline(buf); - } - - printbuf_indent_sub(buf, 2); -} - -static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) -{ - struct btree_path *path = trans->paths + path_idx; - - prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ", - path_idx, path->ref, path->intent_ref, - path->preserve ? 'P' : ' ', - path->should_be_locked ? 'S' : ' ', - path->cached ? 
'C' : 'B'); - bch2_btree_id_level_to_text(out, path->btree_id, path->level); - prt_str(out, " pos "); - bch2_bpos_to_text(out, path->pos); - - if (!path->cached && btree_node_locked(path, path->level)) { - prt_char(out, ' '); - struct btree *b = path_l(path)->b; - bch2_bpos_to_text(out, b->data->min_key); - prt_char(out, '-'); - bch2_bpos_to_text(out, b->key.k.p); - } - -#ifdef TRACK_PATH_ALLOCATED - prt_printf(out, " %pS", (void *) path->ip_allocated); -#endif -} - -static const char *btree_node_locked_str(enum btree_node_locked_type t) -{ - switch (t) { - case BTREE_NODE_UNLOCKED: - return "unlocked"; - case BTREE_NODE_READ_LOCKED: - return "read"; - case BTREE_NODE_INTENT_LOCKED: - return "intent"; - case BTREE_NODE_WRITE_LOCKED: - return "write"; - default: - return NULL; - } -} - -void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) -{ - bch2_btree_path_to_text_short(out, trans, path_idx); - - struct btree_path *path = trans->paths + path_idx; - - prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want); - prt_newline(out); - - printbuf_indent_add(out, 2); - for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { - prt_printf(out, "l=%u locks %s seq %u node ", l, - btree_node_locked_str(btree_node_locked_type(path, l)), - path->l[l].lock_seq); - - int ret = PTR_ERR_OR_ZERO(path->l[l].b); - if (ret) - prt_str(out, bch2_err_str(ret)); - else - prt_printf(out, "%px", path->l[l].b); - prt_newline(out); - } - printbuf_indent_sub(out, 2); -} - -static noinline __cold -void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, - bool nosort) -{ - struct trans_for_each_path_inorder_iter iter; - - if (!nosort) - btree_trans_sort_paths(trans); - - trans_for_each_path_idx_inorder(trans, iter) { - bch2_btree_path_to_text_short(out, trans, iter.path_idx); - prt_newline(out); - } -} - -noinline __cold -void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) -{ - __bch2_trans_paths_to_text(out, trans, false); -} - -static noinline __cold -void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) -{ - struct printbuf buf = PRINTBUF; - - __bch2_trans_paths_to_text(&buf, trans, nosort); - bch2_trans_updates_to_text(&buf, trans); - - bch2_print_str(trans->c, KERN_ERR, buf.buf); - printbuf_exit(&buf); -} - -noinline __cold -void bch2_dump_trans_paths_updates(struct btree_trans *trans) -{ - __bch2_dump_trans_paths_updates(trans, false); -} - -noinline __cold -static void bch2_trans_update_max_paths(struct btree_trans *trans) -{ - struct btree_transaction_stats *s = btree_trans_stats(trans); - struct printbuf buf = PRINTBUF; - size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths); - - bch2_trans_paths_to_text(&buf, trans); - - if (!buf.allocation_failure) { - mutex_lock(&s->lock); - if (nr > s->nr_max_paths) { - s->nr_max_paths = nr; - swap(s->max_paths_text, buf.buf); - } - mutex_unlock(&s->lock); - } - - printbuf_exit(&buf); - - trans->nr_paths_max = nr; -} - -noinline __cold -int __bch2_btree_trans_too_many_iters(struct btree_trans *trans) -{ - if (trace_trans_restart_too_many_iters_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_trans_paths_to_text(&buf, trans); - trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf); - printbuf_exit(&buf); - } - - count_event(trans->c, trans_restart_too_many_iters); - - return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); -} - -static noinline void btree_path_overflow(struct 
btree_trans *trans) -{ - bch2_dump_trans_paths_updates(trans); - bch_err(trans->c, "trans path overflow"); -} - -static noinline void btree_paths_realloc(struct btree_trans *trans) -{ - unsigned nr = trans->nr_paths * 2; - - void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + - sizeof(struct btree_trans_paths) + - nr * sizeof(struct btree_path) + - nr * sizeof(btree_path_idx_t) + 8 + - nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL); - - unsigned long *paths_allocated = p; - memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long)); - p += BITS_TO_LONGS(nr) * sizeof(unsigned long); - - p += sizeof(struct btree_trans_paths); - struct btree_path *paths = p; - *trans_paths_nr(paths) = nr; - memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path)); - p += nr * sizeof(struct btree_path); - - btree_path_idx_t *sorted = p; - memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t)); - p += nr * sizeof(btree_path_idx_t) + 8; - - struct btree_insert_entry *updates = p; - memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry)); - - unsigned long *old = trans->paths_allocated; - - rcu_assign_pointer(trans->paths_allocated, paths_allocated); - rcu_assign_pointer(trans->paths, paths); - rcu_assign_pointer(trans->sorted, sorted); - rcu_assign_pointer(trans->updates, updates); - - trans->nr_paths = nr; - - if (old != trans->_paths_allocated) - kfree_rcu_mightsleep(old); -} - -static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans, - btree_path_idx_t pos) -{ - btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths); - - if (unlikely(idx == trans->nr_paths)) { - if (trans->nr_paths == BTREE_ITER_MAX) { - btree_path_overflow(trans); - return 0; - } - - btree_paths_realloc(trans); - } - - /* - * Do this before marking the new path as allocated, since it won't be - * initialized yet: - */ - if (unlikely(idx > trans->nr_paths_max)) - bch2_trans_update_max_paths(trans); - - __set_bit(idx, trans->paths_allocated); - - struct btree_path *path = &trans->paths[idx]; - path->ref = 0; - path->intent_ref = 0; - path->nodes_locked = 0; - - btree_path_list_add(trans, pos, idx); - trans->paths_sorted = false; - return idx; -} - -btree_path_idx_t bch2_path_get(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - unsigned locks_want, unsigned level, - unsigned flags, unsigned long ip) -{ - struct btree_path *path; - bool cached = flags & BTREE_ITER_cached; - bool intent = flags & BTREE_ITER_intent; - struct trans_for_each_path_inorder_iter iter; - btree_path_idx_t path_pos = 0, path_idx; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_trans_verify_locks(trans); - - btree_trans_sort_paths(trans); - - if (intent) - locks_want = max(locks_want, level + 1); - locks_want = min(locks_want, BTREE_MAX_DEPTH); - - trans_for_each_path_inorder(trans, path, iter) { - if (__btree_path_cmp(path, - btree_id, - cached, - pos, - level) > 0) - break; - - path_pos = iter.path_idx; - } - - if (path_pos && - trans->paths[path_pos].cached == cached && - trans->paths[path_pos].btree_id == btree_id && - trans->paths[path_pos].level == level && - bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) { - trace_btree_path_get(trans, trans->paths + path_pos, &pos); - - __btree_path_get(trans, trans->paths + path_pos, intent); - path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); - path = 
trans->paths + path_idx; - } else { - path_idx = btree_path_alloc(trans, path_pos); - path = trans->paths + path_idx; - - __btree_path_get(trans, path, intent); - path->pos = pos; - path->btree_id = btree_id; - path->cached = cached; - path->uptodate = BTREE_ITER_NEED_TRAVERSE; - path->should_be_locked = false; - path->level = level; - path->locks_want = locks_want; - path->nodes_locked = 0; - for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++) - path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); -#ifdef TRACK_PATH_ALLOCATED - path->ip_allocated = ip; -#endif - trans->paths_sorted = false; - - trace_btree_path_alloc(trans, path); - } - - if (!(flags & BTREE_ITER_nopreserve)) - path->preserve = true; - - /* - * If the path has locks_want greater than requested, we don't downgrade - * it here - on transaction restart because btree node split needs to - * upgrade locks, we might be putting/getting the iterator again. - * Downgrading iterators only happens via bch2_trans_downgrade(), after - * a successful transaction commit. - */ - - return path_idx; -} - -btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans, - enum btree_id btree_id, - unsigned level, - struct bpos pos) -{ - btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, - BTREE_ITER_nopreserve| - BTREE_ITER_intent, _RET_IP_); - path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); - - struct btree_path *path = trans->paths + path_idx; - bch2_btree_path_downgrade(trans, path); - __bch2_btree_path_unlock(trans, path); - return path_idx; -} - -struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) -{ - - struct btree_path_level *l = path_l(path); - struct bkey_packed *_k; - struct bkey_s_c k; - - if (unlikely(!l->b)) - return bkey_s_c_null; - - EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); - EBUG_ON(!btree_node_locked(path, path->level)); - - if (!path->cached) { - _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - k = _k ? 
bkey_disassemble(l->b, _k, u) : bkey_s_c_null; - - EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos)); - - if (!k.k || !bpos_eq(path->pos, k.k->p)) - goto hole; - } else { - struct bkey_cached *ck = (void *) path->l[0].b; - if (!ck) - return bkey_s_c_null; - - EBUG_ON(path->btree_id != ck->key.btree_id || - !bkey_eq(path->pos, ck->key.pos)); - - *u = ck->k->k; - k = (struct bkey_s_c) { u, &ck->k->v }; - } - - return k; -hole: - bkey_init(u); - u->p = path->pos; - return (struct bkey_s_c) { u, NULL }; -} - -void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) -{ - if (!iter->path || trans->restarted) - return; - - struct btree_path *path = btree_iter_path(trans, iter); - path->preserve = false; - if (path->ref == 1) - path->should_be_locked = false; -} -/* Btree iterators: */ - -int __must_check -__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) -{ - return bch2_btree_path_traverse(trans, iter->path, iter->flags); -} - -int __must_check -bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) -{ - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - iter->path = bch2_btree_path_set_pos(trans, iter->path, - btree_iter_search_key(iter), - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - - int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (ret) - return ret; - - struct btree_path *path = btree_iter_path(trans, iter); - if (btree_path_node(path, path->level)) - btree_path_set_should_be_locked(trans, path); - return 0; -} - -/* Iterate across nodes (leaf and interior nodes) */ - -struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans, - struct btree_iter *iter) -{ - struct btree *b = NULL; - int ret; - - EBUG_ON(trans->paths[iter->path].cached); - bch2_btree_iter_verify(trans, iter); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (ret) - goto err; - - struct btree_path *path = btree_iter_path(trans, iter); - b = btree_path_node(path, path->level); - if (!b) - goto out; - - BUG_ON(bpos_lt(b->key.k.p, iter->pos)); - - bkey_init(&iter->k); - iter->k.p = iter->pos = b->key.k.p; - - iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); -out: - bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(trans, iter); - - return b; -err: - b = ERR_PTR(ret); - goto out; -} - -/* Only kept for -tools */ -struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans, - struct btree_iter *iter) -{ - struct btree *b; - - while (b = bch2_btree_iter_peek_node(trans, iter), - bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) - bch2_trans_begin(trans); - - return b; -} - -struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter) -{ - struct btree *b = NULL; - int ret; - - EBUG_ON(trans->paths[iter->path].cached); - bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify(trans, iter); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (ret) - goto err; - - - struct btree_path *path = btree_iter_path(trans, iter); - - /* already at end? */ - if (!btree_path_node(path, path->level)) - return NULL; - - /* got to end? 
*/ - if (!btree_path_node(path, path->level + 1)) { - path->should_be_locked = false; - btree_path_set_level_up(trans, path); - return NULL; - } - - /* - * We don't correctly handle nodes with extra intent locks here: - * downgrade so we don't violate locking invariants - */ - bch2_btree_path_downgrade(trans, path); - - if (!bch2_btree_node_relock(trans, path, path->level + 1)) { - trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); - __bch2_btree_path_unlock(trans, path); - path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); - path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - goto err; - } - - b = btree_path_node(path, path->level + 1); - - if (bpos_eq(iter->pos, b->key.k.p)) { - __btree_path_set_level_up(trans, path, path->level++); - } else { - if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED) - btree_node_unlock(trans, path, path->level + 1); - - /* - * Haven't gotten to the end of the parent node: go back down to - * the next child node - */ - iter->path = bch2_btree_path_set_pos(trans, iter->path, - bpos_successor(iter->pos), - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - - path = btree_iter_path(trans, iter); - btree_path_set_level_down(trans, path, iter->min_depth); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (ret) - goto err; - - path = btree_iter_path(trans, iter); - b = path->l[path->level].b; - } - - bkey_init(&iter->k); - iter->k.p = iter->pos = b->key.k.p; - - iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); - EBUG_ON(btree_iter_path(trans, iter)->uptodate); -out: - bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(trans, iter); - - return b; -err: - b = ERR_PTR(ret); - goto out; -} - -/* Iterate across keys (in leaf nodes only) */ - -inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter) -{ - struct bpos pos = iter->k.p; - bool ret = !(iter->flags & BTREE_ITER_all_snapshots - ? bpos_eq(pos, SPOS_MAX) - : bkey_eq(pos, SPOS_MAX)); - - if (ret && !(iter->flags & BTREE_ITER_is_extents)) - pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(trans, iter, pos); - return ret; -} - -inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter) -{ - struct bpos pos = bkey_start_pos(&iter->k); - bool ret = !(iter->flags & BTREE_ITER_all_snapshots - ? bpos_eq(pos, POS_MIN) - : bkey_eq(pos, POS_MIN)); - - if (ret && !(iter->flags & BTREE_ITER_is_extents)) - pos = bkey_predecessor(iter, pos); - bch2_btree_iter_set_pos(trans, iter, pos); - return ret; -} - -static noinline -void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter, - struct bpos search_key, struct bkey_s_c *k) -{ - struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key; - - trans_for_each_update(trans, i) - if (!i->key_cache_already_flushed && - i->btree_id == iter->btree_id && - bpos_le(i->k->k.p, search_key) && - bpos_ge(i->k->k.p, k->k ? 
k->k->p : end)) { - iter->k = i->k->k; - *k = bkey_i_to_s_c(i->k); - } -} - -static noinline -void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter, - struct bpos search_key, - struct bkey_s_c *k) -{ - struct btree_path *path = btree_iter_path(trans, iter); - struct bpos end = path_l(path)->b->key.k.p; - - trans_for_each_update(trans, i) - if (!i->key_cache_already_flushed && - i->btree_id == iter->btree_id && - bpos_ge(i->k->k.p, search_key) && - bpos_le(i->k->k.p, k->k ? k->k->p : end)) { - iter->k = i->k->k; - *k = bkey_i_to_s_c(i->k); - } -} - -static noinline -void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k) -{ - trans_for_each_update(trans, i) - if (!i->key_cache_already_flushed && - i->btree_id == iter->btree_id && - bpos_eq(i->k->k.p, iter->pos)) { - iter->k = i->k->k; - *k = bkey_i_to_s_c(i->k); - } -} - -static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos search_pos, - struct bpos end_pos) -{ - struct btree_path *path = btree_iter_path(trans, iter); - - return bch2_journal_keys_peek_max(trans->c, iter->btree_id, - path->level, - search_pos, - end_pos, - &iter->journal_idx); -} - -static noinline -struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, - struct btree_iter *iter) -{ - struct btree_path *path = btree_iter_path(trans, iter); - struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos); - - if (k) { - iter->k = k->k; - return bkey_i_to_s_c(k); - } else { - return bkey_s_c_null; - } -} - -static noinline -void btree_trans_peek_journal(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos search_key, - struct bkey_s_c *k) -{ - struct btree_path *path = btree_iter_path(trans, iter); - struct bkey_i *next_journal = - bch2_btree_journal_peek(trans, iter, search_key, - k->k ? k->k->p : path_l(path)->b->key.k.p); - if (next_journal) { - iter->k = next_journal->k; - *k = bkey_i_to_s_c(next_journal); - } -} - -static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos search_key, - struct bpos end_pos) -{ - struct btree_path *path = btree_iter_path(trans, iter); - - return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id, - path->level, - search_key, - end_pos, - &iter->journal_idx); -} - -static noinline -void btree_trans_peek_prev_journal(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos search_key, - struct bkey_s_c *k) -{ - struct btree_path *path = btree_iter_path(trans, iter); - struct bkey_i *next_journal = - bch2_btree_journal_peek_prev(trans, iter, search_key, - k->k ? 
k->k->p : path_l(path)->b->data->min_key); - - if (next_journal) { - iter->k = next_journal->k; - *k = bkey_i_to_s_c(next_journal); - } -} - -/* - * Checks btree key cache for key at iter->pos and returns it if present, or - * bkey_s_c_null: - */ -static noinline -struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct bkey u; - struct bkey_s_c k; - int ret; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - if ((iter->flags & BTREE_ITER_key_cache_fill) && - bpos_eq(iter->pos, pos)) - return bkey_s_c_null; - - if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) - return bkey_s_c_null; - - if (!iter->key_cache_path) - iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, - iter->flags & BTREE_ITER_intent, 0, - iter->flags|BTREE_ITER_cached| - BTREE_ITER_cached_nofill, - _THIS_IP_); - - iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - iter->flags|BTREE_ITER_cached) ?: - bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_); - if (unlikely(ret)) - return bkey_s_c_err(ret); - - k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); - if (!k.k) - return k; - - if ((iter->flags & BTREE_ITER_all_snapshots) && - !bpos_eq(pos, k.k->p)) - return bkey_s_c_null; - - iter->k = u; - k.k = &iter->k; - btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); - return k; -} - -static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter, - struct bpos search_key) -{ - struct bkey_s_c k, k2; - int ret; - - EBUG_ON(btree_iter_path(trans, iter)->cached); - bch2_btree_iter_verify(trans, iter); - - while (1) { - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(trans, iter, iter->pos); - k = bkey_s_c_err(ret); - break; - } - - struct btree_path *path = btree_iter_path(trans, iter); - struct btree_path_level *l = path_l(path); - - if (unlikely(!l->b)) { - /* No btree nodes at requested level: */ - bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); - k = bkey_s_c_null; - break; - } - - btree_path_set_should_be_locked(trans, path); - - k = btree_path_level_peek_all(trans->c, l, &iter->k); - - if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && - k.k && - (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { - k = k2; - if (bkey_err(k)) { - bch2_btree_iter_set_pos(trans, iter, iter->pos); - break; - } - } - - if (unlikely(iter->flags & BTREE_ITER_with_journal)) - btree_trans_peek_journal(trans, iter, search_key, &k); - - if (unlikely((iter->flags & BTREE_ITER_with_updates) && - trans->nr_updates)) - bch2_btree_trans_peek_updates(trans, iter, search_key, &k); - - if (k.k && bkey_deleted(k.k)) { - /* - * If we've got a whiteout, and it's after the search - * key, advance the search key to the whiteout instead - * of just after the whiteout - it might be a btree - * whiteout, with a real key at the same position, since - * in the btree deleted keys sort before non deleted. - */ - search_key = !bpos_eq(search_key, k.k->p) - ? 
k.k->p - : bpos_successor(k.k->p); - continue; - } - - if (likely(k.k)) { - break; - } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) { - /* Advance to next leaf node: */ - search_key = bpos_successor(l->b->key.k.p); - } else { - /* End of btree: */ - bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); - k = bkey_s_c_null; - break; - } - } - - bch2_btree_iter_verify(trans, iter); - - if (trace___btree_iter_peek_enabled()) { - CLASS(printbuf, buf)(); - - int ret = bkey_err(k); - if (ret) - prt_str(&buf, bch2_err_str(ret)); - else if (k.k) - bch2_bkey_val_to_text(&buf, trans->c, k); - else - prt_str(&buf, "(null)"); - trace___btree_iter_peek(trans->c, buf.buf); - } - - return k; -} - -/** - * bch2_btree_iter_peek_max() - returns first key greater than or equal to - * iterator's current position - * @trans: btree transaction object - * @iter: iterator to peek from - * @end: search limit: returns keys less than or equal to @end - * - * Returns: key if found, or an error extractable with bkey_err(). - */ -struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end) -{ - struct bpos search_key = btree_iter_search_key(iter); - struct bkey_s_c k; - struct bpos iter_pos = iter->pos; - int ret; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); - - ret = trans_maybe_inject_restart(trans, _RET_IP_); - if (unlikely(ret)) { - k = bkey_s_c_err(ret); - goto out_no_locked; - } - - if (iter->update_path) { - bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); - iter->update_path = 0; - } - - while (1) { - k = __bch2_btree_iter_peek(trans, iter, search_key); - if (unlikely(!k.k)) - goto end; - if (unlikely(bkey_err(k))) - goto out_no_locked; - - if (iter->flags & BTREE_ITER_filter_snapshots) { - /* - * We need to check against @end before FILTER_SNAPSHOTS because - * if we get to a different inode than requested we might be - * seeing keys for a different snapshot tree that will all be - * filtered out. - * - * But we can't do the full check here, because bkey_start_pos() - * isn't monotonically increasing before FILTER_SNAPSHOTS, and - * that's what we check against in extents mode: - */ - if (unlikely(!(iter->flags & BTREE_ITER_is_extents) - ? 
bkey_gt(k.k->p, end) - : k.k->p.inode > end.inode)) - goto end; - - if (iter->update_path && - !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { - bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); - iter->update_path = 0; - } - - if ((iter->flags & BTREE_ITER_intent) && - !(iter->flags & BTREE_ITER_is_extents) && - !iter->update_path) { - struct bpos pos = k.k->p; - - if (pos.snapshot < iter->snapshot) { - search_key = bpos_successor(k.k->p); - continue; - } - - pos.snapshot = iter->snapshot; - - /* - * advance, same as on exit for iter->path, but only up - * to snapshot - */ - __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); - iter->update_path = iter->path; - - iter->update_path = bch2_btree_path_set_pos(trans, - iter->update_path, pos, - iter->flags & BTREE_ITER_intent, - _THIS_IP_); - ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); - if (unlikely(ret)) { - k = bkey_s_c_err(ret); - goto out_no_locked; - } - } - - /* - * We can never have a key in a leaf node at POS_MAX, so - * we don't have to check these successor() calls: - */ - if (!bch2_snapshot_is_ancestor(trans->c, - iter->snapshot, - k.k->p.snapshot)) { - search_key = bpos_successor(k.k->p); - continue; - } - - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_key_cache_fill)) { - search_key = bkey_successor(iter, k.k->p); - continue; - } - } - - /* - * iter->pos should be monotonically increasing, and always be - * equal to the key we just returned - except extents can - * straddle iter->pos: - */ - if (!(iter->flags & BTREE_ITER_is_extents)) - iter_pos = k.k->p; - else - iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); - - if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) : - iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) : - bkey_gt(iter_pos, end))) - goto end; - - break; - } - - iter->pos = iter_pos; - - iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - - btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); -out_no_locked: - if (iter->update_path) { - ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_); - if (unlikely(ret)) - k = bkey_s_c_err(ret); - else - btree_path_set_should_be_locked(trans, trans->paths + iter->update_path); - } - - if (!(iter->flags & BTREE_ITER_all_snapshots)) - iter->pos.snapshot = iter->snapshot; - - ret = bch2_btree_iter_verify_ret(trans, iter, k); - if (unlikely(ret)) { - bch2_btree_iter_set_pos(trans, iter, iter->pos); - k = bkey_s_c_err(ret); - } - - bch2_btree_iter_verify_entry_exit(iter); - - if (trace_btree_iter_peek_max_enabled()) { - CLASS(printbuf, buf)(); - - int ret = bkey_err(k); - if (ret) - prt_str(&buf, bch2_err_str(ret)); - else if (k.k) - bch2_bkey_val_to_text(&buf, trans->c, k); - else - prt_str(&buf, "(null)"); - trace_btree_iter_peek_max(trans->c, buf.buf); - } - - return k; -end: - bch2_btree_iter_set_pos(trans, iter, end); - k = bkey_s_c_null; - goto out_no_locked; -} - -/** - * bch2_btree_iter_next() - returns first key greater than iterator's current - * position - * @trans: btree transaction object - * @iter: iterator to peek from - * - * Returns: key if found, or an error extractable with bkey_err(). 
- */ -struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter) -{ - if (!bch2_btree_iter_advance(trans, iter)) - return bkey_s_c_null; - - return bch2_btree_iter_peek(trans, iter); -} - -static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter, - struct bpos search_key) -{ - struct bkey_s_c k, k2; - - bch2_btree_iter_verify(trans, iter); - - while (1) { - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - - int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(trans, iter, iter->pos); - k = bkey_s_c_err(ret); - break; - } - - struct btree_path *path = btree_iter_path(trans, iter); - struct btree_path_level *l = path_l(path); - - if (unlikely(!l->b)) { - /* No btree nodes at requested level: */ - bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); - k = bkey_s_c_null; - break; - } - - btree_path_set_should_be_locked(trans, path); - - k = btree_path_level_peek_all(trans->c, l, &iter->k); - if (!k.k || bpos_gt(k.k->p, search_key)) { - k = btree_path_level_prev(trans, path, l, &iter->k); - - BUG_ON(k.k && bpos_gt(k.k->p, search_key)); - } - - if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && - k.k && - (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { - k = k2; - if (bkey_err(k2)) { - bch2_btree_iter_set_pos(trans, iter, iter->pos); - break; - } - } - - if (unlikely(iter->flags & BTREE_ITER_with_journal)) - btree_trans_peek_prev_journal(trans, iter, search_key, &k); - - if (unlikely((iter->flags & BTREE_ITER_with_updates) && - trans->nr_updates)) - bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k); - - if (likely(k.k && !bkey_deleted(k.k))) { - break; - } else if (k.k) { - search_key = bpos_predecessor(k.k->p); - } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { - /* Advance to previous leaf node: */ - search_key = bpos_predecessor(path->l[0].b->data->min_key); - } else { - /* Start of btree: */ - bch2_btree_iter_set_pos(trans, iter, POS_MIN); - k = bkey_s_c_null; - break; - } - } - - bch2_btree_iter_verify(trans, iter); - return k; -} - -/** - * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to - * iterator's current position - * @trans: btree transaction object - * @iter: iterator to peek from - * @end: search limit: returns keys greater than or equal to @end - * - * Returns: key if found, or an error extractable with bkey_err(). 
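
The reverse counterpart, sketched under the same assumptions (trans, iter, k, ret, btree and start in scope); this is roughly what the for_each_btree_key_reverse() macro in btree_iter.h expands to, minus the lockrestart_do() restart handling:

	bch2_trans_iter_init(trans, &iter, btree, start, 0);
	do {
		k = bch2_btree_iter_peek_prev(trans, &iter);
		ret = bkey_err(k);
		if (ret || !k.k)
			break;
		/* inspect k here */
	} while (bch2_btree_iter_rewind(trans, &iter));
	bch2_trans_iter_exit(trans, &iter);
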
- */ -struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end) -{ - if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && - !bkey_eq(iter->pos, POS_MAX) && - !((iter->flags & BTREE_ITER_is_extents) && - iter->pos.offset == U64_MAX)) { - - /* - * bkey_start_pos(), for extents, is not monotonically - * increasing until after filtering for snapshots: - * - * Thus, for extents we need to search forward until we find a - * real visible extent - easiest to just use peek_slot() (which - * internally uses peek() for extents) - */ - struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); - if (bkey_err(k)) - return k; - - if (!bkey_deleted(k.k) && - (!(iter->flags & BTREE_ITER_is_extents) || - bkey_lt(bkey_start_pos(k.k), iter->pos))) - return k; - } - - struct bpos search_key = iter->pos; - struct bkey_s_c k; - btree_path_idx_t saved_path = 0; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode); - - int ret = trans_maybe_inject_restart(trans, _RET_IP_); - if (unlikely(ret)) { - k = bkey_s_c_err(ret); - goto out_no_locked; - } - - while (1) { - k = __bch2_btree_iter_peek_prev(trans, iter, search_key); - if (unlikely(!k.k)) - goto end; - if (unlikely(bkey_err(k))) - goto out_no_locked; - - if (iter->flags & BTREE_ITER_filter_snapshots) { - struct btree_path *s = saved_path ? trans->paths + saved_path : NULL; - if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) { - /* - * If we have a saved candidate, and we're past - * the last possible snapshot overwrite, return - * it: - */ - bch2_path_put(trans, iter->path, - iter->flags & BTREE_ITER_intent); - iter->path = saved_path; - saved_path = 0; - k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); - break; - } - - /* - * We need to check against @end before FILTER_SNAPSHOTS because - * if we get to a different inode than requested we might be - * seeing keys for a different snapshot tree that will all be - * filtered out. - */ - if (unlikely(bkey_lt(k.k->p, end))) - goto end; - - if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) { - search_key = bpos_predecessor(k.k->p); - continue; - } - - if (k.k->p.snapshot != iter->snapshot) { - /* - * Have a key visible in iter->snapshot, but - * might have overwrites: save it and keep - * searching. Unless it's a whiteout - then drop - * our previous saved candidate: - */ - if (saved_path) { - bch2_path_put(trans, saved_path, - iter->flags & BTREE_ITER_intent); - saved_path = 0; - } - - if (!bkey_whiteout(k.k)) { - saved_path = btree_path_clone(trans, iter->path, - iter->flags & BTREE_ITER_intent, - _THIS_IP_); - trace_btree_path_save_pos(trans, - trans->paths + iter->path, - trans->paths + saved_path); - } - - search_key = bpos_predecessor(k.k->p); - continue; - } - - if (bkey_whiteout(k.k)) { - search_key = bkey_predecessor(iter, k.k->p); - search_key.snapshot = U32_MAX; - continue; - } - } - - EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) : - iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) : - bkey_gt(k.k->p, iter->pos)); - - if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) : - iter->flags & BTREE_ITER_is_extents ?
bkey_le(k.k->p, end) : - bkey_lt(k.k->p, end))) - goto end; - - break; - } - - /* Extents can straddle iter->pos: */ - iter->pos = bpos_min(iter->pos, k.k->p); - - if (iter->flags & BTREE_ITER_filter_snapshots) - iter->pos.snapshot = iter->snapshot; -out_no_locked: - if (saved_path) - bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent); - - bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(trans, iter); - - if (trace_btree_iter_peek_prev_min_enabled()) { - CLASS(printbuf, buf)(); - - int ret = bkey_err(k); - if (ret) - prt_str(&buf, bch2_err_str(ret)); - else if (k.k) - bch2_bkey_val_to_text(&buf, trans->c, k); - else - prt_str(&buf, "(null)"); - trace_btree_iter_peek_prev_min(trans->c, buf.buf); - } - return k; -end: - bch2_btree_iter_set_pos(trans, iter, end); - k = bkey_s_c_null; - goto out_no_locked; -} - -/** - * bch2_btree_iter_prev() - returns first key less than iterator's current - * position - * @trans: btree transaction object - * @iter: iterator to peek from - * - * Returns: key if found, or an error extractable with bkey_err(). - */ -struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter) -{ - if (!bch2_btree_iter_rewind(trans, iter)) - return bkey_s_c_null; - - return bch2_btree_iter_peek_prev(trans, iter); -} - -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter) -{ - struct bpos search_key; - struct bkey_s_c k; - int ret; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify(trans, iter); - bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); - - ret = trans_maybe_inject_restart(trans, _RET_IP_); - if (unlikely(ret)) { - k = bkey_s_c_err(ret); - goto out; - } - - /* extents can't span inode numbers: */ - if ((iter->flags & BTREE_ITER_is_extents) && - unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { - if (iter->pos.inode == KEY_INODE_MAX) { - k = bkey_s_c_null; - goto out2; - } - - bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); - } - - search_key = btree_iter_search_key(iter); - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - k = bkey_s_c_err(ret); - goto out; - } - - struct btree_path *path = btree_iter_path(trans, iter); - if (unlikely(!btree_path_node(path, path->level))) { - k = bkey_s_c_null; - goto out2; - } - - btree_path_set_should_be_locked(trans, path); - - if ((iter->flags & BTREE_ITER_cached) || - !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) { - k = bkey_s_c_null; - - if (unlikely((iter->flags & BTREE_ITER_with_updates) && - trans->nr_updates)) { - bch2_btree_trans_peek_slot_updates(trans, iter, &k); - if (k.k) - goto out; - } - - if (unlikely(iter->flags & BTREE_ITER_with_journal) && - (k = btree_trans_peek_slot_journal(trans, iter)).k) - goto out; - - if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && - (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) { - if (!bkey_err(k)) - iter->k = *k.k; - /* We're not returning a key from iter->path: */ - goto out; - } - - k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); - if (unlikely(!k.k)) - goto out; - - if (unlikely(k.k->type == KEY_TYPE_whiteout && - (iter->flags & BTREE_ITER_filter_snapshots) && - !(iter->flags & BTREE_ITER_key_cache_fill)))
- iter->k.type = KEY_TYPE_deleted; - } else { - struct bpos next; - struct bpos end = iter->pos; - - if (iter->flags & BTREE_ITER_is_extents) - end.offset = U64_MAX; - - EBUG_ON(btree_iter_path(trans, iter)->level); - - if (iter->flags & BTREE_ITER_intent) { - struct btree_iter iter2; - - bch2_trans_copy_iter(trans, &iter2, iter); - k = bch2_btree_iter_peek_max(trans, &iter2, end); - - if (k.k && !bkey_err(k)) { - swap(iter->key_cache_path, iter2.key_cache_path); - iter->k = iter2.k; - k.k = &iter->k; - } - bch2_trans_iter_exit(trans, &iter2); - } else { - struct bpos pos = iter->pos; - - k = bch2_btree_iter_peek_max(trans, iter, end); - if (unlikely(bkey_err(k))) - bch2_btree_iter_set_pos(trans, iter, pos); - else - iter->pos = pos; - } - - if (unlikely(bkey_err(k))) - goto out; - - next = k.k ? bkey_start_pos(k.k) : POS_MAX; - - if (bkey_lt(iter->pos, next)) { - bkey_init(&iter->k); - iter->k.p = iter->pos; - - if (iter->flags & BTREE_ITER_is_extents) { - bch2_key_resize(&iter->k, - min_t(u64, KEY_SIZE_MAX, - (next.inode == iter->pos.inode - ? next.offset - : KEY_OFFSET_MAX) - - iter->pos.offset)); - EBUG_ON(!iter->k.size); - } - - k = (struct bkey_s_c) { &iter->k, NULL }; - } - } -out: - bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(trans, iter); - ret = bch2_btree_iter_verify_ret(trans, iter, k); - if (unlikely(ret)) - k = bkey_s_c_err(ret); -out2: - if (trace_btree_iter_peek_slot_enabled()) { - CLASS(printbuf, buf)(); - - int ret = bkey_err(k); - if (ret) - prt_str(&buf, bch2_err_str(ret)); - else if (k.k) - bch2_bkey_val_to_text(&buf, trans->c, k); - else - prt_str(&buf, "(null)"); - trace_btree_iter_peek_slot(trans->c, buf.buf); - } - - return k; -} - -struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter) -{ - if (!bch2_btree_iter_advance(trans, iter)) - return bkey_s_c_null; - - return bch2_btree_iter_peek_slot(trans, iter); -} - -struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter) -{ - if (!bch2_btree_iter_rewind(trans, iter)) - return bkey_s_c_null; - - return bch2_btree_iter_peek_slot(trans, iter); -} - -/* Obsolete, but still used by rust wrapper in -tools */ -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter) -{ - struct bkey_s_c k; - - while (btree_trans_too_many_iters(trans) || - (k = bch2_btree_iter_peek_type(trans, iter, iter->flags), - bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(trans); - - return k; -} - -/* new transactional stuff: */ - -#ifdef CONFIG_BCACHEFS_DEBUG -static void btree_trans_verify_sorted_refs(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1); - - trans_for_each_path(trans, path, i) { - BUG_ON(path->sorted_idx >= trans->nr_sorted); - BUG_ON(trans->sorted[path->sorted_idx] != i); - } - - for (i = 0; i < trans->nr_sorted; i++) { - unsigned idx = trans->sorted[i]; - - BUG_ON(!test_bit(idx, trans->paths_allocated)); - BUG_ON(trans->paths[idx].sorted_idx != i); - } -} - -static void btree_trans_verify_sorted(struct btree_trans *trans) -{ - struct btree_path *path, *prev = NULL; - struct trans_for_each_path_inorder_iter iter; - - if (!static_branch_unlikely(&bch2_debug_check_iterators)) - return; - - trans_for_each_path_inorder(trans, path, iter) { - if (prev && btree_path_cmp(prev, path) > 0) { - __bch2_dump_trans_paths_updates(trans, true); - panic("trans 
paths out of order!\n"); - } - prev = path; - } -} -#else -static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} -static inline void btree_trans_verify_sorted(struct btree_trans *trans) {} -#endif - -void __bch2_btree_trans_sort_paths(struct btree_trans *trans) -{ - int i, l = 0, r = trans->nr_sorted, inc = 1; - bool swapped; - - btree_trans_verify_sorted_refs(trans); - - if (trans->paths_sorted) - goto out; - - /* - * Cocktail shaker sort: this is efficient because iterators will be - * mostly sorted. - */ - do { - swapped = false; - - for (i = inc > 0 ? l : r - 2; - i + 1 < r && i >= l; - i += inc) { - if (btree_path_cmp(trans->paths + trans->sorted[i], - trans->paths + trans->sorted[i + 1]) > 0) { - swap(trans->sorted[i], trans->sorted[i + 1]); - trans->paths[trans->sorted[i]].sorted_idx = i; - trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1; - swapped = true; - } - } - - if (inc > 0) - --r; - else - l++; - inc = -inc; - } while (swapped); - - trans->paths_sorted = true; -out: - btree_trans_verify_sorted(trans); -} - -static inline void btree_path_list_remove(struct btree_trans *trans, - struct btree_path *path) -{ - EBUG_ON(path->sorted_idx >= trans->nr_sorted); -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - trans->nr_sorted--; - memmove_u64s_down_small(trans->sorted + path->sorted_idx, - trans->sorted + path->sorted_idx + 1, - DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, - sizeof(u64) / sizeof(btree_path_idx_t))); -#else - array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); -#endif - for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) - trans->paths[trans->sorted[i]].sorted_idx = i; -} - -static inline void btree_path_list_add(struct btree_trans *trans, - btree_path_idx_t pos, - btree_path_idx_t path_idx) -{ - struct btree_path *path = trans->paths + path_idx; - - path->sorted_idx = pos ? 
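
__bch2_btree_trans_sort_paths() above relies on cocktail shaker sort doing almost no work on nearly-sorted input: each pass alternates direction and shrinks the unsorted window by one. A standalone sketch of the same loop structure on a plain int array:

static void cocktail_sort_sketch(int *a, int n)
{
	int i, l = 0, r = n, inc = 1;
	bool swapped;

	do {
		swapped = false;

		for (i = inc > 0 ? l : r - 2;
		     i + 1 < r && i >= l;
		     i += inc)
			if (a[i] > a[i + 1]) {
				int tmp = a[i];
				a[i] = a[i + 1];
				a[i + 1] = tmp;
				swapped = true;
			}

		/* the element bubbled to the end we just swept toward is final */
		if (inc > 0)
			--r;
		else
			l++;
		inc = -inc;
	} while (swapped);
}
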
trans->paths[pos].sorted_idx + 1 : trans->nr_sorted; - -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, - trans->sorted + path->sorted_idx, - DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, - sizeof(u64) / sizeof(btree_path_idx_t))); - trans->nr_sorted++; - trans->sorted[path->sorted_idx] = path_idx; -#else - array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx); -#endif - - for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) - trans->paths[trans->sorted[i]].sorted_idx = i; - - btree_trans_verify_sorted_refs(trans); -} - -void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) -{ - if (iter->update_path) - bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); - if (iter->path) - bch2_path_put(trans, iter->path, - iter->flags & BTREE_ITER_intent); - if (iter->key_cache_path) - bch2_path_put(trans, iter->key_cache_path, - iter->flags & BTREE_ITER_intent); - iter->path = 0; - iter->update_path = 0; - iter->key_cache_path = 0; -} - -void bch2_trans_iter_init_outlined(struct btree_trans *trans, - struct btree_iter *iter, - enum btree_id btree_id, struct bpos pos, - unsigned flags) -{ - bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, 0, flags), - _RET_IP_); -} - -void bch2_trans_node_iter_init(struct btree_trans *trans, - struct btree_iter *iter, - enum btree_id btree_id, - struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags) -{ - flags |= BTREE_ITER_not_extents; - flags |= BTREE_ITER_snapshot_field; - flags |= BTREE_ITER_all_snapshots; - - if (!depth && btree_id_cached(trans->c, btree_id)) - flags |= BTREE_ITER_with_key_cache; - - bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, - bch2_btree_iter_flags(trans, btree_id, depth, flags), - _RET_IP_); - - iter->min_depth = depth; - - struct btree_path *path = btree_iter_path(trans, iter); - BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); - BUG_ON(path->level != depth); - BUG_ON(iter->min_depth != depth); -} - -void bch2_trans_copy_iter(struct btree_trans *trans, - struct btree_iter *dst, struct btree_iter *src) -{ - *dst = *src; -#ifdef TRACK_PATH_ALLOCATED - dst->ip_allocated = _RET_IP_; -#endif - if (src->path) - __btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent); - if (src->update_path) - __btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent); - dst->key_cache_path = 0; -} - -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -void bch2_trans_kmalloc_trace_to_text(struct printbuf *out, - darray_trans_kmalloc_trace *trace) -{ - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 60); - - darray_for_each(*trace, i) - prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes); -} -#endif - -void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip) -{ - struct bch_fs *c = trans->c; - unsigned new_top = trans->mem_top + size; - unsigned old_bytes = trans->mem_bytes; - unsigned new_bytes = roundup_pow_of_two(new_top); - int ret; - void *new_mem; - void *p; - - if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n", - BTREE_TRANS_MEM_MAX); - - bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); - bch2_print_str(c, 
KERN_ERR, buf.buf); - printbuf_exit(&buf); -#endif - } - - ret = trans_maybe_inject_restart(trans, _RET_IP_); - if (ret) - return ERR_PTR(ret); - - struct btree_transaction_stats *s = btree_trans_stats(trans); - if (new_bytes > s->max_mem) { - mutex_lock(&s->lock); -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr); - s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size, - trans->trans_kmalloc_trace.nr); - - memcpy(s->trans_kmalloc_trace.data, - trans->trans_kmalloc_trace.data, - sizeof(s->trans_kmalloc_trace.data[0]) * - s->trans_kmalloc_trace.nr); -#endif - s->max_mem = new_bytes; - mutex_unlock(&s->lock); - } - - if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) { - EBUG_ON(trans->mem_bytes >= new_bytes); - return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); - } - - if (old_bytes) { - trans->realloc_bytes_required = new_bytes; - trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); - return ERR_PTR(btree_trans_restart_ip(trans, - BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); - } - - EBUG_ON(trans->mem); - - new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); - if (unlikely(!new_mem)) { - bch2_trans_unlock(trans); - - new_mem = kmalloc(new_bytes, GFP_KERNEL); - if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { - new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); - new_bytes = BTREE_TRANS_MEM_MAX; - trans->used_mempool = true; - } - - EBUG_ON(!new_mem); - - trans->mem = new_mem; - trans->mem_bytes = new_bytes; - - ret = bch2_trans_relock(trans); - if (ret) - return ERR_PTR(ret); - } - - trans->mem = new_mem; - trans->mem_bytes = new_bytes; - - p = trans->mem + trans->mem_top; - trans->mem_top += size; - memset(p, 0, size); - return p; -} - -static inline void check_srcu_held_too_long(struct btree_trans *trans) -{ - WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10), - "btree trans held srcu lock (delaying memory reclaim) for %lu seconds", - (jiffies - trans->srcu_lock_time) / HZ); -} - -void bch2_trans_srcu_unlock(struct btree_trans *trans) -{ - if (trans->srcu_held) { - struct bch_fs *c = trans->c; - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - if (path->cached && !btree_node_locked(path, 0)) - path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); - - check_srcu_held_too_long(trans); - srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); - trans->srcu_held = false; - } -} - -static void bch2_trans_srcu_lock(struct btree_trans *trans) -{ - if (!trans->srcu_held) { - trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier); - trans->srcu_lock_time = jiffies; - trans->srcu_held = true; - } -} - -/** - * bch2_trans_begin() - reset a transaction after an interrupted attempt - * @trans: transaction to reset - * - * Returns: current restart counter, to be used with trans_was_restarted() - * - * While iterating over nodes or updating nodes an attempt to lock a btree node - * may return BCH_ERR_transaction_restart when the trylock fails. When this - * occurs bch2_trans_begin() should be called and the transaction retried.
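
A sketch of the retry protocol described above, with some_btree_operation() standing in for arbitrary transactional work; the lockrestart_do() macro in btree_iter.h packages exactly this loop:

	int ret;

	do {
		bch2_trans_begin(trans);
		ret = some_btree_operation(trans);	/* hypothetical */
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
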
- */ -u32 bch2_trans_begin(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - u64 now; - - bch2_trans_reset_updates(trans); - - trans->restart_count++; - trans->mem_top = 0; - - if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) { - EBUG_ON(!trans->mem || !trans->mem_bytes); - unsigned new_bytes = trans->realloc_bytes_required; - void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); - if (unlikely(!new_mem)) { - bch2_trans_unlock(trans); - new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); - - EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX); - - if (!new_mem) { - new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); - new_bytes = BTREE_TRANS_MEM_MAX; - trans->used_mempool = true; - kfree(trans->mem); - } - } - trans->mem = new_mem; - trans->mem_bytes = new_bytes; - } - - trans_for_each_path(trans, path, i) { - path->should_be_locked = false; - - /* - * If the transaction wasn't restarted, we're presuming to be - * doing something new: don't keep iterators except the ones that - * are in use - except for the subvolumes btree: - */ - if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) - path->preserve = false; - - /* - * XXX: we probably shouldn't be doing this if the transaction - * was restarted, but currently we still overflow transaction - * iterators if we do that - */ - if (!path->ref && !path->preserve) - __bch2_path_free(trans, i); - else - path->preserve = false; - } - - now = local_clock(); - - if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) && - time_after64(now, trans->last_begin_time + 10)) - __bch2_time_stats_update(&btree_trans_stats(trans)->duration, - trans->last_begin_time, now); - - if (!trans->restarted && - (need_resched() || - time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) { - bch2_trans_unlock(trans); - cond_resched(); - now = local_clock(); - } - trans->last_begin_time = now; - - if (unlikely(trans->srcu_held && - time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) - bch2_trans_srcu_unlock(trans); - - trans->last_begin_ip = _RET_IP_; - -#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS - if (trans->restarted) { - trans->restart_count_this_trans++; - } else { - trans->restart_count_this_trans = 0; - } -#endif - -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - trans->trans_kmalloc_trace.nr = 0; -#endif - - trans_set_locked(trans, false); - - if (trans->restarted) { - bch2_btree_path_traverse_all(trans); - trans->notrace_relock_fail = false; - } - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - return trans->restart_count; -} - -const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" }; - -unsigned bch2_trans_get_fn_idx(const char *fn) -{ - for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) - if (!bch2_btree_transaction_fns[i] || - bch2_btree_transaction_fns[i] == fn) { - bch2_btree_transaction_fns[i] = fn; - return i; - } - - pr_warn_once("BCH_TRANSACTIONS_NR not big enough!"); - return 0; -} - -struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) - __acquires(&c->btree_trans_barrier) -{ - struct btree_trans *trans; - - if (IS_ENABLED(__KERNEL__)) { - trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); - if (trans) { - memset(trans, 0, offsetof(struct btree_trans, list)); - goto got_trans; - } - } - - trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); - memset(trans, 0, sizeof(*trans)); - - seqmutex_lock(&c->btree_trans_lock); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- struct btree_trans *pos; - pid_t pid = current->pid; - - trans->locking_wait.task = current; - - list_for_each_entry(pos, &c->btree_trans_list, list) { - struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task); - /* - * We'd much prefer to be stricter here and completely - * disallow multiple btree_trans in the same thread - - * but the data move path calls bch2_write when we - * already have a btree_trans initialized. - */ - BUG_ON(pos_task && - pid == pos_task->pid && - pos->locked); - } - } - - list_add(&trans->list, &c->btree_trans_list); - seqmutex_unlock(&c->btree_trans_lock); -got_trans: - trans->c = c; - trans->last_begin_time = local_clock(); - trans->fn_idx = fn_idx; - trans->locking_wait.task = current; - trans->journal_replay_not_finished = - unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) && - atomic_inc_not_zero(&c->journal_keys.ref); - trans->nr_paths = ARRAY_SIZE(trans->_paths); - trans->paths_allocated = trans->_paths_allocated; - trans->sorted = trans->_sorted; - trans->paths = trans->_paths; - trans->updates = trans->_updates; - - *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL; - - trans->paths_allocated[0] = 1; - - static struct lock_class_key lockdep_key; - lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0); - - if (fn_idx < BCH_TRANSACTIONS_NR) { - trans->fn = bch2_btree_transaction_fns[fn_idx]; - - struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx]; - - if (s->max_mem) { - unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); - - trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); - if (likely(trans->mem)) - trans->mem_bytes = expected_mem_bytes; - } - - trans->nr_paths_max = s->nr_max_paths; - } - - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - trans->srcu_lock_time = jiffies; - trans->srcu_held = true; - trans_set_locked(trans, false); - - closure_init_stack_release(&trans->ref); - return trans; -} - -#ifdef CONFIG_BCACHEFS_DEBUG - -static bool btree_paths_leaked(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - if (path->ref) - return true; - return false; -} - -static void check_btree_paths_leaked(struct btree_trans *trans) -{ - if (btree_paths_leaked(trans)) { - struct bch_fs *c = trans->c; - struct btree_path *path; - unsigned i; - - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn); - trans_for_each_path(trans, path, i) - if (path->ref) - prt_printf(&buf, "btree %s %pS\n", - bch2_btree_id_str(path->btree_id), - (void *) path->ip_allocated); - - bch2_fs_emergency_read_only2(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } -} -#else -static inline void check_btree_paths_leaked(struct btree_trans *trans) {} -#endif - -void bch2_trans_put(struct btree_trans *trans) - __releases(&c->btree_trans_barrier) -{ - struct bch_fs *c = trans->c; - - if (trans->restarted) - bch2_trans_in_restart_error(trans); - - bch2_trans_unlock(trans); - - trans_for_each_update(trans, i) - __btree_path_put(trans, trans->paths + i->path, true); - trans->nr_updates = 0; - - check_btree_paths_leaked(trans); - - if (trans->srcu_held) { - check_srcu_held_too_long(trans); - srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); - } - - if (unlikely(trans->journal_replay_not_finished)) - bch2_journal_keys_put(c); - - /* - * trans->ref protects trans->locking_wait.task, btree_paths array; used - * by cycle detector - */ - 
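
The expected pairing around __bch2_trans_get()/bch2_trans_put(), sketched with the bch2_trans_get() wrapper (as used in bch2_fs_btree_iter_init() below) and a hypothetical some_operation(); the leak check above fires if any path is still referenced at put time:

	struct btree_trans *trans = bch2_trans_get(c);
	int ret = lockrestart_do(trans, some_operation(trans));
	bch2_trans_put(trans);
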
closure_return_sync(&trans->ref); - trans->locking_wait.task = NULL; - -#ifdef CONFIG_BCACHEFS_DEBUG - darray_exit(&trans->last_restarted_trace); -#endif -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - darray_exit(&trans->trans_kmalloc_trace); -#endif - - unsigned long *paths_allocated = trans->paths_allocated; - trans->paths_allocated = NULL; - trans->paths = NULL; - - if (paths_allocated != trans->_paths_allocated) - kvfree_rcu_mightsleep(paths_allocated); - - if (trans->used_mempool) - mempool_free(trans->mem, &c->btree_trans_mem_pool); - else - kfree(trans->mem); - - /* Userspace doesn't have a real percpu implementation: */ - if (IS_ENABLED(__KERNEL__)) - trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); - - if (trans) { - seqmutex_lock(&c->btree_trans_lock); - list_del(&trans->list); - seqmutex_unlock(&c->btree_trans_lock); - - mempool_free(trans, &c->btree_trans_pool); - } -} - -bool bch2_current_has_btree_trans(struct bch_fs *c) -{ - seqmutex_lock(&c->btree_trans_lock); - struct btree_trans *trans; - bool ret = false; - list_for_each_entry(trans, &c->btree_trans_list, list) - if (trans->locking_wait.task == current && - trans->locked) { - ret = true; - break; - } - seqmutex_unlock(&c->btree_trans_lock); - return ret; -} - -static void __maybe_unused -bch2_btree_bkey_cached_common_to_text(struct printbuf *out, - struct btree_bkey_cached_common *b) -{ - struct six_lock_count c = six_lock_counts(&b->lock); - pid_t pid; - - scoped_guard(rcu) { - struct task_struct *owner = READ_ONCE(b->lock.owner); - pid = owner ? owner->pid : 0; - } - - prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b'); - bch2_btree_id_to_text(out, b->btree_id); - prt_printf(out, " l=%u:", b->level); - bch2_bpos_to_text(out, btree_node_pos(b)); - - prt_printf(out, "\t locks %u:%u:%u held by pid %u", - c.n[0], c.n[1], c.n[2], pid); -} - -void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) -{ - struct btree_bkey_cached_common *b; - static char lock_types[] = { 'r', 'i', 'w' }; - struct task_struct *task = READ_ONCE(trans->locking_wait.task); - unsigned l, idx; - - /* before rcu_read_lock(): */ - bch2_printbuf_make_room(out, 4096); - - if (!out->nr_tabstops) { - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 32); - } - - prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn); - - /* trans->paths is rcu protected vs. freeing */ - guard(rcu)(); - out->atomic++; - - struct btree_path *paths = rcu_dereference(trans->paths); - if (!paths) - goto out; - - unsigned long *paths_allocated = trans_paths_allocated(paths); - - trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) { - struct btree_path *path = paths + idx; - if (!path->nodes_locked) - continue; - - prt_printf(out, " path %u %c ", - idx, - path->cached ? 
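
Two printbuf styles appear in this file: CLASS(printbuf, buf)() for scope-based cleanup in the trace blocks, and the manual form, where printbuf_exit() must be called explicitly. A sketch of the manual idiom, assuming c and trans in scope:

	struct printbuf buf = PRINTBUF;

	prt_printf(&buf, "srcu held for %lu seconds\n",
		   (jiffies - trans->srcu_lock_time) / HZ);
	bch2_print_str(c, KERN_ERR, buf.buf);
	printbuf_exit(&buf);
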
'c' : 'b'); - bch2_btree_id_to_text(out, path->btree_id); - prt_printf(out, " l=%u:", path->level); - bch2_bpos_to_text(out, path->pos); - prt_newline(out); - - for (l = 0; l < BTREE_MAX_DEPTH; l++) { - if (btree_node_locked(path, l) && - !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { - prt_printf(out, " %c l=%u ", - lock_types[btree_node_locked_type(path, l)], l); - bch2_btree_bkey_cached_common_to_text(out, b); - prt_newline(out); - } - } - } - - b = READ_ONCE(trans->locking); - if (b) { - prt_printf(out, " blocked for %lluus on\n", - div_u64(local_clock() - trans->locking_wait.start_time, 1000)); - prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); - bch2_btree_bkey_cached_common_to_text(out, b); - prt_newline(out); - } -out: - --out->atomic; -} - -void bch2_fs_btree_iter_exit(struct bch_fs *c) -{ - struct btree_transaction_stats *s; - struct btree_trans *trans; - int cpu; - - if (c->btree_trans_bufs) - for_each_possible_cpu(cpu) { - struct btree_trans *trans = - per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; - - if (trans) { - seqmutex_lock(&c->btree_trans_lock); - list_del(&trans->list); - seqmutex_unlock(&c->btree_trans_lock); - } - kfree(trans); - } - free_percpu(c->btree_trans_bufs); - - trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); - if (trans) - panic("%s leaked btree_trans\n", trans->fn); - - for (s = c->btree_transaction_stats; - s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); - s++) { -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - darray_exit(&s->trans_kmalloc_trace); -#endif - kfree(s->max_paths_text); - bch2_time_stats_exit(&s->lock_hold_times); - } - - if (c->btree_trans_barrier_initialized) { - synchronize_srcu_expedited(&c->btree_trans_barrier); - cleanup_srcu_struct(&c->btree_trans_barrier); - } - mempool_exit(&c->btree_trans_mem_pool); - mempool_exit(&c->btree_trans_pool); -} - -void bch2_fs_btree_iter_init_early(struct bch_fs *c) -{ - struct btree_transaction_stats *s; - - for (s = c->btree_transaction_stats; - s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); - s++) { - bch2_time_stats_init(&s->duration); - bch2_time_stats_init(&s->lock_hold_times); - mutex_init(&s->lock); - } - - INIT_LIST_HEAD(&c->btree_trans_list); - seqmutex_init(&c->btree_trans_lock); -} - -int bch2_fs_btree_iter_init(struct bch_fs *c) -{ - int ret; - - c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf); - if (!c->btree_trans_bufs) - return -ENOMEM; - - ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1, - sizeof(struct btree_trans)) ?: - mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, - BTREE_TRANS_MEM_MAX) ?: - init_srcu_struct(&c->btree_trans_barrier); - if (ret) - return ret; - - /* - * static annotation (hackily done) for lock ordering of reclaim vs. 
- * btree node locks: - */ -#ifdef CONFIG_LOCKDEP - fs_reclaim_acquire(GFP_KERNEL); - struct btree_trans *trans = bch2_trans_get(c); - trans_set_locked(trans, false); - bch2_trans_put(trans); - fs_reclaim_release(GFP_KERNEL); -#endif - - c->btree_trans_barrier_initialized = true; - return 0; - -} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h deleted file mode 100644 index 09dd3e52622e..000000000000 --- a/fs/bcachefs/btree_iter.h +++ /dev/null @@ -1,1010 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_ITER_H -#define _BCACHEFS_BTREE_ITER_H - -#include "bset.h" -#include "btree_types.h" -#include "trace.h" - -void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); -void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); -void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); -void bch2_dump_trans_paths_updates(struct btree_trans *); - -static inline int __bkey_err(const struct bkey *k) -{ - return PTR_ERR_OR_ZERO(k); -} - -#define bkey_err(_k) __bkey_err((_k).k) - -static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent) -{ - unsigned idx = path - trans->paths; - - EBUG_ON(idx >= trans->nr_paths); - EBUG_ON(!test_bit(idx, trans->paths_allocated)); - if (unlikely(path->ref == U8_MAX)) { - bch2_dump_trans_paths_updates(trans); - panic("path %u refcount overflow\n", idx); - } - - path->ref++; - path->intent_ref += intent; - trace_btree_path_get_ll(trans, path); -} - -static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) -{ - EBUG_ON(path - trans->paths >= trans->nr_paths); - EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated)); - EBUG_ON(!path->ref); - EBUG_ON(!path->intent_ref && intent); - - trace_btree_path_put_ll(trans, path); - path->intent_ref -= intent; - return --path->ref == 0; -} - -static inline void btree_path_set_dirty(struct btree_trans *trans, - struct btree_path *path, - enum btree_path_uptodate u) -{ - BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); - path->uptodate = max_t(unsigned, path->uptodate, u); -} - -static inline struct btree *btree_path_node(struct btree_path *path, - unsigned level) -{ - return level < BTREE_MAX_DEPTH ? 
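
A sketch of the error convention behind bkey_err() above: the .k member of struct bkey_s_c is either a valid key, NULL (no key), or an ERR_PTR, so callers decode it in two steps (trans and iter assumed in scope):

	struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, &iter);
	int ret = bkey_err(k);	/* PTR_ERR_OR_ZERO(k.k) */

	if (ret)
		return ret;	/* error, possibly a transaction restart */
	if (!k.k)
		return 0;	/* no key */
	/* otherwise k.k/k.v are valid until the iterator moves */
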
path->l[level].b : NULL; -} - -static inline bool btree_node_lock_seq_matches(const struct btree_path *path, - const struct btree *b, unsigned level) -{ - return path->l[level].lock_seq == six_lock_seq(&b->c.lock); -} - -static inline struct btree *btree_node_parent(struct btree_path *path, - struct btree *b) -{ - return btree_path_node(path, b->c.level + 1); -} - -/* Iterate over paths within a transaction: */ - -void __bch2_btree_trans_sort_paths(struct btree_trans *); - -static inline void btree_trans_sort_paths(struct btree_trans *trans) -{ - if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - trans->paths_sorted) - return; - __bch2_btree_trans_sort_paths(trans); -} - -static inline unsigned long *trans_paths_nr(struct btree_path *paths) -{ - return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths; -} - -static inline unsigned long *trans_paths_allocated(struct btree_path *paths) -{ - unsigned long *v = trans_paths_nr(paths); - return v - BITS_TO_LONGS(*v); -} - -#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\ - for (_idx = _start; \ - (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \ - _idx++) - -static inline struct btree_path * -__trans_next_path(struct btree_trans *trans, unsigned *idx) -{ - unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG; - /* - * Open coded find_next_bit(), because - * - this is fast path, we can't afford the function call - * - and we know that nr_paths is a multiple of BITS_PER_LONG, - */ - while (*idx < trans->nr_paths) { - unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1)); - if (v) { - *idx += __ffs(v); - return trans->paths + *idx; - } - - *idx += BITS_PER_LONG; - *idx &= ~(BITS_PER_LONG - 1); - w++; - } - - return NULL; -} - -/* - * This version is intended to be safe for use on a btree_trans that is owned by - * another thread, for bch2_btree_trans_to_text(); - */ -#define trans_for_each_path_from(_trans, _path, _idx, _start) \ - for (_idx = _start; \ - (_path = __trans_next_path((_trans), &_idx)); \ - _idx++) - -#define trans_for_each_path(_trans, _path, _idx) \ - trans_for_each_path_from(_trans, _path, _idx, 1) - -static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) -{ - unsigned idx = path ? path->sorted_idx + 1 : 0; - - EBUG_ON(idx > trans->nr_sorted); - - return idx < trans->nr_sorted - ? trans->paths + trans->sorted[idx] - : NULL; -} - -static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path) -{ - unsigned idx = path ? path->sorted_idx : trans->nr_sorted; - - return idx - ? 
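
What the open-coded loop in __trans_next_path() above buys: it is semantically equivalent to this generic version, minus the find_next_bit() call overhead on the fast path:

static inline struct btree_path *
trans_next_path_generic_sketch(struct btree_trans *trans, unsigned *idx)
{
	*idx = find_next_bit(trans->paths_allocated, trans->nr_paths, *idx);

	return *idx < trans->nr_paths ? trans->paths + *idx : NULL;
}
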
trans->paths + trans->sorted[idx - 1] - : NULL; -} - -#define trans_for_each_path_idx_inorder(_trans, _iter) \ - for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \ - (_iter.path_idx = trans->sorted[_iter.sorted_idx], \ - _iter.sorted_idx < (_trans)->nr_sorted); \ - _iter.sorted_idx++) - -struct trans_for_each_path_inorder_iter { - btree_path_idx_t sorted_idx; - btree_path_idx_t path_idx; -}; - -#define trans_for_each_path_inorder(_trans, _path, _iter) \ - for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \ - (_iter.path_idx = trans->sorted[_iter.sorted_idx], \ - _path = (_trans)->paths + _iter.path_idx, \ - _iter.sorted_idx < (_trans)->nr_sorted); \ - _iter.sorted_idx++) - -#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \ - for (_i = trans->nr_sorted - 1; \ - ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\ - --_i) - -static inline bool __path_has_node(const struct btree_path *path, - const struct btree *b) -{ - return path->l[b->c.level].b == b && - btree_node_lock_seq_matches(path, b, b->c.level); -} - -static inline struct btree_path * -__trans_next_path_with_node(struct btree_trans *trans, struct btree *b, - unsigned *idx) -{ - struct btree_path *path; - - while ((path = __trans_next_path(trans, idx)) && - !__path_has_node(path, b)) - (*idx)++; - - return path; -} - -#define trans_for_each_path_with_node(_trans, _b, _path, _iter) \ - for (_iter = 1; \ - (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\ - _iter++) - -btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t, - bool, unsigned long); - -static inline btree_path_idx_t __must_check -bch2_btree_path_make_mut(struct btree_trans *trans, - btree_path_idx_t path, bool intent, - unsigned long ip) -{ - if (trans->paths[path].ref > 1 || - trans->paths[path].preserve) - path = __bch2_btree_path_make_mut(trans, path, intent, ip); - trans->paths[path].should_be_locked = false; - return path; -} - -btree_path_idx_t __must_check -__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t, - struct bpos, bool, unsigned long); - -static inline btree_path_idx_t __must_check -bch2_btree_path_set_pos(struct btree_trans *trans, - btree_path_idx_t path, struct bpos new_pos, - bool intent, unsigned long ip) -{ - return !bpos_eq(new_pos, trans->paths[path].pos) - ? 
__bch2_btree_path_set_pos(trans, path, new_pos, intent, ip) - : path; -} - -int __must_check bch2_btree_path_traverse_one(struct btree_trans *, - btree_path_idx_t, - unsigned, unsigned long); - -static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *); - -static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, - btree_path_idx_t path, unsigned flags) -{ - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) - return 0; - - return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_); -} - -btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, - unsigned, unsigned, unsigned, unsigned long); -btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id, - unsigned, struct bpos); - -struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); - -/* - * bch2_btree_path_peek_slot() for a cached iterator might return a key in a - * different snapshot: - */ -static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) -{ - struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); - - if (k.k && bpos_eq(path->pos, k.k->p)) - return k; - - bkey_init(u); - u->p = path->pos; - return (struct bkey_s_c) { u, NULL }; -} - -struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, - struct btree_iter *, struct bpos); - -void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *); - -int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *); - -static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) -{ - return mutex_trylock(lock) - ? 0 - : __bch2_trans_mutex_lock(trans, lock); -} - -/* Debug: */ - -void __bch2_trans_verify_paths(struct btree_trans *); -void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); - -static inline void bch2_trans_verify_paths(struct btree_trans *trans) -{ - if (static_branch_unlikely(&bch2_debug_check_iterators)) - __bch2_trans_verify_paths(trans); -} - -static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree, - struct bpos pos) -{ - if (static_branch_unlikely(&bch2_debug_check_iterators)) - __bch2_assert_pos_locked(trans, btree, pos); -} - -void bch2_btree_path_fix_key_modified(struct btree_trans *trans, - struct btree *, struct bkey_packed *); -void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, - struct btree *, struct btree_node_iter *, - struct bkey_packed *, unsigned, unsigned); - -int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); - -void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool); - -int bch2_trans_relock(struct btree_trans *); -int bch2_trans_relock_notrace(struct btree_trans *); -void bch2_trans_unlock(struct btree_trans *); -void bch2_trans_unlock_long(struct btree_trans *); - -static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) -{ - return restart_count != trans->restart_count - ? 
-BCH_ERR_transaction_restart_nested - : 0; -} - -void __noreturn bch2_trans_restart_error(struct btree_trans *, u32); - -static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, - u32 restart_count) -{ - if (trans_was_restarted(trans, restart_count)) - bch2_trans_restart_error(trans, restart_count); -} - -void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *); - -static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans) -{ - if (trans->restarted || !trans->locked) - bch2_trans_unlocked_or_in_restart_error(trans); -} - -__always_inline -static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip) -{ - BUG_ON(err <= 0); - BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); - - trans->restarted = err; - trans->last_restarted_ip = ip; - return -err; -} - -__always_inline -static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) -{ - btree_trans_restart_foreign_task(trans, err, ip); -#ifdef CONFIG_BCACHEFS_DEBUG - darray_exit(&trans->last_restarted_trace); - bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); -#endif - return -err; -} - -__always_inline -static int btree_trans_restart(struct btree_trans *trans, int err) -{ - return btree_trans_restart_ip(trans, err, _THIS_IP_); -} - -static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip) -{ -#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS - if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) { - trace_and_count(trans->c, trans_restart_injected, trans, ip); - return btree_trans_restart_ip(trans, - BCH_ERR_transaction_restart_fault_inject, ip); - } -#endif - return 0; -} - -bool bch2_btree_node_upgrade(struct btree_trans *, - struct btree_path *, unsigned); - -void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned); - -static inline void bch2_btree_path_downgrade(struct btree_trans *trans, - struct btree_path *path) -{ - unsigned new_locks_want = path->level + !!path->intent_ref; - - if (path->locks_want > new_locks_want) - __bch2_btree_path_downgrade(trans, path, new_locks_want); -} - -void bch2_trans_downgrade(struct btree_trans *); - -void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *); -void bch2_trans_node_drop(struct btree_trans *trans, struct btree *); -void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); - -int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); -int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); - -struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *); -struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *); -struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *); - -struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos); -struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *); - -static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans, - struct btree_iter *iter) -{ - return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX); -} - -struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos); - -static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter) -{ - 
return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN); -} - -struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *); - -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *); -struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *); -struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *); - -bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *); -bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *); - -static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) -{ - iter->k.type = KEY_TYPE_deleted; - iter->k.p.inode = iter->pos.inode = new_pos.inode; - iter->k.p.offset = iter->pos.offset = new_pos.offset; - iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; - iter->k.size = 0; -} - -static inline void bch2_btree_iter_set_pos(struct btree_trans *trans, - struct btree_iter *iter, struct bpos new_pos) -{ - if (unlikely(iter->update_path)) - bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); - iter->update_path = 0; - - if (!(iter->flags & BTREE_ITER_all_snapshots)) - new_pos.snapshot = iter->snapshot; - - __bch2_btree_iter_set_pos(iter, new_pos); -} - -static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) -{ - BUG_ON(!(iter->flags & BTREE_ITER_is_extents)); - iter->pos = bkey_start_pos(&iter->k); -} - -static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans, - struct btree_iter *iter, u32 snapshot) -{ - struct bpos pos = iter->pos; - - iter->snapshot = snapshot; - pos.snapshot = snapshot; - bch2_btree_iter_set_pos(trans, iter, pos); -} - -void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); - -static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, - unsigned btree_id, - unsigned level, - unsigned flags) -{ - if (level || !btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_cached; - flags &= ~BTREE_ITER_with_key_cache; - } else if (!(flags & BTREE_ITER_cached)) - flags |= BTREE_ITER_with_key_cache; - - if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) && - btree_id_is_extents(btree_id)) - flags |= BTREE_ITER_is_extents; - - if (!(flags & BTREE_ITER_snapshot_field) && - !btree_type_has_snapshot_field(btree_id)) - flags &= ~BTREE_ITER_all_snapshots; - - if (!(flags & BTREE_ITER_all_snapshots) && - btree_type_has_snapshots(btree_id)) - flags |= BTREE_ITER_filter_snapshots; - - if (trans->journal_replay_not_finished) - flags |= BTREE_ITER_with_journal; - - return flags; -} - -static inline void bch2_trans_iter_init_common(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags, - unsigned long ip) -{ - iter->update_path = 0; - iter->key_cache_path = 0; - iter->btree_id = btree_id; - iter->min_depth = 0; - iter->flags = flags; - iter->snapshot = pos.snapshot; - iter->pos = pos; - iter->k = POS_KEY(pos); - iter->journal_idx = 0; -#ifdef CONFIG_BCACHEFS_DEBUG - iter->ip_allocated = ip; -#endif - iter->path = bch2_path_get(trans, btree_id, iter->pos, - locks_want, depth, flags, ip); -} - -void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, - enum btree_id, struct bpos, unsigned); - -static inline void bch2_trans_iter_init(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned flags) -{ - if 
(__builtin_constant_p(btree_id) && - __builtin_constant_p(flags)) - bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, 0, flags), - _THIS_IP_); - else - bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); -} - -void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, - enum btree_id, struct bpos, - unsigned, unsigned, unsigned); -void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *); - -void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *); - -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -void bch2_trans_kmalloc_trace_to_text(struct printbuf *, - darray_trans_kmalloc_trace *); -#endif - -void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long); - -static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size, - unsigned long ip) -{ -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - darray_push(&trans->trans_kmalloc_trace, - ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size })); -#endif -} - -static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size, - unsigned long ip) -{ - size = roundup(size, 8); - - bch2_trans_kmalloc_trace(trans, size, ip); - - if (likely(trans->mem_top + size <= trans->mem_bytes)) { - void *p = trans->mem + trans->mem_top; - - trans->mem_top += size; - return p; - } else { - return __bch2_trans_kmalloc(trans, size, ip); - } -} - -static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size, - unsigned long ip) -{ - size = roundup(size, 8); - - bch2_trans_kmalloc_trace(trans, size, ip); - - if (likely(trans->mem_top + size <= trans->mem_bytes)) { - void *p = trans->mem + trans->mem_top; - - trans->mem_top += size; - memset(p, 0, size); - return p; - } else { - return __bch2_trans_kmalloc(trans, size, ip); - } -} - -/** - * bch2_trans_kmalloc - allocate memory for use by the current transaction - * - * Must be called after bch2_trans_begin, which on second and further calls - * frees all memory allocated in this transaction - */ -static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) -{ - return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_); -} - -static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) -{ - return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_); -} - -static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type) -{ - struct bkey_s_c k; - - bch2_trans_iter_init(trans, iter, btree_id, pos, flags); - k = bch2_btree_iter_peek_slot(trans, iter); - - if (!bkey_err(k) && type && k.k->type != type) - k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch); - if (unlikely(bkey_err(k))) - bch2_trans_iter_exit(trans, iter); - return k; -} - -static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned flags) -{ - return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0); -} - -#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ - bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ - _btree_id, _pos, _flags, KEY_TYPE_##_type)) - -static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k) -{ - unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k)); - 
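
A sketch of bch2_trans_kmalloc() in use, per the doc comment above: the memory is bump-allocated out of trans->mem and implicitly reclaimed by the next bch2_trans_begin(), so there is no matching free; pos is a hypothetical bpos, and the returned error may be the mem_realloced restart, which must be bubbled up:

	struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update));
	int ret = PTR_ERR_OR_ZERO(update);

	if (ret)
		return ret;

	bkey_init(&update->k);
	update->k.p = pos;
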
memcpy(dst_v, src_k.v, b); - if (unlikely(b < dst_size)) - memset(dst_v + b, 0, dst_size - b); -} - -#define bkey_val_copy(_dst_v, _src_k) \ -do { \ - BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \ - __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \ -} while (0) - -static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, - unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type, - unsigned val_size, void *val) -{ - struct btree_iter iter; - struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); - int ret = bkey_err(k); - if (!ret) { - __bkey_val_copy(val, val_size, k); - bch2_trans_iter_exit(trans, &iter); - } - - return ret; -} - -#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\ - __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \ - KEY_TYPE_##_type, sizeof(*_val), _val) - -void bch2_trans_srcu_unlock(struct btree_trans *); - -u32 bch2_trans_begin(struct btree_trans *); - -#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _locks_want, _depth, _flags, _b, _do) \ -({ \ - bch2_trans_begin((_trans)); \ - \ - struct btree_iter _iter; \ - bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \ - _start, _locks_want, _depth, _flags); \ - int _ret3 = 0; \ - do { \ - _ret3 = lockrestart_do((_trans), ({ \ - struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\ - if (!_b) \ - break; \ - \ - PTR_ERR_OR_ZERO(_b) ?: (_do); \ - })) ?: \ - lockrestart_do((_trans), \ - PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\ - } while (!_ret3); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) - -#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _flags, _b, _do) \ - __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - 0, 0, _flags, _b, _do) - -static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans, - struct btree_iter *iter, - unsigned flags) -{ - return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) : - bch2_btree_iter_peek_prev(trans, iter); -} - -static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans, - struct btree_iter *iter, - unsigned flags) -{ - return flags & BTREE_ITER_slots ? 
bch2_btree_iter_peek_slot(trans, iter) : - bch2_btree_iter_peek(trans, iter); -} - -static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end, - unsigned flags) -{ - if (!(flags & BTREE_ITER_slots)) - return bch2_btree_iter_peek_max(trans, iter, end); - - if (bkey_gt(iter->pos, end)) - return bkey_s_c_null; - - return bch2_btree_iter_peek_slot(trans, iter); -} - -int __bch2_btree_trans_too_many_iters(struct btree_trans *); - -static inline int btree_trans_too_many_iters(struct btree_trans *trans) -{ - if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8) - return __bch2_btree_trans_too_many_iters(trans); - - return 0; -} - -/* - * goto instead of loop, so that when used inside for_each_btree_key2() - * break/continue work correctly - */ -#define lockrestart_do(_trans, _do) \ -({ \ - __label__ transaction_restart; \ - u32 _restart_count; \ - int _ret2; \ -transaction_restart: \ - _restart_count = bch2_trans_begin(_trans); \ - _ret2 = (_do); \ - \ - if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \ - goto transaction_restart; \ - \ - if (!_ret2) \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - _ret2; \ -}) - -/* - * nested_lockrestart_do(), nested_commit_do(): - * - * These are like lockrestart_do() and commit_do(), with two differences: - * - * - We don't call bch2_trans_begin() unless we had a transaction restart - * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a - * transaction restart - */ -#define nested_lockrestart_do(_trans, _do) \ -({ \ - u32 _restart_count, _orig_restart_count; \ - int _ret2; \ - \ - _restart_count = _orig_restart_count = (_trans)->restart_count; \ - \ - while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\ - _restart_count = bch2_trans_begin(_trans); \ - \ - if (!_ret2) \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - \ - _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ -}) - -#define for_each_btree_key_max_continue(_trans, _iter, \ - _end, _flags, _k, _do) \ -({ \ - struct bkey_s_c _k; \ - int _ret3 = 0; \ - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \ - _end, (_flags)); \ - if (!(_k).k) \ - break; \ - \ - bkey_err(_k) ?: (_do); \ - })); \ - } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) - -#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ - for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) - -#define for_each_btree_key_max(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _do) \ -({ \ - bch2_trans_begin(trans); \ - \ - struct btree_iter _iter; \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\ -}) - -#define for_each_btree_key(_trans, _iter, _btree_id, \ - _start, _flags, _k, _do) \ - for_each_btree_key_max(_trans, _iter, _btree_id, _start, \ - SPOS_MAX, _flags, _k, _do) - -#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ - _start, _flags, _k, _do) \ -({ \ - struct btree_iter _iter; \ - struct bkey_s_c _k; \ - int _ret3 = 0; \ - \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \ - (_flags)); \ - if 
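/*
 * The lockrestart_do() macro defined above is an optimistic-retry loop:
 * run the body, and if it failed only because the transaction had to be
 * restarted (lock cycle, node split, ...), begin the transaction again and
 * re-run from scratch. A standalone sketch of that control flow, with
 * hypothetical names standing in for the bcachefs primitives:
 */
#define ERR_RESTART	(-1000)	/* stand-in for BCH_ERR_transaction_restart */

struct txn {
	unsigned restart_count;
};

static unsigned txn_begin(struct txn *t)
{
	/* a real implementation also frees per-transaction memory here */
	return ++t->restart_count;
}

static int retry_on_restart(struct txn *t, int (*body)(struct txn *))
{
	int ret;

	do {
		txn_begin(t);		/* drop state from the failed attempt */
		ret = body(t);
	} while (ret == ERR_RESTART);	/* any other error is returned as-is */

	return ret;
}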
(!(_k).k) \ - break; \ - \ - bkey_err(_k) ?: (_do); \ - })); \ - } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) - -#define for_each_btree_key_commit(_trans, _iter, _btree_id, \ - _start, _iter_flags, _k, \ - _disk_res, _journal_seq, _commit_flags,\ - _do) \ - for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ - (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_commit_flags))) - -#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \ - _start, _iter_flags, _k, \ - _disk_res, _journal_seq, _commit_flags,\ - _do) \ - for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ - (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_commit_flags))) - -#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \ - _start, _end, _iter_flags, _k, \ - _disk_res, _journal_seq, _commit_flags,\ - _do) \ - for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ - (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_commit_flags))) - -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, - struct btree_iter *); - -#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(_trans, &(_iter))) - -#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\ - for (; \ - (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(_trans, &(_iter))) - -#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ - SPOS_MAX, _flags, _k, _ret) - -#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_rewind(_trans, &(_iter))) - -#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \ - for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret) - -/* - * This should not be used in a fastpath, without first trying _do in - * nonblocking mode - it will cause excessive transaction restarts and - * potentially livelocking: - */ -#define drop_locks_do(_trans, _do) \ -({ \ - bch2_trans_unlock(_trans); \ - (_do) ?: bch2_trans_relock(_trans); \ -}) - -#define allocate_dropping_locks_errcode(_trans, _do) \ -({ \ - gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ - int _ret = _do; \ - \ - if (bch2_err_matches(_ret, ENOMEM)) { \ - _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(_trans, _do); \ - } \ - _ret; \ -}) - -#define allocate_dropping_locks(_trans, _ret, _do) \ -({ \ - gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ - typeof(_do) _p = _do; \ - \ - _ret = 0; \ - if (unlikely(!_p)) { \ - _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(_trans, ((_p = _do), 0)); \ - } \ - _p; \ -}) - -struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); -void bch2_trans_put(struct btree_trans *); - -bool 
bch2_current_has_btree_trans(struct bch_fs *); - -extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; -unsigned bch2_trans_get_fn_idx(const char *); - -#define bch2_trans_get(_c) \ -({ \ - static unsigned trans_fn_idx; \ - \ - if (unlikely(!trans_fn_idx)) \ - trans_fn_idx = bch2_trans_get_fn_idx(__func__); \ - __bch2_trans_get(_c, trans_fn_idx); \ -}) - -/* - * We don't use DEFINE_CLASS() because using a function for the constructor - * breaks bch2_trans_get()'s use of __func__ - */ -typedef struct btree_trans * class_btree_trans_t; -static inline void class_btree_trans_destructor(struct btree_trans **p) -{ - struct btree_trans *trans = *p; - bch2_trans_put(trans); -} - -#define class_btree_trans_constructor(_c) bch2_trans_get(_c) - -#define bch2_trans_run(_c, _do) \ -({ \ - CLASS(btree_trans, trans)(_c); \ - (_do); \ -}) - -#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) - -void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); - -void bch2_fs_btree_iter_exit(struct bch_fs *); -void bch2_fs_btree_iter_init_early(struct bch_fs *); -int bch2_fs_btree_iter_init(struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c deleted file mode 100644 index ea839560a136..000000000000 --- a/fs/bcachefs/btree_journal_iter.c +++ /dev/null @@ -1,830 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_buf.h" -#include "bset.h" -#include "btree_cache.h" -#include "btree_journal_iter.h" -#include "journal_io.h" - -#include <linux/sort.h> - -/* - * For managing keys we read from the journal: until journal replay works normal - * btree lookups need to be able to find and return keys from the journal where - * they overwrite what's in the btree, so we have a special iterator and - * operations for the regular btree iter code to use: - */ - -static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos) -{ - size_t gap_size = keys->size - keys->nr; - - BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size); - - if (pos >= keys->gap) - pos -= gap_size; - return pos; -} - -static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) -{ - size_t gap_size = keys->size - keys->nr; - - if (idx >= keys->gap) - idx += gap_size; - return idx; -} - -static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) -{ - return keys->data + idx_to_pos(keys, idx); -} - -static size_t __bch2_journal_key_search(struct journal_keys *keys, - enum btree_id id, unsigned level, - struct bpos pos) -{ - size_t l = 0, r = keys->nr, m; - - while (l < r) { - m = l + ((r - l) >> 1); - if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) - l = m + 1; - else - r = m; - } - - BUG_ON(l < keys->nr && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); - - BUG_ON(l && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); - - return l; -} - -static size_t bch2_journal_key_search(struct journal_keys *keys, - enum btree_id id, unsigned level, - struct bpos pos) -{ - return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); -} - -/* Returns first non-overwritten key >= search key: */ -struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos, - struct bpos end_pos, size_t *idx) -{ - struct journal_keys *keys = &c->journal_keys; - unsigned iters = 0; - struct journal_key *k; - - BUG_ON(*idx > keys->nr); -search: - if (!*idx) - 
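/*
 * __bch2_journal_key_search() above binary searches a gap buffer: the
 * idx_to_pos()/pos_to_idx() helpers skip over the gap so the search can
 * treat the array as contiguous. A standalone sketch over a sorted array
 * of ints (hypothetical names, simplified comparison):
 */
#include <stddef.h>

struct gapbuf {
	int	*data;
	size_t	nr, size;	/* nr live elements in size slots */
	size_t	gap;		/* the size - nr free slots start here */
};

static size_t gb_idx_to_pos(const struct gapbuf *g, size_t idx)
{
	size_t gap_size = g->size - g->nr;

	return idx >= g->gap ? idx + gap_size : idx;
}

/* first logical index whose element compares >= key */
static size_t gb_lower_bound(const struct gapbuf *g, int key)
{
	size_t l = 0, r = g->nr;

	while (l < r) {
		size_t m = l + ((r - l) >> 1);

		if (g->data[gb_idx_to_pos(g, m)] < key)
			l = m + 1;
		else
			r = m;
	}
	return l;
}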
*idx = __bch2_journal_key_search(keys, btree_id, level, pos); - - while (*idx && - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { - --(*idx); - iters++; - if (iters == 10) { - *idx = 0; - goto search; - } - } - - struct bkey_i *ret = NULL; - rcu_read_lock(); /* for overwritten_ranges */ - - while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { - if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) - break; - - if (k->overwritten) { - if (k->overwritten_range) - *idx = rcu_dereference(k->overwritten_range)->end; - else - *idx += 1; - continue; - } - - if (__journal_key_cmp(btree_id, level, pos, k) <= 0) { - ret = k->k; - break; - } - - (*idx)++; - iters++; - if (iters == 10) { - *idx = 0; - rcu_read_unlock(); - goto search; - } - } - - rcu_read_unlock(); - return ret; -} - -struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos, - struct bpos end_pos, size_t *idx) -{ - struct journal_keys *keys = &c->journal_keys; - unsigned iters = 0; - struct journal_key *k; - - BUG_ON(*idx > keys->nr); - - if (!keys->nr) - return NULL; -search: - if (!*idx) - *idx = __bch2_journal_key_search(keys, btree_id, level, pos); - - while (*idx < keys->nr && - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) { - (*idx)++; - iters++; - if (iters == 10) { - *idx = 0; - goto search; - } - } - - if (*idx == keys->nr) - --(*idx); - - struct bkey_i *ret = NULL; - rcu_read_lock(); /* for overwritten_ranges */ - - while (true) { - k = idx_to_key(keys, *idx); - if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) - break; - - if (k->overwritten) { - if (k->overwritten_range) - *idx = rcu_dereference(k->overwritten_range)->start; - if (!*idx) - break; - --(*idx); - continue; - } - - if (__journal_key_cmp(btree_id, level, pos, k) >= 0) { - ret = k->k; - break; - } - - if (!*idx) - break; - --(*idx); - iters++; - if (iters == 10) { - *idx = 0; - goto search; - } - } - - rcu_read_unlock(); - return ret; -} - -struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos) -{ - size_t idx = 0; - - return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx); -} - -static void journal_iter_verify(struct journal_iter *iter) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct journal_keys *keys = iter->keys; - size_t gap_size = keys->size - keys->nr; - - BUG_ON(iter->idx >= keys->gap && - iter->idx < keys->gap + gap_size); - - if (iter->idx < keys->size) { - struct journal_key *k = keys->data + iter->idx; - - int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); - BUG_ON(cmp > 0); - } -#endif -} - -static void journal_iters_fix(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - /* The key we just inserted is immediately before the gap: */ - size_t gap_end = keys->gap + (keys->size - keys->nr); - struct journal_key *new_key = &keys->data[keys->gap - 1]; - struct journal_iter *iter; - - /* - * If an iterator points one after the key we just inserted, decrement - * the iterator so it points at the key we just inserted - if the - * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will - * handle that: - */ - list_for_each_entry(iter, &c->journal_iters, list) { - journal_iter_verify(iter); - if (iter->idx == gap_end && - new_key->btree_id == iter->btree_id && - new_key->level == iter->level) - iter->idx = keys->gap - 1; - journal_iter_verify(iter); - } -} - -static void 
journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) -{ - struct journal_keys *keys = &c->journal_keys; - struct journal_iter *iter; - size_t gap_size = keys->size - keys->nr; - - list_for_each_entry(iter, &c->journal_iters, list) { - if (iter->idx > old_gap) - iter->idx -= gap_size; - if (iter->idx >= new_gap) - iter->idx += gap_size; - } -} - -int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) -{ - struct journal_key n = { - .btree_id = id, - .level = level, - .k = k, - .allocated = true, - /* - * Ensure these keys are done last by journal replay, to unblock - * journal reclaim: - */ - .journal_seq = U64_MAX, - }; - struct journal_keys *keys = &c->journal_keys; - size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); - - BUG_ON(test_bit(BCH_FS_rw, &c->flags)); - - if (idx < keys->size && - journal_key_cmp(&n, &keys->data[idx]) == 0) { - if (keys->data[idx].allocated) - kfree(keys->data[idx].k); - keys->data[idx] = n; - return 0; - } - - if (idx > keys->gap) - idx -= keys->size - keys->nr; - - size_t old_gap = keys->gap; - - if (keys->nr == keys->size) { - journal_iters_move_gap(c, old_gap, keys->size); - old_gap = keys->size; - - struct journal_keys new_keys = { - .nr = keys->nr, - .size = max_t(size_t, keys->size, 8) * 2, - }; - - new_keys.data = bch2_kvmalloc(new_keys.size * sizeof(new_keys.data[0]), GFP_KERNEL); - if (!new_keys.data) { - bch_err(c, "%s: error allocating new key array (size %zu)", - __func__, new_keys.size); - return bch_err_throw(c, ENOMEM_journal_key_insert); - } - - /* Since @keys was full, there was no gap: */ - memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr); - kvfree(keys->data); - keys->data = new_keys.data; - keys->nr = new_keys.nr; - keys->size = new_keys.size; - - /* And now the gap is at the end: */ - keys->gap = keys->nr; - } - - journal_iters_move_gap(c, old_gap, idx); - - move_gap(keys, idx); - - keys->nr++; - keys->data[keys->gap++] = n; - - journal_iters_fix(c); - - return 0; -} - -/* - * Can only be used from the recovery thread while we're still RO - can't be - * used once we've got RW, as journal_keys is at that point used by multiple - * threads: - */ -int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) -{ - struct bkey_i *n; - int ret; - - n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); - if (!n) - return bch_err_throw(c, ENOMEM_journal_key_insert); - - bkey_copy(n, k); - ret = bch2_journal_key_insert_take(c, id, level, n); - if (ret) - kfree(n); - return ret; -} - -int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, - unsigned level, struct bpos pos) -{ - struct bkey_i whiteout; - - bkey_init(&whiteout.k); - whiteout.k.p = pos; - - return bch2_journal_key_insert(c, id, level, &whiteout); -} - -bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, - unsigned level, struct bpos pos) -{ - struct journal_keys *keys = &trans->c->journal_keys; - size_t idx = bch2_journal_key_search(keys, btree, level, pos); - - if (!trans->journal_replay_not_finished) - return false; - - return (idx < keys->size && - keys->data[idx].btree_id == btree && - keys->data[idx].level == level && - bpos_eq(keys->data[idx].k->k.p, pos) && - bkey_deleted(&keys->data[idx].k->k)); -} - -static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos) -{ - struct journal_key *k = keys->data + pos; - size_t idx = pos_to_idx(keys, pos); - - k->overwritten = true; - - struct journal_key 
*prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL; - struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL; - - bool prev_overwritten = prev && prev->overwritten; - bool next_overwritten = next && next->overwritten; - - struct journal_key_range_overwritten *prev_range = - prev_overwritten ? prev->overwritten_range : NULL; - struct journal_key_range_overwritten *next_range = - next_overwritten ? next->overwritten_range : NULL; - - BUG_ON(prev_range && prev_range->end != idx); - BUG_ON(next_range && next_range->start != idx + 1); - - if (prev_range && next_range) { - prev_range->end = next_range->end; - - keys->data[pos].overwritten_range = prev_range; - for (size_t i = next_range->start; i < next_range->end; i++) { - struct journal_key *ip = keys->data + idx_to_pos(keys, i); - BUG_ON(ip->overwritten_range != next_range); - ip->overwritten_range = prev_range; - } - - kfree_rcu_mightsleep(next_range); - } else if (prev_range) { - prev_range->end++; - k->overwritten_range = prev_range; - if (next_overwritten) { - prev_range->end++; - next->overwritten_range = prev_range; - } - } else if (next_range) { - next_range->start--; - k->overwritten_range = next_range; - if (prev_overwritten) { - next_range->start--; - prev->overwritten_range = next_range; - } - } else if (prev_overwritten || next_overwritten) { - struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL); - if (!r) - return; - - r->start = idx - (size_t) prev_overwritten; - r->end = idx + 1 + (size_t) next_overwritten; - - rcu_assign_pointer(k->overwritten_range, r); - if (prev_overwritten) - prev->overwritten_range = r; - if (next_overwritten) - next->overwritten_range = r; - } -} - -void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, - unsigned level, struct bpos pos) -{ - struct journal_keys *keys = &c->journal_keys; - size_t idx = bch2_journal_key_search(keys, btree, level, pos); - - if (idx < keys->size && - keys->data[idx].btree_id == btree && - keys->data[idx].level == level && - bpos_eq(keys->data[idx].k->k.p, pos) && - !keys->data[idx].overwritten) { - mutex_lock(&keys->overwrite_lock); - __bch2_journal_key_overwritten(keys, idx); - mutex_unlock(&keys->overwrite_lock); - } -} - -static void bch2_journal_iter_advance(struct journal_iter *iter) -{ - if (iter->idx < iter->keys->size) { - iter->idx++; - if (iter->idx == iter->keys->gap) - iter->idx += iter->keys->size - iter->keys->nr; - } -} - -static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) -{ - journal_iter_verify(iter); - - guard(rcu)(); - while (iter->idx < iter->keys->size) { - struct journal_key *k = iter->keys->data + iter->idx; - - int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); - if (cmp < 0) - break; - BUG_ON(cmp); - - if (!k->overwritten) - return bkey_i_to_s_c(k->k); - - if (k->overwritten_range) - iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); - else - bch2_journal_iter_advance(iter); - } - - return bkey_s_c_null; -} - -static void bch2_journal_iter_exit(struct journal_iter *iter) -{ - list_del(&iter->list); -} - -static void bch2_journal_iter_init(struct bch_fs *c, - struct journal_iter *iter, - enum btree_id id, unsigned level, - struct bpos pos) -{ - iter->btree_id = id; - iter->level = level; - iter->keys = &c->journal_keys; - iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); - - journal_iter_verify(iter); -} - -static struct bkey_s_c bch2_journal_iter_peek_btree(struct 
btree_and_journal_iter *iter) -{ - return bch2_btree_node_iter_peek_unpack(&iter->node_iter, - iter->b, &iter->unpacked); -} - -static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) -{ - bch2_btree_node_iter_advance(&iter->node_iter, iter->b); -} - -void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) -{ - if (bpos_eq(iter->pos, SPOS_MAX)) - iter->at_end = true; - else - iter->pos = bpos_successor(iter->pos); -} - -static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter) -{ - struct btree_and_journal_iter iter = *_iter; - struct bch_fs *c = iter.trans->c; - unsigned level = iter.journal.level; - struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_started, &c->flags) - ? (level > 1 ? 0 : 2) - : (level > 1 ? 1 : 16); - - iter.prefetch = false; - iter.fail_if_too_many_whiteouts = true; - bch2_bkey_buf_init(&tmp); - - while (nr--) { - bch2_btree_and_journal_iter_advance(&iter); - struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter); - if (!k.k) - break; - - bch2_bkey_buf_reassemble(&tmp, c, k); - bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1); - } - - bch2_bkey_buf_exit(&tmp, c); -} - -struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) -{ - struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; - size_t iters = 0; - - if (iter->prefetch && iter->journal.level) - btree_and_journal_iter_prefetch(iter); -again: - if (iter->at_end) - return bkey_s_c_null; - - iters++; - - if (iters > 20 && iter->fail_if_too_many_whiteouts) - return bkey_s_c_null; - - while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && - bpos_lt(btree_k.k->p, iter->pos)) - bch2_journal_iter_advance_btree(iter); - - if (iter->trans->journal_replay_not_finished) - while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && - bpos_lt(journal_k.k->p, iter->pos)) - bch2_journal_iter_advance(&iter->journal); - - ret = journal_k.k && - (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) - ? 
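/*
 * bch2_btree_and_journal_iter_peek() is a two-way merge of sorted streams:
 * peek both, return whichever key is smaller, and on a tie let the journal
 * key shadow the btree key, since journal entries are newer. A standalone
 * sketch over two sorted int arrays (hypothetical names; whiteout handling
 * and prefetch are omitted):
 */
#include <stddef.h>

struct merge_iter {
	const int *a, *b;		/* a: overlay, wins ties; b: base */
	size_t a_nr, b_nr, a_i, b_i;
};

/* return the next key in merged order and consume it; NULL when done */
static const int *merge_next(struct merge_iter *it)
{
	const int *ka = it->a_i < it->a_nr ? &it->a[it->a_i] : NULL;
	const int *kb = it->b_i < it->b_nr ? &it->b[it->b_i] : NULL;

	if (!ka && !kb)
		return NULL;

	if (ka && kb && *ka == *kb)
		it->b_i++;		/* the overlay shadows the base key */

	if (ka && (!kb || *ka <= *kb)) {
		it->a_i++;
		return ka;
	}
	it->b_i++;
	return kb;
}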
journal_k - : btree_k; - - if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) - ret = bkey_s_c_null; - - if (ret.k) { - iter->pos = ret.k->p; - if (bkey_deleted(ret.k)) { - bch2_btree_and_journal_iter_advance(iter); - goto again; - } - } else { - iter->pos = SPOS_MAX; - iter->at_end = true; - } - - return ret; -} - -void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) -{ - bch2_journal_iter_exit(&iter->journal); -} - -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, - struct btree_and_journal_iter *iter, - struct btree *b, - struct btree_node_iter node_iter, - struct bpos pos) -{ - memset(iter, 0, sizeof(*iter)); - - iter->trans = trans; - iter->b = b; - iter->node_iter = node_iter; - iter->pos = b->data->min_key; - iter->at_end = false; - INIT_LIST_HEAD(&iter->journal.list); - - if (trans->journal_replay_not_finished) { - bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); - if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags)) - list_add(&iter->journal.list, &trans->c->journal_iters); - } -} - -/* - * this version is used by btree_gc before filesystem has gone RW and - * multithreaded, so uses the journal_iters list: - */ -void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, - struct btree_and_journal_iter *iter, - struct btree *b) -{ - struct btree_node_iter node_iter; - - bch2_btree_node_iter_init_from_start(&node_iter, b); - __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); -} - -/* sort and dedup all keys in the journal: */ - -/* - * When keys compare equal, oldest compares first: - */ -static int journal_sort_key_cmp(const void *_l, const void *_r) -{ - const struct journal_key *l = _l; - const struct journal_key *r = _r; - int rewind = l->rewind && r->rewind ? 
-1 : 1; - - return journal_key_cmp(l, r) ?: - ((cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset)) * rewind); -} - -void bch2_journal_keys_put(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - BUG_ON(atomic_read(&keys->ref) <= 0); - - if (!atomic_dec_and_test(&keys->ref)) - return; - - move_gap(keys, keys->nr); - - darray_for_each(*keys, i) { - if (i->overwritten_range && - (i == &darray_last(*keys) || - i->overwritten_range != i[1].overwritten_range)) - kfree(i->overwritten_range); - - if (i->allocated) - kfree(i->k); - } - - kvfree(keys->data); - keys->data = NULL; - keys->nr = keys->gap = keys->size = 0; - - struct journal_replay **i; - struct genradix_iter iter; - - genradix_for_each(&c->journal_entries, iter, i) - kvfree(*i); - genradix_free(&c->journal_entries); -} - -static void __journal_keys_sort(struct journal_keys *keys) -{ - sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]), - journal_sort_key_cmp, NULL); - - cond_resched(); - - struct journal_key *dst = keys->data; - - darray_for_each(*keys, src) { - /* - * We don't accumulate accounting keys here because we have to - * compare each individual accounting key against the version in - * the btree during replay: - */ - if (src->k->k.type != KEY_TYPE_accounting && - src + 1 < &darray_top(*keys) && - !journal_key_cmp(src, src + 1)) - continue; - - *dst++ = *src; - } - - keys->nr = dst - keys->data; -} - -int bch2_journal_keys_sort(struct bch_fs *c) -{ - struct genradix_iter iter; - struct journal_replay *i, **_i; - struct journal_keys *keys = &c->journal_keys; - size_t nr_read = 0; - - u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX; - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - cond_resched(); - - vstruct_for_each(&i->j, entry) { - bool rewind = !entry->level && - !btree_id_is_alloc(entry->btree_id) && - le64_to_cpu(i->j.seq) >= rewind_seq; - - if (entry->type != (rewind - ? 
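/*
 * journal_sort_key_cmp() and __journal_keys_sort() above implement
 * sort-then-compact: sorting places duplicates of a key adjacent with the
 * oldest first, so compaction keeps an element only when its successor is
 * not an equal (newer) key. A standalone sketch (hypothetical names):
 */
#include <stdlib.h>

struct rec {
	int		key;
	unsigned	seq;	/* higher seq == newer */
};

static int rec_cmp(const void *_l, const void *_r)
{
	const struct rec *l = _l, *r = _r;

	if (l->key != r->key)
		return l->key < r->key ? -1 : 1;
	return l->seq < r->seq ? -1 : l->seq > r->seq;
}

static size_t sort_and_dedup(struct rec *v, size_t nr)
{
	struct rec *dst = v;

	qsort(v, nr, sizeof(*v), rec_cmp);

	for (size_t i = 0; i < nr; i++) {
		if (i + 1 < nr && v[i].key == v[i + 1].key)
			continue;	/* a newer duplicate follows; drop this one */
		*dst++ = v[i];
	}
	return dst - v;	/* new element count */
}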
BCH_JSET_ENTRY_overwrite - : BCH_JSET_ENTRY_btree_keys)) - continue; - - if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start) - continue; - - jset_entry_for_each_key(entry, k) { - struct journal_key n = (struct journal_key) { - .btree_id = entry->btree_id, - .level = entry->level, - .rewind = rewind, - .k = k, - .journal_seq = le64_to_cpu(i->j.seq), - .journal_offset = k->_data - i->j._data, - }; - - if (darray_push(keys, n)) { - __journal_keys_sort(keys); - - if (keys->nr * 8 > keys->size * 7) { - bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", - keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); - return bch_err_throw(c, ENOMEM_journal_keys_sort); - } - - BUG_ON(darray_push(keys, n)); - } - - nr_read++; - } - } - } - - __journal_keys_sort(keys); - keys->gap = keys->nr; - - bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); - return 0; -} - -void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, - unsigned level_min, unsigned level_max, - struct bpos start, struct bpos end) -{ - struct journal_keys *keys = &c->journal_keys; - size_t dst = 0; - - move_gap(keys, keys->nr); - - darray_for_each(*keys, i) - if (!(i->btree_id == btree && - i->level >= level_min && - i->level <= level_max && - bpos_ge(i->k->k.p, start) && - bpos_le(i->k->k.p, end))) - keys->data[dst++] = *i; - keys->nr = keys->gap = dst; -} - -void bch2_journal_keys_dump(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - struct printbuf buf = PRINTBUF; - - pr_info("%zu keys:", keys->nr); - - move_gap(keys, keys->nr); - - darray_for_each(*keys, i) { - printbuf_reset(&buf); - prt_printf(&buf, "btree="); - bch2_btree_id_to_text(&buf, i->btree_id); - prt_printf(&buf, " l=%u ", i->level); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - pr_err("%s", buf.buf); - } - printbuf_exit(&buf); -} - -void bch2_fs_journal_keys_init(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - atomic_set(&keys->ref, 1); - keys->initial_ref_held = true; - mutex_init(&keys->overwrite_lock); -} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h deleted file mode 100644 index 2a3082919b8d..000000000000 --- a/fs/bcachefs/btree_journal_iter.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H -#define _BCACHEFS_BTREE_JOURNAL_ITER_H - -#include "bkey.h" - -struct journal_iter { - struct list_head list; - enum btree_id btree_id; - unsigned level; - size_t idx; - struct journal_keys *keys; -}; - -/* - * Iterate over keys in the btree, with keys from the journal overlaid on top: - */ - -struct btree_and_journal_iter { - struct btree_trans *trans; - struct btree *b; - struct btree_node_iter node_iter; - struct bkey unpacked; - - struct journal_iter journal; - struct bpos pos; - bool at_end; - bool prefetch; - bool fail_if_too_many_whiteouts; -}; - -static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, - unsigned l_level, - const struct journal_key *r) -{ - return -cmp_int(l_level, r->level) ?: - cmp_int(l_btree_id, r->btree_id); -} - -static inline int __journal_key_cmp(enum btree_id l_btree_id, - unsigned l_level, - struct bpos l_pos, - const struct journal_key *r) -{ - return __journal_key_btree_cmp(l_btree_id, l_level, r) ?: - bpos_cmp(l_pos, r->k->k.p); -} - -static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -{ - return 
__journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); -} - -struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, - unsigned, struct bpos, struct bpos, size_t *); -struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id, - unsigned, struct bpos, struct bpos, size_t *); -struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, - unsigned, struct bpos); - -int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *, - struct btree_and_journal_iter *); - -int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, - unsigned, struct bkey_i *); -int bch2_journal_key_insert(struct bch_fs *, enum btree_id, - unsigned, struct bkey_i *); -int bch2_journal_key_delete(struct bch_fs *, enum btree_id, - unsigned, struct bpos); -bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos); -void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos); - -void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); -struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); - -void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, - struct btree_and_journal_iter *, struct btree *, - struct btree_node_iter, struct bpos); -void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, - struct btree_and_journal_iter *, struct btree *); - -void bch2_journal_keys_put(struct bch_fs *); - -static inline void bch2_journal_keys_put_initial(struct bch_fs *c) -{ - if (c->journal_keys.initial_ref_held) - bch2_journal_keys_put(c); - c->journal_keys.initial_ref_held = false; -} - -int bch2_journal_keys_sort(struct bch_fs *); - -void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, - unsigned, unsigned, - struct bpos, struct bpos); - -void bch2_journal_keys_dump(struct bch_fs *); - -void bch2_fs_journal_keys_init(struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h deleted file mode 100644 index 86aacb254fb2..000000000000 --- a/fs/bcachefs/btree_journal_iter_types.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H -#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H - -struct journal_key_range_overwritten { - size_t start, end; -}; - -struct journal_key { - u64 journal_seq; - u32 journal_offset; - enum btree_id btree_id:8; - unsigned level:8; - bool allocated:1; - bool overwritten:1; - bool rewind:1; - struct journal_key_range_overwritten __rcu * - overwritten_range; - struct bkey_i *k; -}; - -struct journal_keys { - /* must match layout in darray_types.h */ - size_t nr, size; - struct journal_key *data; - /* - * Gap buffer: instead of all the empty space in the array being at the - * end of the buffer - from @nr to @size - the empty space is at @gap. - * This means that sequential insertions are O(n) instead of O(n^2). 
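/*
 * A standalone sketch of the insertion scheme this comment describes,
 * mirroring the gapbuf layout sketched earlier: move the gap to the
 * insertion point (usually already adjacent for sequential insertions,
 * hence the O(n) total), then fill the slot the gap vacates. Hypothetical
 * names; the kernel version stores journal keys rather than ints.
 */
#include <stddef.h>
#include <string.h>

struct gb {
	int	*data;
	size_t	nr, size, gap;
};

static void gb_move_gap(struct gb *g, size_t new_gap)
{
	size_t gap_size = g->size - g->nr;

	if (new_gap < g->gap)		/* shift [new_gap, gap) right, past the gap */
		memmove(g->data + new_gap + gap_size, g->data + new_gap,
			(g->gap - new_gap) * sizeof(*g->data));
	else if (new_gap > g->gap)	/* shift [gap, new_gap) left, into the gap */
		memmove(g->data + g->gap, g->data + g->gap + gap_size,
			(new_gap - g->gap) * sizeof(*g->data));
	g->gap = new_gap;
}

/* caller guarantees nr < size; insert value v at logical index idx */
static void gb_insert(struct gb *g, size_t idx, int v)
{
	gb_move_gap(g, idx);
	g->data[g->gap++] = v;
	g->nr++;
}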
- */ - size_t gap; - atomic_t ref; - bool initial_ref_held; - struct mutex overwrite_lock; -}; - -#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c deleted file mode 100644 index d96188b92db2..000000000000 --- a/fs/bcachefs/btree_key_cache.c +++ /dev/null @@ -1,880 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "btree_iter.h" -#include "btree_key_cache.h" -#include "btree_locking.h" -#include "btree_update.h" -#include "errcode.h" -#include "error.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "trace.h" - -#include <linux/sched/mm.h> - -static inline bool btree_uses_pcpu_readers(enum btree_id id) -{ - return id == BTREE_ID_subvolumes; -} - -static struct kmem_cache *bch2_key_cache; - -static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct bkey_cached *ck = obj; - const struct bkey_cached_key *key = arg->key; - - return ck->key.btree_id != key->btree_id || - !bpos_eq(ck->key.pos, key->pos); -} - -static const struct rhashtable_params bch2_btree_key_cache_params = { - .head_offset = offsetof(struct bkey_cached, hash), - .key_offset = offsetof(struct bkey_cached, key), - .key_len = sizeof(struct bkey_cached_key), - .obj_cmpfn = bch2_btree_key_cache_cmp_fn, - .automatic_shrinking = true, -}; - -static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path, - struct bkey_cached *ck, - enum btree_node_locked_type lock_held) -{ - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - path->l[0].b = (void *) ck; - mark_btree_node_locked(trans, path, 0, lock_held); -} - -__flatten -inline struct bkey_cached * -bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) -{ - struct bkey_cached_key key = { - .btree_id = btree_id, - .pos = pos, - }; - - return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, - bch2_btree_key_cache_params); -} - -static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) -{ - if (!six_trylock_intent(&ck->c.lock)) - return false; - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - six_unlock_intent(&ck->c.lock); - return false; - } - - if (!six_trylock_write(&ck->c.lock)) { - six_unlock_intent(&ck->c.lock); - return false; - } - - return true; -} - -static bool bkey_cached_evict(struct btree_key_cache *c, - struct bkey_cached *ck) -{ - bool ret = !rhashtable_remove_fast(&c->table, &ck->hash, - bch2_btree_key_cache_params); - if (ret) { - memset(&ck->key, ~0, sizeof(ck->key)); - atomic_long_dec(&c->nr_keys); - } - - return ret; -} - -static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu) -{ - struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier); - struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu); - - this_cpu_dec(*c->btree_key_cache.nr_pending); - kmem_cache_free(bch2_key_cache, ck); -} - -static inline void bkey_cached_free_noassert(struct btree_key_cache *bc, - struct bkey_cached *ck) -{ - kfree(ck->k); - ck->k = NULL; - ck->u64s = 0; - - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - - bool pcpu_readers = ck->c.lock.readers != NULL; - rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu); - this_cpu_inc(*bc->nr_pending); -} - -static void bkey_cached_free(struct btree_trans *trans, - struct btree_key_cache *bc, - struct bkey_cached *ck) -{ - /* - * we'll hit strange issues in the SRCU code if we aren't 
holding an - * SRCU read lock... - */ - EBUG_ON(!trans->srcu_held); - - bkey_cached_free_noassert(bc, ck); -} - -static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) -{ - gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; - - struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); - if (unlikely(!ck)) - return NULL; - ck->k = kmalloc(key_u64s * sizeof(u64), gfp); - if (unlikely(!ck->k)) { - kmem_cache_free(bch2_key_cache, ck); - return NULL; - } - ck->u64s = key_u64s; - return ck; -} - -static struct bkey_cached * -bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) -{ - struct bch_fs *c = trans->c; - struct btree_key_cache *bc = &c->btree_key_cache; - bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); - int ret; - - struct bkey_cached *ck = container_of_or_null( - rcu_pending_dequeue(&bc->pending[pcpu_readers]), - struct bkey_cached, rcu); - if (ck) - goto lock; - - ck = allocate_dropping_locks(trans, ret, - __bkey_cached_alloc(key_u64s, _gfp)); - if (ret) { - if (ck) - kfree(ck->k); - kmem_cache_free(bch2_key_cache, ck); - return ERR_PTR(ret); - } - - if (ck) { - bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); - ck->c.cached = true; - goto lock; - } - - ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]), - struct bkey_cached, rcu); - if (ck) - goto lock; -lock: - six_lock_intent(&ck->c.lock, NULL, NULL); - six_lock_write(&ck->c.lock, NULL, NULL); - return ck; -} - -static struct bkey_cached * -bkey_cached_reuse(struct btree_key_cache *c) -{ - - guard(rcu)(); - struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table); - struct rhash_head *pos; - struct bkey_cached *ck; - - for (unsigned i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bkey_cached_lock_for_evict(ck)) { - if (bkey_cached_evict(c, ck)) - return ck; - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - } - } - return NULL; -} - -static int btree_key_cache_create(struct btree_trans *trans, - struct btree_path *path, - struct btree_path *ck_path, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct btree_key_cache *bc = &c->btree_key_cache; - - /* - * bch2_varint_decode can read past the end of the buffer by at - * most 7 bytes (it won't be used): - */ - unsigned key_u64s = k.k->u64s + 1; - - /* - * Allocate some extra space so that the transaction commit path is less - * likely to have to reallocate, since that requires a transaction - * restart: - */ - key_u64s = min(256U, (key_u64s * 3) / 2); - key_u64s = roundup_pow_of_two(key_u64s); - - struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s); - int ret = PTR_ERR_OR_ZERO(ck); - if (ret) - return ret; - - if (unlikely(!ck)) { - ck = bkey_cached_reuse(bc); - if (unlikely(!ck)) { - bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_id_str(ck_path->btree_id)); - return bch_err_throw(c, ENOMEM_btree_key_cache_create); - } - } - - ck->c.level = 0; - ck->c.btree_id = ck_path->btree_id; - ck->key.btree_id = ck_path->btree_id; - ck->key.pos = ck_path->pos; - ck->flags = 1U << BKEY_CACHED_ACCESSED; - - if (unlikely(key_u64s > ck->u64s)) { - mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); - - struct bkey_i *new_k = allocate_dropping_locks(trans, ret, - kmalloc(key_u64s * sizeof(u64), _gfp)); - if (unlikely(!new_k)) { - bch_err(trans->c, "error allocating memory for key cache 
key, btree %s u64s %u", - bch2_btree_id_str(ck->key.btree_id), key_u64s); - ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill); - } else if (ret) { - kfree(new_k); - goto err; - } - - kfree(ck->k); - ck->k = new_k; - ck->u64s = key_u64s; - } - - bkey_reassemble(ck->k, k); - - ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c); - if (unlikely(ret)) - goto err; - - ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); - - bch2_btree_node_unlock_write(trans, path, path_l(path)->b); - - if (unlikely(ret)) /* raced with another fill? */ - goto err; - - atomic_long_inc(&bc->nr_keys); - six_unlock_write(&ck->c.lock); - - enum six_lock_type lock_want = __btree_lock_want(ck_path, 0); - if (lock_want == SIX_LOCK_read) - six_lock_downgrade(&ck->c.lock); - btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want); - ck_path->uptodate = BTREE_ITER_UPTODATE; - return 0; -err: - bkey_cached_free(trans, bc, ck); - mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); - - return ret; -} - -static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans, - struct btree_path *ck_path, - struct bkey_s_c k) -{ - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, ck_path->pos); - prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, trans->c, k); - trace_key_cache_fill(trans, buf.buf); - printbuf_exit(&buf); -} - -static noinline int btree_key_cache_fill(struct btree_trans *trans, - btree_path_idx_t ck_path_idx, - unsigned flags) -{ - struct btree_path *ck_path = trans->paths + ck_path_idx; - - if (flags & BTREE_ITER_cached_nofill) { - ck_path->l[0].b = NULL; - return 0; - } - - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, - BTREE_ITER_intent| - BTREE_ITER_key_cache_fill| - BTREE_ITER_cached_nofill); - iter.flags &= ~BTREE_ITER_with_journal; - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - /* Recheck after btree lookup, before allocating: */ - ck_path = trans->paths + ck_path_idx; - ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? 
-EEXIST : 0; - if (unlikely(ret)) - goto out; - - ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k); - if (ret) - goto err; - - if (trace_key_cache_fill_enabled()) - do_trace_key_cache_fill(trans, ck_path, k); -out: - /* We're not likely to need this iterator again: */ - bch2_set_btree_iter_dontneed(trans, &iter); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, - btree_path_idx_t path_idx) -{ - struct bch_fs *c = trans->c; - struct bkey_cached *ck; - struct btree_path *path = trans->paths + path_idx; -retry: - ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) - return -ENOENT; - - enum six_lock_type lock_want = __btree_lock_want(path, 0); - - int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_); - if (ret) - return ret; - - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; - } - - if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) - set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - - btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); - path->uptodate = BTREE_ITER_UPTODATE; - return 0; -} - -int bch2_btree_path_traverse_cached(struct btree_trans *trans, - btree_path_idx_t path_idx, - unsigned flags) -{ - EBUG_ON(trans->paths[path_idx].level); - - int ret; - do { - ret = btree_path_traverse_cached_fast(trans, path_idx); - if (unlikely(ret == -ENOENT)) - ret = btree_key_cache_fill(trans, path_idx, flags); - } while (ret == -EEXIST); - - struct btree_path *path = trans->paths + path_idx; - - if (unlikely(ret)) { - path->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - btree_node_unlock(trans, path, 0); - path->l[0].b = ERR_PTR(ret); - } - } else { - BUG_ON(path->uptodate); - BUG_ON(!path->nodes_locked); - } - - return ret; -} - -static int btree_key_cache_flush_pos(struct btree_trans *trans, - struct bkey_cached_key key, - u64 journal_seq, - unsigned commit_flags, - bool evict) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct btree_iter c_iter, b_iter; - struct bkey_cached *ck = NULL; - int ret; - - bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, - BTREE_ITER_slots| - BTREE_ITER_intent| - BTREE_ITER_all_snapshots); - bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, - BTREE_ITER_cached| - BTREE_ITER_intent); - b_iter.flags &= ~BTREE_ITER_with_key_cache; - - ret = bch2_btree_iter_traverse(trans, &c_iter); - if (ret) - goto out; - - ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b; - if (!ck) - goto out; - - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - if (evict) - goto evict; - goto out; - } - - if (journal_seq && ck->journal.seq != journal_seq) - goto out; - - trans->journal_res.seq = ck->journal.seq; - - /* - * If we're at the end of the journal, we really want to free up space - * in the journal right away - we don't want to pin that old journal - * sequence number with a new btree node write, we want to re-journal - * the update - */ - if (ck->journal.seq == journal_last_seq(j)) - commit_flags |= BCH_WATERMARK_reclaim; - - if (ck->journal.seq != journal_last_seq(j) || - !test_bit(JOURNAL_space_low, &c->journal.flags)) - commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - - struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter); - ret = bkey_err(btree_k); - if (ret) - goto err; - - /* * Check 
that we're not violating cache coherency rules: */ - BUG_ON(bkey_deleted(btree_k.k)); - - ret = bch2_trans_update(trans, &b_iter, ck->k, - BTREE_UPDATE_key_cache_reclaim| - BTREE_UPDATE_internal_snapshot_node| - BTREE_TRIGGER_norun) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc| - commit_flags); -err: - bch2_fs_fatal_err_on(ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && - !bch2_journal_error(j), c, - "flushing key cache: %s", bch2_err_str(ret)); - if (ret) - goto out; - - bch2_journal_pin_drop(j, &ck->journal); - - struct btree_path *path = btree_iter_path(trans, &c_iter); - BUG_ON(!btree_node_locked(path, 0)); - - if (!evict) { - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - clear_bit(BKEY_CACHED_DIRTY, &ck->flags); - atomic_long_dec(&c->btree_key_cache.nr_dirty); - } - } else { - struct btree_path *path2; - unsigned i; -evict: - trans_for_each_path(trans, path2, i) - if (path2 != path) - __bch2_btree_path_unlock(trans, path2); - - bch2_btree_node_lock_write_nofail(trans, path, &ck->c); - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - clear_bit(BKEY_CACHED_DIRTY, &ck->flags); - atomic_long_dec(&c->btree_key_cache.nr_dirty); - } - - mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); - if (bkey_cached_evict(&c->btree_key_cache, ck)) { - bkey_cached_free(trans, &c->btree_key_cache, ck); - } else { - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - } - } -out: - bch2_trans_iter_exit(trans, &b_iter); - bch2_trans_iter_exit(trans, &c_iter); - return ret; -} - -int bch2_btree_key_cache_journal_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bkey_cached *ck = - container_of(pin, struct bkey_cached, journal); - struct bkey_cached_key key; - struct btree_trans *trans = bch2_trans_get(c); - int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - int ret = 0; - - btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); - key = ck->key; - - if (ck->journal.seq != seq || - !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - six_unlock_read(&ck->c.lock); - goto unlock; - } - - if (ck->seq != seq) { - bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, - bch2_btree_key_cache_journal_flush); - six_unlock_read(&ck->c.lock); - goto unlock; - } - six_unlock_read(&ck->c.lock); - - ret = lockrestart_do(trans, - btree_key_cache_flush_pos(trans, key, seq, - BCH_TRANS_COMMIT_journal_reclaim, false)); -unlock: - srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - - bch2_trans_put(trans); - return ret; -} - -bool bch2_btree_insert_key_cached(struct btree_trans *trans, - unsigned flags, - struct btree_insert_entry *insert_entry) -{ - struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b; - struct bkey_i *insert = insert_entry->k; - bool kick_reclaim = false; - - BUG_ON(insert->k.u64s > ck->u64s); - - bkey_copy(ck->k, insert); - - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - set_bit(BKEY_CACHED_DIRTY, &ck->flags); - atomic_long_inc(&c->btree_key_cache.nr_dirty); - - if (bch2_nr_btree_keys_need_flush(c)) - kick_reclaim = true; - } - - /* - * To minimize lock contention, we only add the journal pin here and - * defer pin updates to the flush callback via ->seq. 
Be careful not to - * update ->seq on nojournal commits because we don't want to update the - * pin to a seq that doesn't include journal updates on disk. Otherwise - * we risk losing the update after a crash. - * - * The only exception is if the pin is not active in the first place. We - * have to add the pin because journal reclaim drives key cache - * flushing. The flush callback will not proceed unless ->seq matches - * the latest pin, so make sure it starts with a consistent value. - */ - if (!(insert_entry->flags & BTREE_UPDATE_nojournal) || - !journal_pin_active(&ck->journal)) { - ck->seq = trans->journal_res.seq; - } - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, - &ck->journal, bch2_btree_key_cache_journal_flush); - - if (kick_reclaim) - journal_reclaim_kick(&c->journal); - return true; -} - -void bch2_btree_key_cache_drop(struct btree_trans *trans, - struct btree_path *path) -{ - struct bch_fs *c = trans->c; - struct btree_key_cache *bc = &c->btree_key_cache; - struct bkey_cached *ck = (void *) path->l[0].b; - - /* - * We just did an update to the btree, bypassing the key cache: the key - * cache key is now stale and must be dropped, even if dirty: - */ - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - clear_bit(BKEY_CACHED_DIRTY, &ck->flags); - atomic_long_dec(&c->btree_key_cache.nr_dirty); - bch2_journal_pin_drop(&c->journal, &ck->journal); - } - - bkey_cached_evict(bc, ck); - bkey_cached_free(trans, bc, ck); - - mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); - - struct btree_path *path2; - unsigned i; - trans_for_each_path(trans, path2, i) - if (path2->l[0].b == (void *) ck) { - /* - * It's safe to clear should_be_locked here because - * we're evicting from the key cache, and we still have - * the underlying btree locked: filling into the key - * cache would require taking a write lock on the btree - * node - */ - path2->should_be_locked = false; - __bch2_btree_path_unlock(trans, path2); - path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); - btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE); - } - - bch2_trans_verify_locks(trans); -} - -static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct bch_fs *c = shrink->private_data; - struct btree_key_cache *bc = &c->btree_key_cache; - struct bucket_table *tbl; - struct bkey_cached *ck; - size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; - unsigned iter, start; - int srcu_idx; - - srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - rcu_read_lock(); - - tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - - /* - * Scanning is expensive while a rehash is in progress - most elements - * will be on the new hashtable, if it's in progress - * - * A rehash could still start while we're scanning - that's ok, we'll - * still see most elements. 
- */ - if (unlikely(tbl->nest)) { - rcu_read_unlock(); - srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - return SHRINK_STOP; - } - - iter = bc->shrink_iter; - if (iter >= tbl->size) - iter = 0; - start = iter; - - do { - struct rhash_head *pos, *next; - - pos = rht_ptr_rcu(&tbl->buckets[iter]); - - while (!rht_is_a_nulls(pos)) { - next = rht_dereference_bucket_rcu(pos->next, tbl, iter); - ck = container_of(pos, struct bkey_cached, hash); - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - bc->skipped_dirty++; - } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { - clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); - bc->skipped_accessed++; - } else if (!bkey_cached_lock_for_evict(ck)) { - bc->skipped_lock_fail++; - } else if (bkey_cached_evict(bc, ck)) { - bkey_cached_free_noassert(bc, ck); - bc->freed++; - freed++; - } else { - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - } - - scanned++; - if (scanned >= nr) - goto out; - - pos = next; - } - - iter++; - if (iter >= tbl->size) - iter = 0; - } while (scanned < nr && iter != start); -out: - bc->shrink_iter = iter; - - rcu_read_unlock(); - srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - - return freed; -} - -static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct bch_fs *c = shrink->private_data; - struct btree_key_cache *bc = &c->btree_key_cache; - long nr = atomic_long_read(&bc->nr_keys) - - atomic_long_read(&bc->nr_dirty); - - /* - * Avoid hammering our shrinker too much if it's nearly empty - the - * shrinker code doesn't take into account how big our cache is, if it's - * mostly empty but the system is under memory pressure it causes nasty - * lock contention: - */ - nr -= 128; - - return max(0L, nr); -} - -void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) -{ - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - struct bucket_table *tbl; - struct bkey_cached *ck; - struct rhash_head *pos; - LIST_HEAD(items); - unsigned i; - - shrinker_free(bc->shrink); - - /* - * The loop is needed to guard against racing with rehash: - */ - while (atomic_long_read(&bc->nr_keys)) { - rcu_read_lock(); - tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (tbl) { - if (tbl->nest) { - /* wait for in progress rehash */ - rcu_read_unlock(); - mutex_lock(&bc->table.mutex); - mutex_unlock(&bc->table.mutex); - continue; - } - for (i = 0; i < tbl->size; i++) - while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { - ck = container_of(pos, struct bkey_cached, hash); - BUG_ON(!bkey_cached_evict(bc, ck)); - kfree(ck->k); - kmem_cache_free(bch2_key_cache, ck); - } - } - rcu_read_unlock(); - } - - if (atomic_long_read(&bc->nr_dirty) && - !bch2_journal_error(&c->journal) && - test_bit(BCH_FS_was_rw, &c->flags)) - panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", - atomic_long_read(&bc->nr_dirty)); - - if (atomic_long_read(&bc->nr_keys)) - panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", - atomic_long_read(&bc->nr_keys)); - - if (bc->table_init_done) - rhashtable_destroy(&bc->table); - - rcu_pending_exit(&bc->pending[0]); - rcu_pending_exit(&bc->pending[1]); - - free_percpu(bc->nr_pending); -} - -void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) -{ -} - -int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) -{ - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - struct shrinker *shrink; - - bc->nr_pending = alloc_percpu(size_t); - if (!bc->nr_pending) 
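/*
 * The shrinker scan above is a second-chance ("clock") eviction policy:
 * dirty entries are skipped until written back, recently-used entries get
 * their accessed bit cleared and survive one pass, and only cold, clean
 * entries are freed. A standalone sketch (hypothetical names; locking and
 * the hash-table walk are omitted):
 */
#include <stdbool.h>
#include <stddef.h>

struct cache_entry {
	bool live, dirty, accessed;
};

static size_t clock_scan(struct cache_entry *tbl, size_t size,
			 size_t *hand, size_t nr_to_free)
{
	size_t freed = 0;

	for (size_t scanned = 0; freed < nr_to_free && scanned < size; scanned++) {
		struct cache_entry *e = &tbl[*hand];

		*hand = (*hand + 1) % size;	/* position persists across calls */

		if (!e->live || e->dirty)
			continue;		/* writeback must happen first */
		if (e->accessed) {
			e->accessed = false;	/* second chance */
			continue;
		}
		e->live = false;		/* cold and clean: evict */
		freed++;
	}
	return freed;
}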
- return bch_err_throw(c, ENOMEM_fs_btree_cache_init); - - if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || - rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free)) - return bch_err_throw(c, ENOMEM_fs_btree_cache_init); - - if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) - return bch_err_throw(c, ENOMEM_fs_btree_cache_init); - - bc->table_init_done = true; - - shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); - if (!shrink) - return bch_err_throw(c, ENOMEM_fs_btree_cache_init); - bc->shrink = shrink; - shrink->count_objects = bch2_btree_key_cache_count; - shrink->scan_objects = bch2_btree_key_cache_scan; - shrink->batch = 1 << 14; - shrink->seeks = 0; - shrink->private_data = c; - shrinker_register(shrink); - return 0; -} - -void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) -{ - printbuf_tabstop_push(out, 24); - printbuf_tabstop_push(out, 12); - - prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); - prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); - prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size); - prt_newline(out); - prt_printf(out, "shrinker:\n"); - prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); - prt_printf(out, "freed:\t%lu\r\n", bc->freed); - prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); - prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); - prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); - prt_newline(out); - prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending)); -} - -void bch2_btree_key_cache_exit(void) -{ - kmem_cache_destroy(bch2_key_cache); -} - -int __init bch2_btree_key_cache_init(void) -{ - bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); - if (!bch2_key_cache) - return -ENOMEM; - - return 0; -} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h deleted file mode 100644 index 82d8c72512a9..000000000000 --- a/fs/bcachefs/btree_key_cache.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_KEY_CACHE_H -#define _BCACHEFS_BTREE_KEY_CACHE_H - -static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) -{ - size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); - size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); - size_t max_dirty = 1024 + nr_keys / 2; - - return max_t(ssize_t, 0, nr_dirty - max_dirty); -} - -static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c) -{ - size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); - size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); - size_t max_dirty = 4096 + (nr_keys * 3) / 4; - - return nr_dirty - max_dirty; -} - -static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) -{ - return __bch2_btree_key_cache_must_wait(c) > 0; -} - -static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c) -{ - size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); - size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); - size_t max_dirty = 2048 + (nr_keys * 5) / 8; - - return nr_dirty <= max_dirty; -} - -int bch2_btree_key_cache_journal_flush(struct journal *, - struct journal_entry_pin *, u64); - -struct bkey_cached * -bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); - -int bch2_btree_path_traverse_cached(struct btree_trans *, btree_path_idx_t, unsigned); - -bool 
bch2_btree_insert_key_cached(struct btree_trans *, unsigned, - struct btree_insert_entry *); -void bch2_btree_key_cache_drop(struct btree_trans *, - struct btree_path *); - -void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); -void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); -int bch2_fs_btree_key_cache_init(struct btree_key_cache *); - -void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); - -void bch2_btree_key_cache_exit(void); -int __init bch2_btree_key_cache_init(void); - -#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h deleted file mode 100644 index 722f1ed10551..000000000000 --- a/fs/bcachefs/btree_key_cache_types.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H -#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H - -#include "rcu_pending.h" - -struct btree_key_cache { - struct rhashtable table; - bool table_init_done; - - struct shrinker *shrink; - unsigned shrink_iter; - - /* 0: non pcpu reader locks, 1: pcpu reader locks */ - struct rcu_pending pending[2]; - size_t __percpu *nr_pending; - - atomic_long_t nr_keys; - atomic_long_t nr_dirty; - - /* shrinker stats */ - unsigned long requested_to_free; - unsigned long freed; - unsigned long skipped_dirty; - unsigned long skipped_accessed; - unsigned long skipped_lock_fail; -}; - -struct bkey_cached_key { - u32 btree_id; - struct bpos pos; -} __packed __aligned(4); - -#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c deleted file mode 100644 index bed2b4b6ffb9..000000000000 --- a/fs/bcachefs/btree_locking.c +++ /dev/null @@ -1,936 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "btree_locking.h" -#include "btree_types.h" - -static struct lock_class_key bch2_btree_node_lock_key; - -void bch2_btree_lock_init(struct btree_bkey_cached_common *b, - enum six_lock_init_flags flags, - gfp_t gfp) -{ - __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp); - lockdep_set_notrack_class(&b->lock); -} - -/* Btree node locking: */ - -struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, - struct btree_path *skip, - struct btree_bkey_cached_common *b, - unsigned level) -{ - struct btree_path *path; - struct six_lock_count ret; - unsigned i; - - memset(&ret, 0, sizeof(ret)); - - if (IS_ERR_OR_NULL(b)) - return ret; - - trans_for_each_path(trans, path, i) - if (path != skip && &path->l[level].b->c == b) { - int t = btree_node_locked_type(path, level); - - if (t != BTREE_NODE_UNLOCKED) - ret.n[t]++; - } - - return ret; -} - -/* unlock */ - -void bch2_btree_node_unlock_write(struct btree_trans *trans, - struct btree_path *path, struct btree *b) -{ - bch2_btree_node_unlock_write_inlined(trans, path, b); -} - -/* lock */ - -/* - * @trans wants to lock @b with type @type - */ -struct trans_waiting_for_lock { - struct btree_trans *trans; - struct btree_bkey_cached_common *node_want; - enum six_lock_type lock_want; - - /* for iterating over held locks :*/ - u8 path_idx; - u8 level; - u64 lock_start_time; -}; - -struct lock_graph { - struct trans_waiting_for_lock g[8]; - unsigned nr; -}; - -static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) -{ - struct trans_waiting_for_lock *i; - - prt_printf(out, "Found lock cycle (%u entries):\n", g->nr); - - for (i = g->g; i < g->g + g->nr; 
i++) { - struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); - if (!task) - continue; - - bch2_btree_trans_to_text(out, i->trans); - bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT); - } -} - -static noinline void print_chain(struct printbuf *out, struct lock_graph *g) -{ - struct trans_waiting_for_lock *i; - - for (i = g->g; i != g->g + g->nr; i++) { - struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); - if (i != g->g) - prt_str(out, "<- "); - prt_printf(out, "%u ", task ? task->pid : 0); - } - prt_newline(out); -} - -static void lock_graph_up(struct lock_graph *g) -{ - closure_put(&g->g[--g->nr].trans->ref); -} - -static noinline void lock_graph_pop_all(struct lock_graph *g) -{ - while (g->nr) - lock_graph_up(g); -} - -static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i) -{ - while (g->g + g->nr > i) - lock_graph_up(g); -} - -static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) -{ - g->g[g->nr++] = (struct trans_waiting_for_lock) { - .trans = trans, - .node_want = trans->locking, - .lock_want = trans->locking_wait.lock_want, - }; -} - -static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) -{ - closure_get(&trans->ref); - __lock_graph_down(g, trans); -} - -static bool lock_graph_remove_non_waiters(struct lock_graph *g, - struct trans_waiting_for_lock *from) -{ - struct trans_waiting_for_lock *i; - - if (from->trans->locking != from->node_want) { - lock_graph_pop_from(g, from); - return true; - } - - for (i = from + 1; i < g->g + g->nr; i++) - if (i->trans->locking != i->node_want || - i->trans->locking_wait.start_time != i[-1].lock_start_time) { - lock_graph_pop_from(g, i); - return true; - } - - return false; -} - -static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - - count_event(c, trans_restart_would_deadlock); - - if (trace_trans_restart_would_deadlock_enabled()) { - struct printbuf buf = PRINTBUF; - - buf.atomic++; - print_cycle(&buf, g); - - trace_trans_restart_would_deadlock(trans, buf.buf); - printbuf_exit(&buf); - } -} - -static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) -{ - if (i == g->g) { - trace_would_deadlock(g, i->trans); - return btree_trans_restart_foreign_task(i->trans, - BCH_ERR_transaction_restart_would_deadlock, - _THIS_IP_); - } else { - i->trans->lock_must_abort = true; - wake_up_process(i->trans->locking_wait.task); - return 0; - } -} - -static int btree_trans_abort_preference(struct btree_trans *trans) -{ - if (trans->lock_may_not_fail) - return 0; - if (trans->locking_wait.lock_want == SIX_LOCK_write) - return 1; - if (!trans->in_traverse_all) - return 2; - return 3; -} - -static noinline __noreturn void break_cycle_fail(struct lock_graph *g) -{ - struct printbuf buf = PRINTBUF; - buf.atomic++; - - prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); - - for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) { - struct btree_trans *trans = i->trans; - - bch2_btree_trans_to_text(&buf, trans); - - prt_printf(&buf, "backtrace:\n"); - printbuf_indent_add(&buf, 2); - bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - } - - bch2_print_str(g->g->trans->c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - BUG(); -} - -static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, - struct trans_waiting_for_lock *from) -{ - 
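The lock_graph above is a fixed-depth stack of transactions, each waiting on the next, and break_cycle picks the entry that is cheapest to restart (per btree_trans_abort_preference). A toy model of that cycle hunt, deliberately simplified to one holder per lock; real six locks can have several holders, which is why the kernel walks a graph rather than a chain:

#include <stddef.h>
#include <stdio.h>

#define GRAPH_MAX 8	/* same depth limit as struct lock_graph above */

struct txn {
	const char *name;
	struct txn *blocked_on;	/* holder of the lock we want, or NULL */
	int abort_pref;		/* higher = cheaper to restart */
};

static struct txn *find_victim(struct txn *start)
{
	struct txn *stack[GRAPH_MAX];
	unsigned nr = 0;

	for (struct txn *t = start; t && nr < GRAPH_MAX; t = t->blocked_on) {
		for (unsigned i = 0; i < nr; i++)
			if (stack[i] == t) {
				/* cycle found: abort the best victim in it */
				struct txn *victim = stack[i];

				for (unsigned j = i; j < nr; j++)
					if (stack[j]->abort_pref > victim->abort_pref)
						victim = stack[j];
				return victim;
			}
		stack[nr++] = t;
	}
	return NULL;	/* no cycle, or recursion limit hit */
}

int main(void)
{
	struct txn a = { "A", NULL, 2 }, b = { "B", NULL, 3 }, c = { "C", NULL, 1 };

	a.blocked_on = &b; b.blocked_on = &c; c.blocked_on = &a;

	struct txn *v = find_victim(&a);
	printf("restart %s\n", v ? v->name : "(none)");	/* prints "restart B" */
	return 0;
}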
struct trans_waiting_for_lock *i, *abort = NULL; - unsigned best = 0, pref; - int ret; - - if (lock_graph_remove_non_waiters(g, from)) - return 0; - - /* Only checking, for debugfs: */ - if (cycle) { - print_cycle(cycle, g); - ret = -1; - goto out; - } - - for (i = from; i < g->g + g->nr; i++) { - pref = btree_trans_abort_preference(i->trans); - if (pref > best) { - abort = i; - best = pref; - } - } - - if (unlikely(!best)) - break_cycle_fail(g); - - ret = abort_lock(g, abort); -out: - if (ret) - lock_graph_pop_all(g); - else - lock_graph_pop_from(g, abort); - return ret; -} - -static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, - struct printbuf *cycle) -{ - struct btree_trans *orig_trans = g->g->trans; - - for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) - if (i->trans == trans) { - closure_put(&trans->ref); - return break_cycle(g, cycle, i); - } - - if (unlikely(g->nr == ARRAY_SIZE(g->g))) { - closure_put(&trans->ref); - - if (orig_trans->lock_may_not_fail) - return 0; - - lock_graph_pop_all(g); - - if (cycle) - return 0; - - trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); - return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); - } - - __lock_graph_down(g, trans); - return 0; -} - -static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) -{ - return t1 + t2 > 1; -} - -int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) -{ - struct lock_graph g; - struct trans_waiting_for_lock *top; - struct btree_bkey_cached_common *b; - btree_path_idx_t path_idx; - int ret = 0; - - g.nr = 0; - - if (trans->lock_must_abort && !trans->lock_may_not_fail) { - if (cycle) - return -1; - - trace_would_deadlock(&g, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); - } - - lock_graph_down(&g, trans); - - /* trans->paths is rcu protected vs. freeing */ - guard(rcu)(); - if (cycle) - cycle->atomic++; -next: - if (!g.nr) - goto out; - - top = &g.g[g.nr - 1]; - - struct btree_path *paths = rcu_dereference(top->trans->paths); - if (!paths) - goto up; - - unsigned long *paths_allocated = trans_paths_allocated(paths); - - trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), - path_idx, top->path_idx) { - struct btree_path *path = paths + path_idx; - if (!path->nodes_locked) - continue; - - if (path_idx != top->path_idx) { - top->path_idx = path_idx; - top->level = 0; - top->lock_start_time = 0; - } - - for (; - top->level < BTREE_MAX_DEPTH; - top->level++, top->lock_start_time = 0) { - int lock_held = btree_node_locked_type(path, top->level); - - if (lock_held == BTREE_NODE_UNLOCKED) - continue; - - b = &READ_ONCE(path->l[top->level].b)->c; - - if (IS_ERR_OR_NULL(b)) { - /* - * If we get here, it means we raced with the - * other thread updating its btree_path - * structures - which means it can't be blocked - * waiting on a lock: - */ - if (!lock_graph_remove_non_waiters(&g, g.g)) { - /* - * If lock_graph_remove_non_waiters() - * didn't do anything, it must be - * because we're being called by debugfs - * checking for lock cycles, which - * invokes us on btree_transactions that - * aren't actually waiting on anything. 
- * Just bail out: - */ - lock_graph_pop_all(&g); - } - - goto next; - } - - if (list_empty_careful(&b->lock.wait_list)) - continue; - - raw_spin_lock(&b->lock.wait_lock); - list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) { - BUG_ON(b != trans->locking); - - if (top->lock_start_time && - time_after_eq64(top->lock_start_time, trans->locking_wait.start_time)) - continue; - - top->lock_start_time = trans->locking_wait.start_time; - - /* Don't check for self deadlock: */ - if (trans == top->trans || - !lock_type_conflicts(lock_held, trans->locking_wait.lock_want)) - continue; - - closure_get(&trans->ref); - raw_spin_unlock(&b->lock.wait_lock); - - ret = lock_graph_descend(&g, trans, cycle); - if (ret) - goto out; - goto next; - - } - raw_spin_unlock(&b->lock.wait_lock); - } - } -up: - if (g.nr > 1 && cycle) - print_chain(cycle, &g); - lock_graph_up(&g); - goto next; -out: - if (cycle) - --cycle->atomic; - return ret; -} - -int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) -{ - struct btree_trans *trans = p; - - return bch2_check_for_deadlock(trans, NULL); -} - -int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, - struct btree_bkey_cached_common *b, - bool lock_may_not_fail) -{ - int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read]; - int ret; - - /* - * Must drop our read locks before calling six_lock_write() - - * six_unlock() won't do wakeups until the reader count - * goes to 0, and it's safe because we have the node intent - * locked: - */ - six_lock_readers_add(&b->lock, -readers); - ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, - lock_may_not_fail, _RET_IP_); - six_lock_readers_add(&b->lock, readers); - - if (ret) - mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED); - - return ret; -} - -void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b) -{ - int ret = __btree_node_lock_write(trans, path, b, true); - BUG_ON(ret); -} - -/* relock */ - -static int btree_path_get_locks(struct btree_trans *trans, - struct btree_path *path, - bool upgrade, - struct get_locks_fail *f, - int restart_err) -{ - unsigned l = path->level; - - do { - if (!btree_path_node(path, l)) - break; - - if (!(upgrade - ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) - goto err; - - l++; - } while (l < path->locks_want); - - if (path->uptodate == BTREE_ITER_NEED_RELOCK) - path->uptodate = BTREE_ITER_UPTODATE; - - return path->uptodate < BTREE_ITER_NEED_RELOCK ? 0 : -1; -err: - if (f) { - f->l = l; - f->b = path->l[l].b; - } - - /* - * Do transaction restart before unlocking, so we don't pop - * should_be_locked asserts - */ - if (restart_err) { - btree_trans_restart(trans, restart_err); - } else if (path->should_be_locked && !trans->restarted) { - if (upgrade) - path->locks_want = l; - return -1; - } - - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - - /* - * When we fail to get a lock, we have to ensure that any child nodes - * can't be relocked so bch2_btree_path_traverse has to walk back up to - * the node that we failed to relock: - */ - do { - path->l[l].b = upgrade - ? 
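When relocking fails, the loop that follows stamps every level from the failure point down with an ERR_PTR sentinel, so the next traversal walks back up instead of trusting stale node pointers. The kernel's ERR_PTR idiom encodes a small negative errno directly in the pointer value; a freestanding rendition, with a made-up errno standing in for BCH_ERR_no_btree_node_relock/upgrade:

#include <stdint.h>
#include <stdio.h>

/* Userspace rendition of the kernel's ERR_PTR()/PTR_ERR()/IS_ERR(). */
#define MAX_ERRNO 4095

static inline void *err_ptr(long err)      { return (void *)(uintptr_t)err; }
static inline long  ptr_err(const void *p) { return (long)(intptr_t)p; }
static inline int   is_err(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

struct level { void *b; };	/* stand-in for path->l[level].b */

int main(void)
{
	struct level l[4] = { { &l[0] }, { &l[1] }, { &l[2] }, { &l[3] } };
	int fail_level = 2;

	/* poison the failure level and everything below it, as above */
	for (int i = fail_level; i >= 0; i--)
		l[i].b = err_ptr(-71);	/* illustrative errno, not a BCH_ERR_* code */

	for (int i = 0; i < 4; i++)
		printf("level %d: %s (%ld)\n", i,
		       is_err(l[i].b) ? "poisoned" : "valid",
		       is_err(l[i].b) ? ptr_err(l[i].b) : 0L);
	return 0;
}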
ERR_PTR(-BCH_ERR_no_btree_node_upgrade) - : ERR_PTR(-BCH_ERR_no_btree_node_relock); - } while (l--); - - return -restart_err ?: -1; -} - -bool __bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level, - bool trace) -{ - struct btree *b = btree_path_node(path, level); - int want = __btree_lock_want(path, level); - - if (race_fault()) - goto fail; - - if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || - (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, &b->c, level, want))) { - mark_btree_node_locked(trans, path, level, want); - return true; - } -fail: - if (trace && !trans->notrace_relock_fail) - trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); - return false; -} - -/* upgrade */ - -bool bch2_btree_node_upgrade(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree *b = path->l[level].b; - - if (!is_btree_node(path, level)) - return false; - - switch (btree_lock_want(path, level)) { - case BTREE_NODE_UNLOCKED: - BUG_ON(btree_node_locked(path, level)); - return true; - case BTREE_NODE_READ_LOCKED: - BUG_ON(btree_node_intent_locked(path, level)); - return bch2_btree_node_relock(trans, path, level); - case BTREE_NODE_INTENT_LOCKED: - break; - case BTREE_NODE_WRITE_LOCKED: - BUG(); - } - - if (btree_node_intent_locked(path, level)) - return true; - - if (race_fault()) - return false; - - if (btree_node_locked(path, level) - ? six_lock_tryupgrade(&b->c.lock) - : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) - goto success; - - if (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { - btree_node_unlock(trans, path, level); - goto success; - } - - trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); - return false; -success: - mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); - return true; -} - -/* Btree path locking: */ - -/* - * Only for btree_cache.c - only relocks intent locks - */ -int bch2_btree_path_relock_intent(struct btree_trans *trans, - struct btree_path *path) -{ - unsigned l; - - for (l = path->level; - l < path->locks_want && btree_path_node(path, l); - l++) { - if (!bch2_btree_node_relock(trans, path, l)) { - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); - } - } - - return 0; -} - -__flatten -bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path) -{ - bool ret = !btree_path_get_locks(trans, path, false, NULL, 0); - bch2_trans_verify_locks(trans); - return ret; -} - -int __bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - if (!bch2_btree_path_relock_norestart(trans, path)) { - trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); - } - - return 0; -} - -bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - path->locks_want = new_locks_want; - - /* - * If we need it locked, we can't touch it. 
Otherwise, we can return - * success - bch2_path_get() will use this path, and it'll just be - * retraversed: - */ - bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) || - !path->should_be_locked; - - bch2_btree_path_verify_locks(trans, path); - return ret; -} - -int __bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - unsigned old_locks = path->nodes_locked; - unsigned old_locks_want = path->locks_want; - - path->locks_want = max_t(unsigned, path->locks_want, new_locks_want); - - struct get_locks_fail f = {}; - int ret = btree_path_get_locks(trans, path, true, &f, - BCH_ERR_transaction_restart_upgrade); - if (!ret) - goto out; - - /* - * XXX: this is ugly - we'd prefer to not be mucking with other - * iterators in the btree_trans here. - * - * On failure to upgrade the iterator, setting iter->locks_want and - * calling get_locks() is sufficient to make bch2_btree_path_traverse() - * get the locks we want on transaction restart. - * - * But if this iterator was a clone, on transaction restart what we did - * to this iterator isn't going to be preserved. - * - * Possibly we could add an iterator field for the parent iterator when - * an iterator is a copy - for now, we'll just upgrade any other - * iterators with the same btree id. - * - * The code below used to be needed to ensure ancestor nodes get locked - * before interior nodes - now that's handled by - * bch2_btree_path_traverse_all(). - */ - if (!path->cached && !trans->in_traverse_all) { - struct btree_path *linked; - unsigned i; - - trans_for_each_path(trans, linked, i) - if (linked != path && - linked->cached == path->cached && - linked->btree_id == path->btree_id && - linked->locks_want < new_locks_want) { - linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true, NULL, 0); - } - } - - count_event(trans->c, trans_restart_upgrade); - if (trace_trans_restart_upgrade_enabled()) { - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_); - prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id)); - bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, "locks want %u -> %u level %u\n", - old_locks_want, new_locks_want, f.l); - prt_printf(&buf, "nodes_locked %x -> %x\n", - old_locks, path->nodes_locked); - prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) : - !f.b ? "(null)" : "(node)"); - prt_printf(&buf, "path seq %u node seq %u\n", - IS_ERR_OR_NULL(f.b) ? 
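The clone workaround described in the comment above: when an upgrade fails, locks_want is also raised on every other path over the same btree, so a restarted traversal takes intent locks high enough on the first pass even if the failing iterator was a clone. A small sketch with illustrative fields:

#include <stdio.h>

struct path { int btree_id; unsigned locks_want; };

/* raise locks_want on sibling paths over the same btree, as above */
static void upgrade_siblings(struct path *paths, unsigned nr,
			     const struct path *failed, unsigned new_locks_want)
{
	for (unsigned i = 0; i < nr; i++)
		if (&paths[i] != failed &&
		    paths[i].btree_id == failed->btree_id &&
		    paths[i].locks_want < new_locks_want)
			paths[i].locks_want = new_locks_want;
}

int main(void)
{
	struct path paths[3] = { { 0, 1 }, { 0, 1 }, { 1, 1 } };

	paths[0].locks_want = 3;		/* the path whose upgrade failed */
	upgrade_siblings(paths, 3, &paths[0], 3);

	for (unsigned i = 0; i < 3; i++)
		printf("path %u: btree=%d locks_want=%u\n",
		       i, paths[i].btree_id, paths[i].locks_want);
	return 0;
}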
0 : f.b->c.lock.seq, - path->l[f.l].lock_seq); - - trace_trans_restart_upgrade(trans->c, buf.buf); - printbuf_exit(&buf); - } -out: - bch2_trans_verify_locks(trans); - return ret; -} - -void __bch2_btree_path_downgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - unsigned l, old_locks_want = path->locks_want; - - if (trans->restarted) - return; - - EBUG_ON(path->locks_want < new_locks_want); - - path->locks_want = new_locks_want; - - while (path->nodes_locked && - (l = btree_path_highest_level_locked(path)) >= path->locks_want) { - if (l > path->level) { - btree_node_unlock(trans, path, l); - } else { - if (btree_node_intent_locked(path, l)) { - six_lock_downgrade(&path->l[l].b->c.lock); - mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED); - } - break; - } - } - - bch2_btree_path_verify_locks(trans, path); - - trace_path_downgrade(trans, _RET_IP_, path, old_locks_want); -} - -/* Btree transaction locking: */ - -void bch2_trans_downgrade(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - if (trans->restarted) - return; - - trans_for_each_path(trans, path, i) - if (path->ref) - bch2_btree_path_downgrade(trans, path); -} - -static inline void __bch2_trans_unlock(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - __bch2_btree_path_unlock(trans, path); -} - -static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, - struct get_locks_fail *f, bool trace, ulong ip) -{ - if (!trace) - goto out; - - if (trace_trans_restart_relock_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, " %s l=%u seq=%u node seq=", - bch2_btree_id_str(path->btree_id), - f->l, path->l[f->l].lock_seq); - if (IS_ERR_OR_NULL(f->b)) { - prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); - } else { - prt_printf(&buf, "%u", f->b->c.lock.seq); - - struct six_lock_count c = - bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l); - prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - - c = six_lock_counts(&f->b->c.lock); - prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - } - - trace_trans_restart_relock(trans, ip, buf.buf); - printbuf_exit(&buf); - } - - count_event(trans->c, trans_restart_relock); -out: - __bch2_trans_unlock(trans); - bch2_trans_verify_locks(trans); -} - -static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip) -{ - bch2_trans_verify_locks(trans); - - if (unlikely(trans->restarted)) - return -((int) trans->restarted); - if (unlikely(trans->locked)) - goto out; - - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) { - struct get_locks_fail f; - int ret; - - if (path->should_be_locked && - (ret = btree_path_get_locks(trans, path, false, &f, - BCH_ERR_transaction_restart_relock))) { - bch2_trans_relock_fail(trans, path, &f, trace, ip); - return ret; - } - } - - trans_set_locked(trans, true); -out: - bch2_trans_verify_locks(trans); - return 0; -} - -int bch2_trans_relock(struct btree_trans *trans) -{ - return __bch2_trans_relock(trans, true, _RET_IP_); -} - -int bch2_trans_relock_notrace(struct btree_trans *trans) -{ - return __bch2_trans_relock(trans, false, _RET_IP_); -} - -void bch2_trans_unlock(struct btree_trans *trans) -{ - trans_set_unlocked(trans); - - __bch2_trans_unlock(trans); -} - -void bch2_trans_unlock_long(struct btree_trans *trans) -{ - bch2_trans_unlock(trans); - 
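The relock loop above and __bch2_trans_mutex_lock just below follow the same discipline: never block while holding btree node locks. The caller unlocks, does the blocking operation, then tries to retake every lock; if retaking fails, the side effect is undone and the whole transaction restarts. A minimal sketch of that shape, all names invented:

#include <stdbool.h>
#include <stdio.h>

struct trans { bool locked; int restarts; };

static void trans_unlock(struct trans *t) { t->locked = false; }

static bool trans_relock(struct trans *t)
{
	static int fail_once = 1;	/* fail the first relock, for the demo */

	if (fail_once-- > 0) {
		t->restarts++;
		return false;
	}
	t->locked = true;
	return true;
}

static int trans_mutex_lock(struct trans *t, int *mutex)
{
	trans_unlock(t);
	*mutex = 1;			/* the blocking acquire */
	if (!trans_relock(t)) {
		*mutex = 0;		/* undo before restarting */
		return -1;		/* "transaction restart" */
	}
	return 0;
}

int main(void)
{
	struct trans t = { true, 0 };
	int mutex = 0;

	while (trans_mutex_lock(&t, &mutex))
		;	/* caller restarts the transaction and retries */

	printf("mutex=%d locked=%d restarts=%d\n", mutex, t.locked, t.restarts);
	return 0;
}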
bch2_trans_srcu_unlock(trans); -} - -void bch2_trans_unlock_write(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) - if (btree_node_write_locked(path, l)) - bch2_btree_node_unlock_write(trans, path, path->l[l].b); -} - -int __bch2_trans_mutex_lock(struct btree_trans *trans, - struct mutex *lock) -{ - int ret = drop_locks_do(trans, (mutex_lock(lock), 0)); - - if (ret) - mutex_unlock(lock); - return ret; -} - -/* Debug */ - -void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path *path) -{ - if (!path->nodes_locked && btree_path_node(path, path->level)) { - /* - * A path may be uptodate and yet have nothing locked if and only if - * there is no node at path->level, which generally means we were - * iterating over all nodes and got to the end of the btree - */ - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE); - BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); - } - - if (!path->nodes_locked) - return; - - for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { - int want = btree_lock_want(path, l); - int have = btree_node_locked_type_nowrite(path, l); - - BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); - - BUG_ON(is_btree_node(path, l) && want != have); - - BUG_ON(btree_node_locked(path, l) && - path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock)); - } -} - -static bool bch2_trans_locked(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - if (path->nodes_locked) - return true; - return false; -} - -void __bch2_trans_verify_locks(struct btree_trans *trans) -{ - if (!trans->locked) { - BUG_ON(bch2_trans_locked(trans)); - return; - } - - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - __bch2_btree_path_verify_locks(trans, path); -} diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h deleted file mode 100644 index f2173a3316f4..000000000000 --- a/fs/bcachefs/btree_locking.h +++ /dev/null @@ -1,466 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_LOCKING_H -#define _BCACHEFS_BTREE_LOCKING_H - -/* - * Only for internal btree use: - * - * The btree iterator tracks what locks it wants to take, and what locks it - * currently has - here we have wrappers for locking/unlocking btree nodes and - * updating the iterator state - */ - -#include "btree_iter.h" -#include "six.h" - -void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp); - -void bch2_trans_unlock_write(struct btree_trans *); - -static inline bool is_btree_node(struct btree_path *path, unsigned l) -{ - return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); -} - -static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans) -{ - return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats) - ? 
&trans->c->btree_transaction_stats[trans->fn_idx] - : NULL; -} - -/* matches six lock types */ -enum btree_node_locked_type { - BTREE_NODE_UNLOCKED = -1, - BTREE_NODE_READ_LOCKED = SIX_LOCK_read, - BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, - BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write, -}; - -static inline int btree_node_locked_type(struct btree_path *path, - unsigned level) -{ - return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); -} - -static inline int btree_node_locked_type_nowrite(struct btree_path *path, - unsigned level) -{ - int have = btree_node_locked_type(path, level); - return have == BTREE_NODE_WRITE_LOCKED - ? BTREE_NODE_INTENT_LOCKED - : have; -} - -static inline bool btree_node_write_locked(struct btree_path *path, unsigned l) -{ - return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED; -} - -static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l) -{ - return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED; -} - -static inline bool btree_node_read_locked(struct btree_path *path, unsigned l) -{ - return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED; -} - -static inline bool btree_node_locked(struct btree_path *path, unsigned level) -{ - return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED; -} - -static inline void mark_btree_node_locked_noreset(struct btree_path *path, - unsigned level, - enum btree_node_locked_type type) -{ - /* relying on this to avoid a branch */ - BUILD_BUG_ON(SIX_LOCK_read != 0); - BUILD_BUG_ON(SIX_LOCK_intent != 1); - - path->nodes_locked &= ~(3U << (level << 1)); - path->nodes_locked |= (type + 1) << (level << 1); -} - -static inline void mark_btree_node_locked(struct btree_trans *trans, - struct btree_path *path, - unsigned level, - enum btree_node_locked_type type) -{ - mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type); -#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - path->l[level].lock_taken_time = local_clock(); -#endif -} - -static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) -{ - return level < path->locks_want - ? 
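nodes_locked above packs one of four per-level states into two bits, biased by +1 so that zero means unlocked; that is why lock types are extracted with a shift of level << 1, and why the lowest and highest locked levels (a little further down) fall straight out of __ffs/__fls on the word. A standalone model using the GCC builtins:

#include <stdio.h>

/* Four per-level states, mirroring six lock types plus "unlocked". */
enum locked_type { UNLOCKED = -1, READ = 0, INTENT = 1, WRITE = 2 };

static unsigned nodes_locked;	/* two bits per level, +1 biased */

static void mark_locked(unsigned level, enum locked_type t)
{
	nodes_locked &= ~(3U << (level << 1));
	nodes_locked |= (unsigned)(t + 1) << (level << 1);
}

static enum locked_type locked_type(unsigned level)
{
	return (enum locked_type)(((nodes_locked >> (level << 1)) & 3) - 1);
}

/* valid only while nodes_locked != 0, as in the kernel callers */
static int lowest_locked(void)  { return __builtin_ctz(nodes_locked) >> 1; }
static int highest_locked(void) { return (31 - __builtin_clz(nodes_locked)) >> 1; }

int main(void)
{
	mark_locked(0, READ);
	mark_locked(1, INTENT);
	mark_locked(3, WRITE);

	for (unsigned l = 0; l < 4; l++)
		printf("level %u -> %d\n", l, locked_type(l));
	printf("lowest=%d highest=%d\n", lowest_locked(), highest_locked());

	mark_locked(3, UNLOCKED);
	printf("after unlock: highest=%d\n", highest_locked());
	return 0;
}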
SIX_LOCK_intent - : SIX_LOCK_read; -} - -static inline enum btree_node_locked_type -btree_lock_want(struct btree_path *path, int level) -{ - if (level < path->level) - return BTREE_NODE_UNLOCKED; - if (level < path->locks_want) - return BTREE_NODE_INTENT_LOCKED; - if (level == path->level) - return BTREE_NODE_READ_LOCKED; - return BTREE_NODE_UNLOCKED; -} - -static void btree_trans_lock_hold_time_update(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ -#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times, - path->l[level].lock_taken_time, - local_clock()); -#endif -} - -/* unlock: */ - -void bch2_btree_node_unlock_write(struct btree_trans *, - struct btree_path *, struct btree *); - -static inline void btree_node_unlock(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - int lock_type = btree_node_locked_type(path, level); - - EBUG_ON(level >= BTREE_MAX_DEPTH); - - if (lock_type != BTREE_NODE_UNLOCKED) { - if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) { - bch2_btree_node_unlock_write(trans, path, path->l[level].b); - lock_type = BTREE_NODE_INTENT_LOCKED; - } - six_unlock_type(&path->l[level].b->c.lock, lock_type); - btree_trans_lock_hold_time_update(trans, path, level); - mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); - } -} - -static inline int btree_path_lowest_level_locked(struct btree_path *path) -{ - return __ffs(path->nodes_locked) >> 1; -} - -static inline int btree_path_highest_level_locked(struct btree_path *path) -{ - return __fls(path->nodes_locked) >> 1; -} - -static inline void __bch2_btree_path_unlock(struct btree_trans *trans, - struct btree_path *path) -{ - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_RELOCK); - - while (path->nodes_locked) - btree_node_unlock(trans, path, btree_path_lowest_level_locked(path)); -} - -/* - * Updates the saved lock sequence number, so that bch2_btree_node_relock() will - * succeed: - */ -static inline void -__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b) -{ - if (!b->c.lock.write_lock_recurse) { - struct btree_path *linked; - unsigned i; - - trans_for_each_path_with_node(trans, b, linked, i) - linked->l[b->c.level].lock_seq++; - } - - six_unlock_write(&b->c.lock); -} - -static inline void -bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, - struct btree *b) -{ - EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); - EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); - - mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - __bch2_btree_node_unlock_write(trans, b); -} - -int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); - -/* lock: */ - -static inline void trans_set_locked(struct btree_trans *trans, bool try) -{ - if (!trans->locked) { - lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_); - trans->locked = true; - trans->last_unlock_ip = 0; - - trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0; - current->flags |= PF_MEMALLOC_NOFS; - } -} - -static inline void trans_set_unlocked(struct btree_trans *trans) -{ - if (trans->locked) { - lock_release(&trans->dep_map, _THIS_IP_); - trans->locked = false; - trans->last_unlock_ip = _RET_IP_; - - if (!trans->pf_memalloc_nofs) - current->flags &= ~PF_MEMALLOC_NOFS; - } -} - -static inline int __btree_node_lock_nopath(struct btree_trans *trans, - struct 
btree_bkey_cached_common *b, - enum six_lock_type type, - bool lock_may_not_fail, - unsigned long ip) -{ - trans->lock_may_not_fail = lock_may_not_fail; - trans->lock_must_abort = false; - trans->locking = b; - - int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, - bch2_six_check_for_deadlock, trans, ip); - WRITE_ONCE(trans->locking, NULL); - WRITE_ONCE(trans->locking_wait.start_time, 0); - - if (!ret) - trace_btree_path_lock(trans, _THIS_IP_, b); - return ret; -} - -static inline int __must_check -btree_node_lock_nopath(struct btree_trans *trans, - struct btree_bkey_cached_common *b, - enum six_lock_type type, - unsigned long ip) -{ - return __btree_node_lock_nopath(trans, b, type, false, ip); -} - -static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, - struct btree_bkey_cached_common *b, - enum six_lock_type type) -{ - int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_); - - BUG_ON(ret); -} - -/* - * Lock a btree node if we already have it locked on one of our linked - * iterators: - */ -static inline bool btree_node_lock_increment(struct btree_trans *trans, - struct btree_bkey_cached_common *b, - unsigned level, - enum btree_node_locked_type want) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - if (&path->l[level].b->c == b && - btree_node_locked_type(path, level) >= want) { - six_lock_increment(&b->lock, (enum six_lock_type) want); - return true; - } - - return false; -} - -static inline int btree_node_lock(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b, - unsigned level, - enum six_lock_type type, - unsigned long ip) -{ - int ret = 0; - - EBUG_ON(level >= BTREE_MAX_DEPTH); - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - if (likely(six_trylock_type(&b->lock, type)) || - btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || - !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) { -#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - path->l[b->level].lock_taken_time = local_clock(); -#endif - } - - return ret; -} - -int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *, - struct btree_bkey_cached_common *b, bool); - -static inline int __btree_node_lock_write(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b, - bool lock_may_not_fail) -{ - EBUG_ON(&path->l[b->level].b->c != b); - EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock)); - EBUG_ON(!btree_node_intent_locked(path, b->level)); - - /* - * six locks are unfair, and read locks block while a thread wants a - * write lock: thus, we need to tell the cycle detector we have a write - * lock _before_ taking the lock: - */ - mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED); - - return likely(six_trylock_write(&b->lock)) - ? 
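btree_node_lock_increment above covers the case where another path in the same transaction already holds the node at least as strongly as we want it: rather than re-acquiring the lock (and possibly blocking on ourselves), the existing hold's count is bumped. A toy version with invented types:

#include <stdbool.h>
#include <stdio.h>

enum want { WANT_READ = 0, WANT_INTENT = 1 };

struct node_lock { int held[2]; };	/* hold counts per lock type */

struct path { struct node_lock *node; int have; /* -1 = unlocked */ };

static bool lock_increment(struct path *paths, unsigned nr,
			   struct node_lock *node, enum want want)
{
	for (unsigned i = 0; i < nr; i++)
		if (paths[i].node == node && paths[i].have >= (int)want) {
			node->held[want]++;	/* share the existing hold */
			return true;
		}
	return false;
}

int main(void)
{
	struct node_lock n = { { 0, 1 } };	/* intent hold already counted */
	struct path paths[2] = {
		{ &n, WANT_INTENT },	/* path 0 already holds intent */
		{ &n, -1 },		/* path 1 wants a read lock */
	};

	if (lock_increment(paths, 2, &n, WANT_READ)) {
		paths[1].have = WANT_READ;
		printf("shared: read=%d intent=%d\n", n.held[0], n.held[1]);
	}
	return 0;
}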
0 - : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail); -} - -static inline int __must_check -bch2_btree_node_lock_write(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b) -{ - return __btree_node_lock_write(trans, path, b, false); -} - -void bch2_btree_node_lock_write_nofail(struct btree_trans *, - struct btree_path *, - struct btree_bkey_cached_common *); - -/* relock: */ - -bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *); -int __bch2_btree_path_relock(struct btree_trans *, - struct btree_path *, unsigned long); - -static inline int bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - return btree_node_locked(path, path->level) - ? 0 - : __bch2_btree_path_relock(trans, path, trace_ip); -} - -bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace); - -static inline bool bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - EBUG_ON(btree_node_locked(path, level) && - !btree_node_write_locked(path, level) && - btree_node_locked_type(path, level) != __btree_lock_want(path, level)); - - return likely(btree_node_locked(path, level)) || - (!IS_ERR_OR_NULL(path->l[level].b) && - __bch2_btree_node_relock(trans, path, level, true)); -} - -static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - EBUG_ON(btree_node_locked(path, level) && - btree_node_locked_type_nowrite(path, level) != - __btree_lock_want(path, level)); - - return likely(btree_node_locked(path, level)) || - (!IS_ERR_OR_NULL(path->l[level].b) && - __bch2_btree_node_relock(trans, path, level, false)); -} - -/* upgrade */ - -bool __bch2_btree_path_upgrade_norestart(struct btree_trans *, struct btree_path *, unsigned); - -static inline bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - return new_locks_want > path->locks_want - ? __bch2_btree_path_upgrade_norestart(trans, path, new_locks_want) - : true; -} - -int __bch2_btree_path_upgrade(struct btree_trans *, - struct btree_path *, unsigned); - -static inline int bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - - return likely(path->locks_want >= new_locks_want && path->nodes_locked) - ? 
0 - : __bch2_btree_path_upgrade(trans, path, new_locks_want); -} - -/* misc: */ - -static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path) -{ - EBUG_ON(!btree_node_locked(path, path->level)); - EBUG_ON(path->uptodate); - - if (!path->should_be_locked) { - path->should_be_locked = true; - trace_btree_path_should_be_locked(trans, path); - } -} - -static inline void __btree_path_set_level_up(struct btree_trans *trans, - struct btree_path *path, - unsigned l) -{ - btree_node_unlock(trans, path, l); - path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up); -} - -static inline void btree_path_set_level_up(struct btree_trans *trans, - struct btree_path *path) -{ - __btree_path_set_level_up(trans, path, path->level++); - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); -} - -/* debug */ - -struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, - struct btree_path *, - struct btree_bkey_cached_common *b, - unsigned); - -int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); - -void __bch2_btree_path_verify_locks(struct btree_trans *, struct btree_path *); -void __bch2_trans_verify_locks(struct btree_trans *); - -static inline void bch2_btree_path_verify_locks(struct btree_trans *trans, - struct btree_path *path) -{ - if (static_branch_unlikely(&bch2_debug_check_btree_locking)) - __bch2_btree_path_verify_locks(trans, path); -} - -static inline void bch2_trans_verify_locks(struct btree_trans *trans) -{ - if (static_branch_unlikely(&bch2_debug_check_btree_locking)) - __bch2_trans_verify_locks(trans); -} - -#endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c deleted file mode 100644 index a3fb07c60e25..000000000000 --- a/fs/bcachefs/btree_node_scan.c +++ /dev/null @@ -1,611 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_journal_iter.h" -#include "btree_node_scan.h" -#include "btree_update_interior.h" -#include "buckets.h" -#include "error.h" -#include "journal_io.h" -#include "recovery_passes.h" - -#include <linux/kthread.h> -#include <linux/min_heap.h> -#include <linux/sched/sysctl.h> -#include <linux/sort.h> - -struct find_btree_nodes_worker { - struct closure *cl; - struct find_btree_nodes *f; - struct bch_dev *ca; -}; - -static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) -{ - bch2_btree_id_level_to_text(out, n->btree_id, n->level); - prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ", - n->seq, n->journal_seq, n->cookie); - bch2_bpos_to_text(out, n->min_key); - prt_str(out, "-"); - bch2_bpos_to_text(out, n->max_key); - - if (n->range_updated) - prt_str(out, " range updated"); - - for (unsigned i = 0; i < n->nr_ptrs; i++) { - prt_char(out, ' '); - bch2_extent_ptr_to_text(out, c, n->ptrs + i); - } -} - -static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) -{ - printbuf_indent_add(out, 2); - darray_for_each(nodes, i) { - found_btree_node_to_text(out, c, i); - prt_newline(out); - } - printbuf_indent_sub(out, 2); -} - -static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) -{ - struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k); - - set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); - bp->k.p = f->max_key; - bp->v.seq = cpu_to_le64(f->cookie); - bp->v.sectors_written = 0; - 
bp->v.flags = 0; - bp->v.sectors_written = cpu_to_le16(f->sectors_written); - bp->v.min_key = f->min_key; - SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated); - memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); -} - -static inline u64 bkey_journal_seq(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode_v3: - return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq); - default: - return 0; - } -} - -static int found_btree_node_cmp_cookie(const void *_l, const void *_r) -{ - const struct found_btree_node *l = _l; - const struct found_btree_node *r = _r; - - return cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->level, r->level) ?: - cmp_int(l->cookie, r->cookie); -} - -/* - * Given two found btree nodes, if their sequence numbers are equal, take the - * one that's readable: - */ -static int found_btree_node_cmp_time(const struct found_btree_node *l, - const struct found_btree_node *r) -{ - return cmp_int(l->seq, r->seq) ?: - cmp_int(l->journal_seq, r->journal_seq); -} - -static int found_btree_node_cmp_pos(const void *_l, const void *_r) -{ - const struct found_btree_node *l = _l; - const struct found_btree_node *r = _r; - - return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(l->level, r->level) ?: - bpos_cmp(l->min_key, r->min_key) ?: - -found_btree_node_cmp_time(l, r); -} - -static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg) -{ - return found_btree_node_cmp_pos(l, r) < 0; -} - -static inline void found_btree_node_swap(void *_l, void *_r, void *arg) -{ - struct found_btree_node *l = _l; - struct found_btree_node *r = _r; - - swap(*l, *r); -} - -static const struct min_heap_callbacks found_btree_node_heap_cbs = { - .less = found_btree_node_cmp_pos_less, - .swp = found_btree_node_swap, -}; - -static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, - struct btree *b, struct bio *bio, u64 offset) -{ - struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); - struct btree_node *bn = b->data; - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); - bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, b->data, c->opts.block_size); - - u64 submit_time = local_clock(); - submit_bio_wait(bio); - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); - - if (bio->bi_status) { - bch_err_dev_ratelimited(ca, - "IO error in try_read_btree_node() at %llu: %s", - offset, bch2_blk_status_to_str(bio->bi_status)); - return; - } - - if (le64_to_cpu(bn->magic) != bset_magic(c)) - return; - - if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { - if (!c->chacha20_key_set) - return; - - struct nonce nonce = btree_nonce(&bn->keys, 0); - unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; - - bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes); - } - - if (btree_id_is_alloc(BTREE_NODE_ID(bn))) - return; - - if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) - return; - - if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX) - return; - - rcu_read_lock(); - struct found_btree_node n = { - .btree_id = BTREE_NODE_ID(bn), - .level = BTREE_NODE_LEVEL(bn), - .seq = BTREE_NODE_SEQ(bn), - .cookie = le64_to_cpu(bn->keys.seq), - .min_key = bn->min_key, - .max_key = bn->max_key, - .nr_ptrs = 1, - .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, - .ptrs[0].offset = offset, - .ptrs[0].dev = ca->dev_idx, - .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)), - }; - rcu_read_unlock(); - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); - 
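The comparators above lean on GNU C's ?: operator to chain cmp_int() results: the first nonzero comparison decides, giving a lexicographic order (btree id, then level descending, then position). The same idiom in a standalone program, assuming gcc/clang extensions:

#include <stdio.h>
#include <stdlib.h>

/* sign comparator, matching the kernel's cmp_int() semantics */
#define cmp_int(l, r) (((l) > (r)) - ((l) < (r)))

struct node { int btree_id, level; long min_key; };

static int node_cmp(const void *_l, const void *_r)
{
	const struct node *l = _l, *r = _r;

	return cmp_int(l->btree_id, r->btree_id) ?:
	       -cmp_int(l->level, r->level) ?:	/* higher levels sort first */
	       cmp_int(l->min_key, r->min_key);
}

int main(void)
{
	struct node nodes[] = {
		{ 1, 0, 40 }, { 0, 1, 10 }, { 0, 0, 30 }, { 0, 1, 5 },
	};
	unsigned nr = sizeof(nodes) / sizeof(nodes[0]);

	qsort(nodes, nr, sizeof(nodes[0]), node_cmp);

	for (unsigned i = 0; i < nr; i++)
		printf("btree=%d level=%d min=%ld\n",
		       nodes[i].btree_id, nodes[i].level, nodes[i].min_key);
	return 0;
}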
bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, b->data, c->opts.btree_node_size); - - submit_time = local_clock(); - submit_bio_wait(bio); - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); - - found_btree_node_to_key(&b->key, &n); - - CLASS(printbuf, buf)(); - if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) { - /* read_done will swap out b->data for another buffer */ - bn = b->data; - /* - * Grab journal_seq here because we want the max journal_seq of - * any bset; read_done sorts down to a single set and picks the - * max journal_seq - */ - n.journal_seq = le64_to_cpu(bn->keys.journal_seq), - n.sectors_written = b->written; - - mutex_lock(&f->lock); - if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { - bch_err(c, "try_read_btree_node() can't handle endian conversion"); - f->ret = -EINVAL; - goto unlock; - } - - if (darray_push(&f->nodes, n)) - f->ret = -ENOMEM; -unlock: - mutex_unlock(&f->lock); - } -} - -static int read_btree_nodes_worker(void *p) -{ - struct find_btree_nodes_worker *w = p; - struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); - struct bch_dev *ca = w->ca; - unsigned long last_print = jiffies; - struct btree *b = NULL; - struct bio *bio = NULL; - - b = __bch2_btree_node_mem_alloc(c); - if (!b) { - bch_err(c, "read_btree_nodes_worker: error allocating buf"); - w->f->ret = -ENOMEM; - goto err; - } - - bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL); - if (!bio) { - bch_err(c, "read_btree_nodes_worker: error allocating bio"); - w->f->ret = -ENOMEM; - goto err; - } - - for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) - for (unsigned bucket_offset = 0; - bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; - bucket_offset += btree_sectors(c)) { - if (time_after(jiffies, last_print + HZ * 30)) { - u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; - u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; - - bch_info(ca, "%s: %2u%% done", __func__, - (unsigned) div64_u64(cur_sector * 100, end_sector)); - last_print = jiffies; - } - - u64 sector = bucket * ca->mi.bucket_size + bucket_offset; - - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && - !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) - continue; - - try_read_btree_node(w->f, ca, b, bio, sector); - } -err: - if (b) - __btree_node_data_free(b); - kfree(b); - bio_put(bio); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); - closure_put(w->cl); - kfree(w); - return 0; -} - -static int read_btree_nodes(struct find_btree_nodes *f) -{ - struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); - struct closure cl; - int ret = 0; - - closure_init_stack(&cl); - - for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) { - if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) - continue; - - struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); - if (!w) { - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); - ret = -ENOMEM; - goto err; - } - - w->cl = &cl; - w->f = f; - w->ca = ca; - - struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); - ret = PTR_ERR_OR_ZERO(t); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); - kfree(w); - bch_err_msg(c, ret, "starting kthread"); - break; - } - - closure_get(&cl); - enumerated_ref_get(&ca->io_ref[READ], 
BCH_DEV_READ_REF_btree_node_scan); - wake_up_process(t); - } -err: - while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2)) - ; - return f->ret ?: ret; -} - -static bool nodes_overlap(const struct found_btree_node *l, - const struct found_btree_node *r) -{ - return (l->btree_id == r->btree_id && - l->level == r->level && - bpos_gt(l->max_key, r->min_key)); -} - -static int handle_overwrites(struct bch_fs *c, - struct found_btree_node *l, - found_btree_nodes *nodes_heap) -{ - struct found_btree_node *r; - - while ((r = min_heap_peek(nodes_heap)) && - nodes_overlap(l, r)) { - int cmp = found_btree_node_cmp_time(l, r); - - if (cmp > 0) { - if (bpos_cmp(l->max_key, r->max_key) >= 0) - min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); - else { - r->range_updated = true; - r->min_key = bpos_successor(l->max_key); - r->range_updated = true; - min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); - } - } else if (cmp < 0) { - BUG_ON(bpos_eq(l->min_key, r->min_key)); - - l->max_key = bpos_predecessor(r->min_key); - l->range_updated = true; - } else if (r->level) { - min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); - } else { - if (bpos_cmp(l->max_key, r->max_key) >= 0) - min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); - else { - r->range_updated = true; - r->min_key = bpos_successor(l->max_key); - r->range_updated = true; - min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); - } - } - - cond_resched(); - } - - return 0; -} - -int bch2_scan_for_btree_nodes(struct bch_fs *c) -{ - struct find_btree_nodes *f = &c->found_btree_nodes; - struct printbuf buf = PRINTBUF; - found_btree_nodes nodes_heap = {}; - size_t dst; - int ret = 0; - - if (f->nodes.nr) - return 0; - - mutex_init(&f->lock); - - ret = read_btree_nodes(f); - if (ret) - return ret; - - if (!f->nodes.nr) { - bch_err(c, "%s: no btree nodes found", __func__); - ret = -EINVAL; - goto err; - } - - if (0 && c->opts.verbose) { - printbuf_reset(&buf); - prt_printf(&buf, "%s: nodes found:\n", __func__); - found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_str(c, KERN_INFO, buf.buf); - } - - sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); - - dst = 0; - darray_for_each(f->nodes, i) { - struct found_btree_node *prev = dst ? 
f->nodes.data + dst - 1 : NULL; - - if (prev && - prev->cookie == i->cookie) { - if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { - bch_err(c, "%s: found too many replicas for btree node", __func__); - ret = -EINVAL; - goto err; - } - prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; - } else { - f->nodes.data[dst++] = *i; - } - } - f->nodes.nr = dst; - - sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); - - if (0 && c->opts.verbose) { - printbuf_reset(&buf); - prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); - found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_str(c, KERN_INFO, buf.buf); - } - - swap(nodes_heap, f->nodes); - - { - /* darray must have same layout as a heap */ - min_heap_char real_heap; - BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr)); - BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size)); - BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr)); - BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size)); - } - - min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL); - - if (nodes_heap.nr) { - ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); - if (ret) - goto err; - - min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); - } - - while (true) { - ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap); - if (ret) - goto err; - - if (!nodes_heap.nr) - break; - - ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); - if (ret) - goto err; - - min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); - } - - for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++) - BUG_ON(nodes_overlap(n, n + 1)); - - if (0 && c->opts.verbose) { - printbuf_reset(&buf); - prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); - found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_str(c, KERN_INFO, buf.buf); - } else { - bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); - } - - eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); -err: - darray_exit(&nodes_heap); - printbuf_exit(&buf); - return ret; -} - -static int found_btree_node_range_start_cmp(const void *_l, const void *_r) -{ - const struct found_btree_node *l = _l; - const struct found_btree_node *r = _r; - - return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(l->level, r->level) ?: - bpos_cmp(l->max_key, r->min_key); -} - -#define for_each_found_btree_node_in_range(_f, _search, _idx) \ - for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ - sizeof((_f)->nodes.data[0]), \ - found_btree_node_range_start_cmp, &search); \ - _idx < (_f)->nodes.nr && \ - (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ - (_f)->nodes.data[_idx].level == _search.level && \ - bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ - _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) - -bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) -{ - struct find_btree_nodes *f = &c->found_btree_nodes; - - struct found_btree_node search = { - .btree_id = b->c.btree_id, - .level = b->c.level, - .min_key = b->data->min_key, - .max_key = b->key.k.p, - }; - - for_each_found_btree_node_in_range(f, search, idx) - if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data)) - return true; - return false; -} - -int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) -{ - int ret = bch2_run_print_explicit_recovery_pass(c, 
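After the sort by cookie, adjacent entries with equal cookies are replicas of one node on different devices, and the loop above folds their pointers into a single record while compacting the array in place. A sketch of that compacting merge; note the kernel errors out when a node has too many replicas, whereas this version just keeps the overflow as a separate record:

#include <stdio.h>

#define MAX_PTRS 4	/* illustrative, stands in for BCH_REPLICAS_MAX */

struct found { unsigned long cookie; unsigned nr_ptrs; int ptrs[MAX_PTRS]; };

static unsigned merge_replicas(struct found *v, unsigned nr)
{
	unsigned dst = 0;

	for (unsigned i = 0; i < nr; i++) {
		struct found *prev = dst ? &v[dst - 1] : NULL;

		if (prev && prev->cookie == v[i].cookie &&
		    prev->nr_ptrs < MAX_PTRS)
			prev->ptrs[prev->nr_ptrs++] = v[i].ptrs[0];
		else
			v[dst++] = v[i];	/* compact in place */
	}
	return dst;
}

int main(void)
{
	struct found v[] = {
		{ 7, 1, { 0 } }, { 7, 1, { 1 } }, { 9, 1, { 2 } },
	};
	unsigned nr = merge_replicas(v, 3);

	for (unsigned i = 0; i < nr; i++)
		printf("cookie=%lu nr_ptrs=%u\n", v[i].cookie, v[i].nr_ptrs);
	return 0;
}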
BCH_RECOVERY_PASS_scan_for_btree_nodes); - if (ret) - return ret; - - struct found_btree_node search = { - .btree_id = btree, - .level = 0, - .min_key = POS_MIN, - .max_key = SPOS_MAX, - }; - - for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) - return true; - return false; -} - -int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, - unsigned level, struct bpos node_min, struct bpos node_max) -{ - if (btree_id_is_alloc(btree)) - return 0; - - struct find_btree_nodes *f = &c->found_btree_nodes; - - int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); - if (ret) - return ret; - - if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "recovery "); - bch2_btree_id_level_to_text(&buf, btree, level); - prt_str(&buf, " "); - bch2_bpos_to_text(&buf, node_min); - prt_str(&buf, " - "); - bch2_bpos_to_text(&buf, node_max); - - bch_info(c, "%s(): %s", __func__, buf.buf); - printbuf_exit(&buf); - } - - struct found_btree_node search = { - .btree_id = btree, - .level = level, - .min_key = node_min, - .max_key = node_max, - }; - - for_each_found_btree_node_in_range(f, search, idx) { - struct found_btree_node n = f->nodes.data[idx]; - - n.range_updated |= bpos_lt(n.min_key, node_min); - n.min_key = bpos_max(n.min_key, node_min); - - n.range_updated |= bpos_gt(n.max_key, node_max); - n.max_key = bpos_min(n.max_key, node_max); - - struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; - - found_btree_node_to_key(&tmp.k, &n); - - if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); - bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); - printbuf_exit(&buf); - } - - BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), - (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = level + 1, - .btree = btree, - })); - - ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); - if (ret) - return ret; - } - - return 0; -} - -void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) -{ - darray_exit(&f->nodes); -} diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h deleted file mode 100644 index 66e6f9ed19d0..000000000000 --- a/fs/bcachefs/btree_node_scan.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_NODE_SCAN_H -#define _BCACHEFS_BTREE_NODE_SCAN_H - -int bch2_scan_for_btree_nodes(struct bch_fs *); -bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *); -int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); -int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos); -void bch2_find_btree_nodes_exit(struct find_btree_nodes *); - -#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */ diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h deleted file mode 100644 index 2811b6857c97..000000000000 --- a/fs/bcachefs/btree_node_scan_types.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H -#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H - -#include "darray.h" - -struct found_btree_node { - bool range_updated:1; - u8 btree_id; - u8 level; - unsigned sectors_written; - u32 seq; - u64 journal_seq; - u64 cookie; - - struct bpos min_key; - struct bpos max_key; - - unsigned nr_ptrs; - struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; -}; - -typedef DARRAY(struct found_btree_node) found_btree_nodes; - -struct find_btree_nodes { - int 
ret; - struct mutex lock; - found_btree_nodes nodes; -}; - -#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c deleted file mode 100644 index 639ef75b3dbd..000000000000 --- a/fs/bcachefs/btree_trans_commit.c +++ /dev/null @@ -1,1121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "btree_gc.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_journal_iter.h" -#include "btree_key_cache.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "disk_accounting.h" -#include "enumerated_ref.h" -#include "errcode.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "replicas.h" -#include "snapshot.h" - -#include <linux/prefetch.h> -#include <linux/string_helpers.h> - -static const char * const trans_commit_flags_strs[] = { -#define x(n, ...) #n, - BCH_TRANS_COMMIT_FLAGS() -#undef x - NULL -}; - -void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags) -{ - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - - prt_printf(out, "watermark=%s", bch2_watermarks[watermark]); - - flags >>= BCH_WATERMARK_BITS; - if (flags) { - prt_char(out, ' '); - bch2_prt_bitflags(out, trans_commit_flags_strs, flags); - } -} - -static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct bch_fs *c = trans->c; - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u); - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); - - if (j_k) - k = bkey_i_to_s_c(j_k); - } - - u = *k.k; - u.needs_whiteout = i->old_k.needs_whiteout; - - BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); - BUG_ON(i->old_v != k.v); -#endif -} - -static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i) -{ - return (trans->paths + i->path)->l + i->level; -} - -static inline bool same_leaf_as_prev(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i != trans->updates && - insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b; -} - -static inline bool same_leaf_as_next(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i + 1 < trans->updates + trans->nr_updates && - insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b; -} - -inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - struct bch_fs *c = trans->c; - - if (unlikely(btree_node_just_written(b)) && - bch2_btree_post_write_cleanup(c, b)) - bch2_trans_node_reinit_iter(trans, b); - - /* - * If the last bset has been written, or if it's gotten too big - start - * a new bset to insert into: - */ - if (want_new_bset(c, b)) - bch2_btree_init_next(trans, b); -} - -static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -{ - while (--i >= trans->updates) { - if (same_leaf_as_prev(trans, i)) - continue; - - bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b); - } - - trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -} - -static inline int bch2_trans_lock_write(struct 
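/*
 * In bch2_trans_lock_write() below, write locks are taken in update order,
 * consecutive updates to the same leaf are deduplicated via
 * same_leaf_as_prev(), and on failure everything locked so far is unwound
 * in reverse before the transaction restarts. The acquire-or-unwind shape
 * in isolation (trylock/unlock/node[] are illustrative, not the real
 * locking API):
 *
 *	for (i = 0; i < n; i++) {
 *		if (i && node[i] == node[i - 1])
 *			continue;			// same leaf, already locked
 *		if (!trylock(node[i])) {
 *			while (i--)			// unwind in reverse
 *				if (!i || node[i] != node[i - 1])
 *					unlock(node[i]);
 *			return -1;			// restart the transaction
 *		}
 *	}
 *	return 0;
 */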
btree_trans *trans) -{ - EBUG_ON(trans->write_locked); - - trans_for_each_update(trans, i) { - if (same_leaf_as_prev(trans, i)) - continue; - - if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c)) - return trans_lock_write_fail(trans, i); - - if (!i->cached) - bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b); - } - - trans->write_locked = true; - return 0; -} - -static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans) -{ - if (likely(trans->write_locked)) { - trans_for_each_update(trans, i) - if (btree_node_locked_type(trans->paths + i->path, i->level) == - BTREE_NODE_WRITE_LOCKED) - bch2_btree_node_unlock_write_inlined(trans, - trans->paths + i->path, insert_l(trans, i)->b); - trans->write_locked = false; - } -} - -/* Inserting into a given leaf node (last stage of insert): */ - -/* Handle overwrites and do insert, for non extents: */ -bool bch2_btree_bset_insert_key(struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter *node_iter, - struct bkey_i *insert) -{ - struct bkey_packed *k; - unsigned clobber_u64s = 0, new_u64s = 0; - - EBUG_ON(btree_node_just_written(b)); - EBUG_ON(bset_written(b, btree_bset_last(b))); - EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); - EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); - EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); - EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); - kmsan_check_memory(insert, bkey_bytes(&insert->k)); - - k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) - k = NULL; - - /* @k is the key being overwritten/deleted, if any: */ - EBUG_ON(k && bkey_deleted(k)); - - /* Deleting, but not found? 
nothing to do: */ - if (bkey_deleted(&insert->k) && !k) - return false; - - if (bkey_deleted(&insert->k)) { - /* Deleting: */ - btree_account_key_drop(b, k); - k->type = KEY_TYPE_deleted; - - if (k->needs_whiteout) - push_whiteout(b, insert->k.p); - k->needs_whiteout = false; - - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; - bch2_bset_delete(b, k, clobber_u64s); - goto fix_iter; - } else { - bch2_btree_path_fix_key_modified(trans, b, k); - } - - return true; - } - - if (k) { - /* Overwriting: */ - btree_account_key_drop(b, k); - k->type = KEY_TYPE_deleted; - - insert->k.needs_whiteout = k->needs_whiteout; - k->needs_whiteout = false; - - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; - goto overwrite; - } else { - bch2_btree_path_fix_key_modified(trans, b, k); - } - } - - k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); -overwrite: - bch2_bset_insert(b, k, insert, clobber_u64s); - new_u64s = k->u64s; -fix_iter: - if (clobber_u64s != new_u64s) - bch2_btree_node_iter_fix(trans, path, b, node_iter, k, - clobber_u64s, new_u64s); - return true; -} - -static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, - unsigned i, u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct btree_write *w = container_of(pin, struct btree_write, journal); - struct btree *b = container_of(w, struct btree, writes[i]); - struct btree_trans *trans = bch2_trans_get(c); - unsigned long old, new; - unsigned idx = w - b->writes; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - - old = READ_ONCE(b->flags); - do { - new = old; - - if (!(old & (1 << BTREE_NODE_dirty)) || - !!(old & (1 << BTREE_NODE_write_idx)) != idx || - w->journal.seq != seq) - break; - - new &= ~BTREE_WRITE_TYPE_MASK; - new |= BTREE_WRITE_journal_reclaim; - new |= 1 << BTREE_NODE_need_write; - } while (!try_cmpxchg(&b->flags, &old, new)); - - btree_node_write_if_need(trans, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - - bch2_trans_put(trans); - return 0; -} - -int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 0, seq); -} - -int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 1, seq); -} - -inline void bch2_btree_add_journal_pin(struct bch_fs *c, - struct btree *b, u64 seq) -{ - struct btree_write *w = btree_current_write(b); - - bch2_journal_pin_add(&c->journal, seq, &w->journal, - btree_node_write_idx(b) == 0 - ? 
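/*
 * The flag update in __btree_node_flush() above is the standard
 * read-modify-cmpxchg loop: recompute the new value from the latest
 * snapshot until the compare-and-swap lands. A self-contained userspace
 * version of the same loop using C11 atomics (names here are illustrative):
 *
 *	#include <stdatomic.h>
 *
 *	static void set_flag_bit(_Atomic unsigned long *flags, unsigned bit)
 *	{
 *		unsigned long old = atomic_load(flags), new;
 *		do {
 *			new = old | (1UL << bit);	// derive from latest value
 *		} while (!atomic_compare_exchange_weak(flags, &old, new));
 *	}
 */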
bch2_btree_node_flush0 - : bch2_btree_node_flush1); -} - -/** - * bch2_btree_insert_key_leaf() - insert a key into a leaf node - * @trans: btree transaction object - * @path: path pointing to @insert's pos - * @insert: key to insert - * @journal_seq: sequence number of journal reservation - */ -inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, - struct btree_path *path, - struct bkey_i *insert, - u64 journal_seq) -{ - struct bch_fs *c = trans->c; - struct btree *b = path_l(path)->b; - struct bset_tree *t = bset_tree_last(b); - struct bset *i = bset(b, t); - int old_u64s = bset_u64s(t); - int old_live_u64s = b->nr.live_u64s; - int live_u64s_added, u64s_added; - - if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, - &path_l(path)->iter, insert))) - return; - - i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); - - bch2_btree_add_journal_pin(c, b, journal_seq); - - if (unlikely(!btree_node_dirty(b))) { - EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - set_btree_node_dirty_acct(c, b); - } - - live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; - u64s_added = (int) bset_u64s(t) - old_u64s; - - if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); - if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); - - if (u64s_added > live_u64s_added && - bch2_maybe_compact_whiteouts(c, b)) - bch2_trans_node_reinit_iter(trans, b); -} - -/* Cached btree updates: */ - -/* Normal update interface: */ - -static inline void btree_insert_entry_checks(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - struct btree_path *path = trans->paths + i->path; - - BUG_ON(!bpos_eq(i->k->k.p, path->pos)); - BUG_ON(i->cached != path->cached); - BUG_ON(i->level != path->level); - BUG_ON(i->btree_id != path->btree_id); - BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id)); - EBUG_ON(!i->level && - btree_type_has_snapshots(i->btree_id) && - !(i->flags & BTREE_UPDATE_internal_snapshot_node) && - test_bit(JOURNAL_replay_done, &trans->c->journal.flags) && - i->k->k.p.snapshot && - bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0); -} - -static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, - unsigned flags) -{ - return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, - trans->journal_u64s, flags, trans); -} - -#define JSET_ENTRY_LOG_U64s 4 - -static noinline void journal_transaction_name(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct jset_entry *entry = - bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_log, 0, 0, - JSET_ENTRY_LOG_U64s); - struct jset_entry_log *l = - container_of(entry, struct jset_entry_log, entry); - - memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64), - trans->fn, strlen(trans->fn), 0); -} - -static inline int btree_key_can_insert(struct btree_trans *trans, - struct btree *b, unsigned u64s) -{ - if (!bch2_btree_node_insert_fits(b, u64s)) - return bch_err_throw(trans->c, btree_insert_btree_node_full); - - return 0; -} - -noinline static int -btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, - struct btree_path *path, unsigned new_u64s) -{ - struct bkey_cached *ck = (void *) path->l[0].b; - struct bkey_i *new_k; - int ret; - - bch2_trans_unlock_updates_write(trans); - bch2_trans_unlock(trans); - - new_k = kmalloc(new_u64s *
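/*
 * This slowpath drops all transaction locks before a blocking GFP_KERNEL
 * allocation, then revalidates by relocking; only the caller's GFP_NOWAIT
 * fast path allocates with locks held. The general shape, with
 * illustrative names:
 *
 *	unlock_all(trans);
 *	p = alloc_may_sleep(bytes);	// locks not held, sleeping is fine
 *	if (!p)
 *		return -ENOMEM;
 *	ret = relock(trans);		// can fail -> transaction restart
 *	if (ret) {
 *		free(p);
 *		return ret;
 *	}
 *	// safe to publish p now that locks are held again
 */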
sizeof(u64), GFP_KERNEL); - if (!new_k) { - struct bch_fs *c = trans->c; - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_id_str(path->btree_id), new_u64s); - return bch_err_throw(c, ENOMEM_btree_key_cache_insert); - } - - ret = bch2_trans_relock(trans) ?: - bch2_trans_lock_write(trans); - if (unlikely(ret)) { - kfree(new_k); - return ret; - } - - memcpy(new_k, ck->k, ck->u64s * sizeof(u64)); - - trans_for_each_update(trans, i) - if (i->old_v == &ck->k->v) - i->old_v = &new_k->v; - - kfree(ck->k); - ck->u64s = new_u64s; - ck->k = new_k; - return 0; -} - -static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, - struct btree_path *path, unsigned u64s) -{ - struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) path->l[0].b; - unsigned new_u64s; - struct bkey_i *new_k; - unsigned watermark = flags & BCH_WATERMARK_MASK; - - EBUG_ON(path->level); - - if (watermark < BCH_WATERMARK_reclaim && - !test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(c)) - return bch_err_throw(c, btree_insert_need_journal_reclaim); - - /* - * bch2_varint_decode can read past the end of the buffer by at most 7 - * bytes (it won't be used): - */ - u64s += 1; - - if (u64s <= ck->u64s) - return 0; - - new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); - if (unlikely(!new_k)) - return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); - - trans_for_each_update(trans, i) - if (i->old_v == &ck->k->v) - i->old_v = &new_k->v; - - ck->u64s = new_u64s; - ck->k = new_k; - return 0; -} - -/* Triggers: */ - -static int run_one_mem_trigger(struct btree_trans *trans, - struct btree_insert_entry *i, - unsigned flags) -{ - verify_update_old_key(trans, i); - - if (unlikely(flags & BTREE_TRIGGER_norun)) - return 0; - - struct bkey_s_c old = { &i->old_k, i->old_v }; - struct bkey_i *new = i->k; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - - if (old_ops->trigger == new_ops->trigger) - return bch2_key_trigger(trans, i->btree_id, i->level, - old, bkey_i_to_s(new), - BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags); - else - return bch2_key_trigger_new(trans, i->btree_id, i->level, - bkey_i_to_s(new), flags) ?: - bch2_key_trigger_old(trans, i->btree_id, i->level, - old, flags); -} - -static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i) -{ - verify_update_old_key(trans, i); - - if ((i->flags & BTREE_TRIGGER_norun) || - !btree_node_type_has_trans_triggers(i->bkey_type)) - return 0; - - /* - * Transactional triggers create new btree_insert_entries, so we can't - * pass them a pointer to a btree_insert_entry, that memory is going to - * move: - */ - struct bkey old_k = i->old_k; - struct bkey_s_c old = { &old_k, i->old_v }; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - unsigned flags = i->flags|BTREE_TRIGGER_transactional; - - if (!i->insert_trigger_run && - !i->overwrite_trigger_run && - old_ops->trigger == new_ops->trigger) { - i->overwrite_trigger_run = true; - i->insert_trigger_run = true; - return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), - BTREE_TRIGGER_insert| - BTREE_TRIGGER_overwrite|flags) ?: 1; - } else if (!i->overwrite_trigger_run) { - i->overwrite_trigger_run = true; - return bch2_key_trigger_old(trans, 
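/*
 * Rounding the cached key's buffer up to the next power of two (above)
 * amortizes reallocations the same way a growable array does. A
 * freestanding sketch of the policy, assuming plain realloc() in place of
 * the kernel's GFP_NOWAIT krealloc():
 *
 *	#include <errno.h>
 *	#include <stdint.h>
 *	#include <stdlib.h>
 *
 *	static int key_buf_ensure(uint64_t **buf, unsigned *cap, unsigned needed)
 *	{
 *		unsigned new_cap = *cap ? *cap : 1;
 *
 *		if (needed <= *cap)
 *			return 0;
 *		while (new_cap < needed)	// roundup_pow_of_two()
 *			new_cap <<= 1;
 *		uint64_t *p = realloc(*buf, new_cap * sizeof(uint64_t));
 *		if (!p)
 *			return -ENOMEM;
 *		*buf = p;
 *		*cap = new_cap;
 *		return 0;
 *	}
 */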
i->btree_id, i->level, old, flags) ?: 1; - } else if (!i->insert_trigger_run) { - i->insert_trigger_run = true; - return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; - } else { - return 0; - } -} - -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -{ - unsigned sort_id_start = 0; - - while (sort_id_start < trans->nr_updates) { - unsigned i, sort_id = trans->updates[sort_id_start].sort_order; - bool trans_trigger_run; - - /* - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being - * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop - * references before they are re-added. - * - * Running triggers will append more updates to the list of - * updates as we're walking it: - */ - do { - trans_trigger_run = false; - - for (i = sort_id_start; - i < trans->nr_updates && trans->updates[i].sort_order <= sort_id; - i++) { - if (trans->updates[i].sort_order < sort_id) { - sort_id_start = i; - continue; - } - - int ret = run_one_trans_trigger(trans, trans->updates + i); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; - } - } while (trans_trigger_run); - - sort_id_start = i; - } - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && - btree_node_type_has_trans_triggers(i->bkey_type) && - (!i->insert_trigger_run || !i->overwrite_trigger_run)); -#endif - return 0; -} - -static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) -{ - trans_for_each_update(trans, i) - if (btree_node_type_has_triggers(i->bkey_type) && - gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) { - int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc); - if (ret) - return ret; - } - - return 0; -} - -static inline int -bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry **stopped_at, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_trans_commit_hook *h; - unsigned u64s = 0; - int ret = 0; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); -#if 0 - /* todo: bring back dynamic fault injection */ - if (race_fault()) { - trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); - } -#endif - /* - * Check if the insert will fit in the leaf node with the write lock - * held, otherwise another thread could write the node changing the - * amount of space available: - */ - - prefetch(&trans->c->journal.flags); - - trans_for_each_update(trans, i) { - /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) - u64s = 0; - - u64s += i->k->k.u64s; - ret = !i->cached - ? 
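/*
 * bch2_trans_commit_run_triggers() above must reach a fixpoint: triggers
 * generate more updates, so each pass may grow the list it is walking, and
 * the loop only stops once a full pass makes no progress.
 * run_one_trans_trigger() returning 1 means "did work". The worklist shape
 * in isolation, with illustrative names:
 *
 *	bool progress;
 *	do {
 *		progress = false;
 *		for (size_t i = 0; i < nr_updates; i++)	// nr_updates can grow
 *			if (run_trigger(&updates[i]) > 0)	// 1 == did work
 *				progress = true;
 *	} while (progress);
 */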
btree_key_can_insert(trans, insert_l(trans, i)->b, u64s) - : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s); - if (ret) { - *stopped_at = i; - return ret; - } - - i->k->k.needs_whiteout = false; - } - - /* - * Don't get journal reservation until after we know insert will - * succeed: - */ - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { - ret = bch2_trans_journal_res_get(trans, - (flags & BCH_WATERMARK_MASK)| - JOURNAL_RES_GET_NONBLOCK); - if (ret) - return ret; - - if (unlikely(trans->journal_transaction_names)) - journal_transaction_name(trans); - } - - /* - * Not allowed to fail after we've gotten our journal reservation - we - * have to use it: - */ - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - !(flags & BCH_TRANS_COMMIT_no_journal_res)) { - if (static_branch_unlikely(&bch2_journal_seq_verify)) - trans_for_each_update(trans, i) - i->k->k.bversion.lo = trans->journal_res.seq; - else if (static_branch_unlikely(&bch2_inject_invalid_keys)) - trans_for_each_update(trans, i) - i->k->k.bversion = MAX_VERSION; - } - - h = trans->hooks; - while (h) { - ret = h->fn(trans, h); - if (ret) - return ret; - h = h->next; - } - - struct bkey_i *accounting; - - percpu_down_read(&c->mark_lock); - for (accounting = btree_trans_subbuf_base(trans, &trans->accounting); - accounting != btree_trans_subbuf_top(trans, &trans->accounting); - accounting = bkey_next(accounting)) { - ret = bch2_accounting_trans_commit_hook(trans, - bkey_i_to_accounting(accounting), flags); - if (ret) - goto revert_fs_usage; - } - percpu_up_read(&c->mark_lock); - - /* XXX: we only want to run this if deltas are nonzero */ - bch2_trans_account_disk_usage_change(trans); - - trans_for_each_update(trans, i) - if (btree_node_type_has_atomic_triggers(i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags); - if (ret) - goto fatal_err; - } - - if (unlikely(c->gc_pos.phase)) { - ret = bch2_trans_commit_run_gc_triggers(trans); - if (ret) - goto fatal_err; - } - - struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit }; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; - - for (struct jset_entry *i = btree_trans_journal_entries_start(trans); - i != btree_trans_journal_entries_top(trans); - i = vstruct_next(i)) { - ret = bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, validate_context); - if (unlikely(ret)) { - bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", - trans->fn); - goto fatal_err; - } - } - - trans_for_each_update(trans, i) { - validate_context.level = i->level; - validate_context.btree = i->btree_id; - - ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context); - if (unlikely(ret)){ - bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", - trans->fn, (void *) i->ip_allocated); - goto fatal_err; - } - btree_insert_entry_checks(trans, i); - } - - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { - struct journal *j = &c->journal; - struct jset_entry *entry; - - trans_for_each_update(trans, i) { - if (i->key_cache_already_flushed) - continue; - - if (i->flags & BTREE_UPDATE_nojournal) - continue; - - verify_update_old_key(trans, i); - - if (trans->journal_transaction_names) { - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_overwrite, - i->btree_id, i->level, - i->old_k.u64s); - bkey_reassemble((struct bkey_i *) entry->start, - (struct 
bkey_s_c) { &i->old_k, i->old_v }); - } - - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - i->btree_id, i->level, - i->k->k.u64s); - bkey_copy((struct bkey_i *) entry->start, i->k); - } - - memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - btree_trans_journal_entries_start(trans), - trans->journal_entries.u64s); - - EBUG_ON(trans->journal_res.u64s < trans->journal_entries.u64s); - - trans->journal_res.offset += trans->journal_entries.u64s; - trans->journal_res.u64s -= trans->journal_entries.u64s; - - memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_write_buffer_keys, - BTREE_ID_accounting, 0, - trans->accounting.u64s)->_data, - btree_trans_subbuf_base(trans, &trans->accounting), - trans->accounting.u64s); - - if (trans->journal_seq) - *trans->journal_seq = trans->journal_res.seq; - } - - trans_for_each_update(trans, i) { - struct btree_path *path = trans->paths + i->path; - - if (!i->cached) - bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq); - else if (!i->key_cache_already_flushed) - bch2_btree_insert_key_cached(trans, flags, i); - else - bch2_btree_key_cache_drop(trans, path); - } - - return 0; -fatal_err: - bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); - percpu_down_read(&c->mark_lock); -revert_fs_usage: - for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); - i != accounting; - i = bkey_next(i)) - bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags); - percpu_up_read(&c->mark_lock); - return ret; -} - -static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) -{ - /* - * Accounting keys aren't deduped in the journal: we have to compare - * each individual update against what's in the btree to see if it has - * been applied yet, and accounting updates also don't overwrite, - * they're deltas that accumulate. - */ - trans_for_each_update(trans, i) - if (i->k->k.type != KEY_TYPE_accounting) - bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); -} - -static int bch2_trans_commit_journal_pin_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) -{ - return 0; -} - -/* - * Get journal reservation, take write locks, and attempt to do btree update(s): - */ -static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry **stopped_at, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - int ret = 0, u64s_delta = 0; - - for (unsigned idx = 0; idx < trans->nr_updates; idx++) { - struct btree_insert_entry *i = trans->updates + idx; - if (i->cached) - continue; - - u64s_delta += !bkey_deleted(&i->k->k) ? 
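/*
 * Above, journal space is reserved once, up front, and entries are then
 * carved out of the reservation sequentially: offset advances and the
 * remaining u64s shrink with each entry emitted. The bookkeeping reduced
 * to essentials (types and names here are illustrative):
 *
 *	#include <stdint.h>
 *
 *	struct res { uint64_t *buf; unsigned offset, u64s; };
 *
 *	static uint64_t *res_carve(struct res *r, unsigned n)
 *	{
 *		uint64_t *p = r->buf + r->offset;	// caller reserved >= n
 *		r->offset += n;
 *		r->u64s -= n;
 *		return p;
 *	}
 */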
i->k->k.u64s : 0; - u64s_delta -= i->old_btree_u64s; - - if (!same_leaf_as_next(trans, i)) { - if (u64s_delta <= 0) { - ret = bch2_foreground_maybe_merge(trans, i->path, - i->level, flags); - if (unlikely(ret)) - return ret; - } - - u64s_delta = 0; - } - } - - ret = bch2_trans_lock_write(trans); - if (unlikely(ret)) - return ret; - - ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); - - if (!ret && unlikely(trans->journal_replay_not_finished)) - bch2_drop_overwrites_from_journal(trans); - - bch2_trans_unlock_updates_write(trans); - - if (!ret && trans->journal_pin) - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, - trans->journal_pin, - bch2_trans_commit_journal_pin_flush); - - /* - * Drop journal reservation after dropping write locks, since dropping - * the journal reservation may kick off a journal write: - */ - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) - bch2_journal_res_put(&c->journal, &trans->journal_res); - - return ret; -} - -static int journal_reclaim_wait_done(struct bch_fs *c) -{ - int ret = bch2_journal_error(&c->journal) ?: - bch2_btree_key_cache_wait_done(c); - - if (!ret) - journal_reclaim_kick(&c->journal); - return ret; -} - -static noinline -int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry *i, - int ret, unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - - if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) { - /* - * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK - * flag - */ - if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < BCH_WATERMARK_reclaim) { - ret = bch_err_throw(c, journal_reclaim_would_deadlock); - goto out; - } - - ret = drop_locks_do(trans, - bch2_trans_journal_res_get(trans, - (flags & BCH_WATERMARK_MASK)| - JOURNAL_RES_GET_CHECK)); - goto out; - } - - switch (ret) { - case -BCH_ERR_btree_insert_btree_node_full: - ret = bch2_btree_split_leaf(trans, i->path, flags); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_and_count(c, trans_restart_btree_node_split, trans, - trace_ip, trans->paths + i->path); - break; - case -BCH_ERR_btree_insert_need_mark_replicas: - ret = drop_locks_do(trans, - bch2_accounting_update_sb(trans)); - break; - case -BCH_ERR_btree_insert_need_journal_reclaim: - bch2_trans_unlock(trans); - - trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); - track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true); - - wait_event_freezable(c->journal.reclaim_wait, - (ret = journal_reclaim_wait_done(c))); - - track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false); - - if (ret < 0) - break; - - ret = bch2_trans_relock(trans); - break; - default: - BUG_ON(ret >= 0); - break; - } -out: - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); - - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && - (flags & BCH_TRANS_COMMIT_no_enospc), c, - "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); - - return ret; -} - -/* - * This is for updates done in the early part of fsck - btree_gc - before we've - * gone RW. we only add the new key to the list of keys for journal replay to - * do. 
- */ -static noinline int -do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - - BUG_ON(current != c->recovery_task); - - trans_for_each_update(trans, i) { - int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); - if (ret) - return ret; - } - - for (struct jset_entry *i = btree_trans_journal_entries_start(trans); - i != btree_trans_journal_entries_top(trans); - i = vstruct_next(i)) { - if (i->type == BCH_JSET_ENTRY_btree_keys || - i->type == BCH_JSET_ENTRY_write_buffer_keys) { - jset_entry_for_each_key(i, k) { - int ret = bch2_journal_key_insert(c, i->btree_id, i->level, k); - if (ret) - return ret; - } - } - - if (i->type == BCH_JSET_ENTRY_btree_root) { - guard(mutex)(&c->btree_root_lock); - - struct btree_root *r = bch2_btree_id_root(c, i->btree_id); - - bkey_copy(&r->key, i->start); - r->level = i->level; - r->alive = true; - } - } - - for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); - i != btree_trans_subbuf_top(trans, &trans->accounting); - i = bkey_next(i)) { - int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i); - if (ret) - return ret; - } - - return 0; -} - -int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) -{ - struct btree_insert_entry *errored_at = NULL; - struct bch_fs *c = trans->c; - unsigned journal_u64s = 0; - int ret = 0; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - ret = trans_maybe_inject_restart(trans, _RET_IP_); - if (unlikely(ret)) - goto out_reset; - - if (!trans->nr_updates && - !trans->journal_entries.u64s && - !trans->accounting.u64s) - goto out_reset; - - ret = bch2_trans_commit_run_triggers(trans); - if (ret) - goto out_reset; - - if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && - unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) { - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) - ret = do_bch2_trans_commit_to_journal_replay(trans); - else - ret = bch_err_throw(c, erofs_trans_commit); - goto out_reset; - } - - EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - - journal_u64s = jset_u64s(trans->accounting.u64s); - trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); - if (trans->journal_transaction_names) - journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); - - trans_for_each_update(trans, i) { - struct btree_path *path = trans->paths + i->path; - - EBUG_ON(!path->should_be_locked); - - ret = bch2_btree_path_upgrade(trans, path, i->level + 1); - if (unlikely(ret)) - goto out; - - EBUG_ON(!btree_node_intent_locked(path, i->level)); - - if (i->key_cache_already_flushed) - continue; - - if (i->flags & BTREE_UPDATE_nojournal) - continue; - - /* we're going to journal the key being updated: */ - journal_u64s += jset_u64s(i->k->k.u64s); - - /* and we're also going to log the overwrite: */ - if (trans->journal_transaction_names) - journal_u64s += jset_u64s(i->old_k.u64s); - } - - if (trans->extra_disk_res) { - ret = bch2_disk_reservation_add(c, trans->disk_res, - trans->extra_disk_res, - (flags & BCH_TRANS_COMMIT_no_enospc) - ? 
BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - goto err; - } -retry: - errored_at = NULL; - bch2_trans_verify_not_unlocked_or_in_restart(trans); - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); - - trans->journal_u64s = journal_u64s + trans->journal_entries.u64s; - - ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); - - /* make sure we didn't drop or screw up locks: */ - bch2_trans_verify_locks(trans); - - if (ret) - goto err; - - trace_and_count(c, transaction_commit, trans, _RET_IP_); -out: - if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw))) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans); -out_reset: - if (!ret) - bch2_trans_downgrade(trans); - bch2_trans_reset_updates(trans); - - return ret; -err: - ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_); - if (ret) - goto out; - - /* - * We might have done another transaction commit in the error path - - * i.e. btree write buffer flush - which will have made use of - * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is - * how the journal sequence number to pin is passed in - so we must - * restart: - */ - if (flags & BCH_TRANS_COMMIT_no_journal_res) { - ret = bch_err_throw(c, transaction_restart_nested); - goto out; - } - - goto retry; -} diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h deleted file mode 100644 index 112170fd9c8f..000000000000 --- a/fs/bcachefs/btree_types.h +++ /dev/null @@ -1,937 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_TYPES_H -#define _BCACHEFS_BTREE_TYPES_H - -#include <linux/list.h> -#include <linux/rhashtable.h> - -#include "bbpos_types.h" -#include "btree_key_cache_types.h" -#include "buckets_types.h" -#include "darray.h" -#include "errcode.h" -#include "journal_types.h" -#include "replicas_types.h" -#include "six.h" - -struct open_bucket; -struct btree_update; -struct btree_trans; - -#define MAX_BSETS 3U - -struct btree_nr_keys { - - /* - * Amount of live metadata (i.e. size of node after a compaction) in - * units of u64s - */ - u16 live_u64s; - u16 bset_u64s[MAX_BSETS]; - - /* live keys only: */ - u16 packed_keys; - u16 unpacked_keys; -}; - -struct bset_tree { - /* - * We construct a binary tree in an array as if the array - * started at 1, so that things line up on the same cachelines - * better: see comments in bset.c at cacheline_to_bkey() for - * details - */ - - /* size of the binary tree and prev array */ - u16 size; - - /* function of size - precalculated for to_inorder() */ - u16 extra; - - u16 data_offset; - u16 aux_data_offset; - u16 end_offset; -}; - -struct btree_write { - struct journal_entry_pin journal; -}; - -struct btree_alloc { - struct open_buckets ob; - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); -}; - -struct btree_bkey_cached_common { - struct six_lock lock; - u8 level; - u8 btree_id; - bool cached; -}; - -struct btree { - struct btree_bkey_cached_common c; - - struct rhash_head hash; - u64 hash_val; - - unsigned long flags; - u16 written; - u8 nsets; - u8 nr_key_bits; - u16 version_ondisk; - - struct bkey_format format; - - struct btree_node *data; - void *aux_data; - - /* - * Sets of sorted keys - the real btree node - plus a binary search tree - * - * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point - * to the memory we have allocated for this btree node. 
Additionally, - * set[0]->data points to the entire btree node as it exists on disk. - */ - struct bset_tree set[MAX_BSETS]; - - struct btree_nr_keys nr; - u16 sib_u64s[2]; - u16 whiteout_u64s; - u8 byte_order; - u8 unpack_fn_len; - - struct btree_write writes[2]; - - /* Key/pointer for this btree node */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); - - /* - * XXX: add a delete sequence number, so when bch2_btree_node_relock() - * fails because the lock sequence number has changed - i.e. the - * contents were modified - we can still relock the node if it's still - * the one we want, without redoing the traversal - */ - - /* - * For asynchronous splits/interior node updates: - * When we do a split, we allocate new child nodes and update the parent - * node to point to them: we update the parent in memory immediately, - * but then we must wait until the children have been written out before - * the update to the parent can be written - this is a list of the - * btree_updates that are blocking this node from being - * written: - */ - struct list_head write_blocked; - - /* - * Also for asynchronous splits/interior node updates: - * If a btree node isn't reachable yet, we don't want to kick off - * another write - because that write also won't yet be reachable and - * marking it as completed before it's reachable would be incorrect: - */ - unsigned long will_make_reachable; - - struct open_buckets ob; - - /* lru list */ - struct list_head list; -}; - -#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \ - x(cache_reserve) \ - x(lock_intent) \ - x(lock_write) \ - x(dirty) \ - x(read_in_flight) \ - x(write_in_flight) \ - x(noevict) \ - x(write_blocked) \ - x(will_make_reachable) \ - x(access_bit) - -enum bch_btree_cache_not_freed_reasons { -#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n, - BCH_BTREE_CACHE_NOT_FREED_REASONS() -#undef x - BCH_BTREE_CACHE_NOT_FREED_REASONS_NR, -}; - -struct btree_cache_list { - unsigned idx; - struct shrinker *shrink; - struct list_head list; - size_t nr; -}; - -struct btree_cache { - struct rhashtable table; - bool table_init_done; - /* - * We never free a struct btree, except on shutdown - we just put it on - * the btree_cache_freed list and reuse it later. This simplifies the - * code, and it doesn't cost us much memory as the memory usage is - * dominated by buffers that hold the actual btree node data and those - * can be freed - and the number of struct btrees allocated is - * effectively bounded. - * - * btree_cache_freeable effectively is a small cache - we use it because - * high order page allocations can be rather expensive, and it's quite - * common to delete and allocate btree nodes in quick succession. It - * should never grow past ~2-3 nodes in practice. 
- */ - struct mutex lock; - struct list_head freeable; - struct list_head freed_pcpu; - struct list_head freed_nonpcpu; - struct btree_cache_list live[2]; - - size_t nr_freeable; - size_t nr_reserve; - size_t nr_by_btree[BTREE_ID_NR]; - atomic_long_t nr_dirty; - - /* shrinker stats */ - size_t nr_freed; - u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR]; - - /* - * If we need to allocate memory for a new btree node and that - * allocation fails, we can cannibalize another node in the btree cache - * to satisfy the allocation - lock to guarantee only one thread does - * this at a time: - */ - struct task_struct *alloc_lock; - struct closure_waitlist alloc_wait; - - struct bbpos pinned_nodes_start; - struct bbpos pinned_nodes_end; - /* btree id mask: 0 for leaves, 1 for interior */ - u64 pinned_nodes_mask[2]; -}; - -struct btree_node_iter { - struct btree_node_iter_set { - u16 k, end; - } data[MAX_BSETS]; -}; - -#define BTREE_ITER_FLAGS() \ - x(slots) \ - x(intent) \ - x(prefetch) \ - x(is_extents) \ - x(not_extents) \ - x(cached) \ - x(with_key_cache) \ - x(with_updates) \ - x(with_journal) \ - x(snapshot_field) \ - x(all_snapshots) \ - x(filter_snapshots) \ - x(nopreserve) \ - x(cached_nofill) \ - x(key_cache_fill) \ - -#define STR_HASH_FLAGS() \ - x(must_create) \ - x(must_replace) - -#define BTREE_UPDATE_FLAGS() \ - x(internal_snapshot_node) \ - x(nojournal) \ - x(key_cache_reclaim) - - -/* - * BTREE_TRIGGER_norun - don't run triggers at all - * - * BTREE_TRIGGER_transactional - we're running transactional triggers as part of - * a transaction commit: triggers may generate new updates - * - * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction - * commit: we have our journal reservation, we're holding btree node write - * locks, and we know the transaction is going to commit (returning an error - * here is a fatal error, causing us to go emergency read-only) - * - * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. 
disk usage - * - * BTREE_TRIGGER_insert - @new is entering the btree - * BTREE_TRIGGER_overwrite - @old is leaving the btree - */ -#define BTREE_TRIGGER_FLAGS() \ - x(norun) \ - x(transactional) \ - x(atomic) \ - x(check_repair) \ - x(gc) \ - x(insert) \ - x(overwrite) \ - x(is_root) - -enum { -#define x(n) BTREE_ITER_FLAG_BIT_##n, - BTREE_ITER_FLAGS() - STR_HASH_FLAGS() - BTREE_UPDATE_FLAGS() - BTREE_TRIGGER_FLAGS() -#undef x -}; - -/* iter flags must fit in a u16: */ -//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15); - -enum btree_iter_update_trigger_flags { -#define x(n) BTREE_ITER_##n = 1U << BTREE_ITER_FLAG_BIT_##n, - BTREE_ITER_FLAGS() -#undef x -#define x(n) STR_HASH_##n = 1U << BTREE_ITER_FLAG_BIT_##n, - STR_HASH_FLAGS() -#undef x -#define x(n) BTREE_UPDATE_##n = 1U << BTREE_ITER_FLAG_BIT_##n, - BTREE_UPDATE_FLAGS() -#undef x -#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n, - BTREE_TRIGGER_FLAGS() -#undef x -}; - -enum btree_path_uptodate { - BTREE_ITER_UPTODATE = 0, - BTREE_ITER_NEED_RELOCK = 1, - BTREE_ITER_NEED_TRAVERSE = 2, -}; - -#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG) -#define TRACK_PATH_ALLOCATED -#endif - -typedef u16 btree_path_idx_t; - -struct btree_path { - btree_path_idx_t sorted_idx; - u8 ref; - u8 intent_ref; - - /* btree_iter_copy starts here: */ - struct bpos pos; - - enum btree_id btree_id:5; - bool cached:1; - bool preserve:1; - enum btree_path_uptodate uptodate:2; - /* - * When true, failing to relock this path will cause the transaction to - * restart: - */ - bool should_be_locked:1; - unsigned level:3, - locks_want:3; - u8 nodes_locked; - - struct btree_path_level { - struct btree *b; - struct btree_node_iter iter; - u32 lock_seq; -#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - u64 lock_taken_time; -#endif - } l[BTREE_MAX_DEPTH]; -#ifdef TRACK_PATH_ALLOCATED - unsigned long ip_allocated; -#endif -}; - -static inline struct btree_path_level *path_l(struct btree_path *path) -{ - return path->l + path->level; -} - -static inline unsigned long btree_path_ip_allocated(struct btree_path *path) -{ -#ifdef TRACK_PATH_ALLOCATED - return path->ip_allocated; -#else - return _THIS_IP_; -#endif -} - -/* - * @pos - iterator's current position - * @level - current btree depth - * @locks_want - btree level below which we start taking intent locks - * @nodes_locked - bitmask indicating which nodes in @nodes are locked - * @nodes_intent_locked - bitmask indicating which locks are intent locks - */ -struct btree_iter { - btree_path_idx_t path; - btree_path_idx_t update_path; - btree_path_idx_t key_cache_path; - - enum btree_id btree_id:8; - u8 min_depth; - - /* btree_iter_copy starts here: */ - u16 flags; - - /* When we're filtering by snapshot, the snapshot ID we're looking for: */ - unsigned snapshot; - - struct bpos pos; - /* - * Current unpacked key - so that bch2_btree_iter_next()/ - * bch2_btree_iter_next_slot() can correctly advance pos. - */ - struct bkey k; - - /* BTREE_ITER_with_journal: */ - size_t journal_idx; -#ifdef TRACK_PATH_ALLOCATED - unsigned long ip_allocated; -#endif -}; - -#define BKEY_CACHED_ACCESSED 0 -#define BKEY_CACHED_DIRTY 1 - -struct bkey_cached { - struct btree_bkey_cached_common c; - - unsigned long flags; - u16 u64s; - struct bkey_cached_key key; - - struct rhash_head hash; - - struct journal_entry_pin journal; - u64 seq; - - struct bkey_i *k; - struct rcu_head rcu; -}; - -static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) -{ - return !b->cached - ? 
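/*
 * The flag machinery above allocates bit positions automatically: one
 * x-macro expansion defines an enum of consecutive bit numbers, and a
 * second turns each number into a 1U << bit mask, so adding a flag to a
 * list never requires manual renumbering. Reduced to essentials (FLAGS()
 * and its members here are illustrative):
 *
 *	#define FLAGS() x(slots) x(intent) x(prefetch)
 *
 *	enum {
 *	#define x(n) FLAG_BIT_##n,
 *		FLAGS()
 *	#undef x
 *	};
 *
 *	enum {
 *	#define x(n) FLAG_##n = 1U << FLAG_BIT_##n,
 *		FLAGS()
 *	#undef x
 *	};
 */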
container_of(b, struct btree, c)->key.k.p - : container_of(b, struct bkey_cached, c)->key.pos; -} - -struct btree_insert_entry { - unsigned flags; - u8 sort_order; - u8 bkey_type; - enum btree_id btree_id:8; - u8 level:4; - bool cached:1; - bool insert_trigger_run:1; - bool overwrite_trigger_run:1; - bool key_cache_already_flushed:1; - /* - * @old_k may be a key from the journal; @old_btree_u64s always refers - * to the size of the key being overwritten in the btree: - */ - u8 old_btree_u64s; - btree_path_idx_t path; - struct bkey_i *k; - /* key being overwritten: */ - struct bkey old_k; - const struct bch_val *old_v; - unsigned long ip_allocated; -}; - -/* Number of btree paths we preallocate, usually enough */ -#define BTREE_ITER_INITIAL 64 -/* - * Limit for btree_trans_too_many_iters(); this is enough that almost all code - * paths should run inside this limit, and if they don't it usually indicates a - * bug (leaking/duplicated btree paths). - * - * exception: some fsck paths - * - * bugs with excessive path usage seem to have been eliminated now, so - * we might consider eliminating this (and btree_trans_too_many_iters()) at some - * point. - */ -#define BTREE_ITER_NORMAL_LIMIT 256 -/* never exceed limit */ -#define BTREE_ITER_MAX (1U << 10) - -struct btree_trans_commit_hook; -typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); - -struct btree_trans_commit_hook { - btree_trans_commit_hook_fn *fn; - struct btree_trans_commit_hook *next; -}; - -#define BTREE_TRANS_MEM_MAX (1U << 16) - -#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 - -struct btree_trans_paths { - unsigned long nr_paths; - struct btree_path paths[]; -}; - -struct trans_kmalloc_trace { - unsigned long ip; - size_t bytes; -}; -typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace; - -struct btree_trans_subbuf { - u16 base; - u16 u64s; - u16 size; -}; - -struct btree_trans { - struct bch_fs *c; - - unsigned long *paths_allocated; - struct btree_path *paths; - btree_path_idx_t *sorted; - struct btree_insert_entry *updates; - - void *mem; - unsigned mem_top; - unsigned mem_bytes; - unsigned realloc_bytes_required; -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - darray_trans_kmalloc_trace trans_kmalloc_trace; -#endif - - btree_path_idx_t nr_sorted; - btree_path_idx_t nr_paths; - btree_path_idx_t nr_paths_max; - btree_path_idx_t nr_updates; - u8 fn_idx; - u8 lock_must_abort; - bool lock_may_not_fail:1; - bool srcu_held:1; - bool locked:1; - bool pf_memalloc_nofs:1; - bool write_locked:1; - bool used_mempool:1; - bool in_traverse_all:1; - bool paths_sorted:1; - bool memory_allocation_failure:1; - bool journal_transaction_names:1; - bool journal_replay_not_finished:1; - bool notrace_relock_fail:1; - enum bch_errcode restarted:16; - u32 restart_count; -#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS - u32 restart_count_this_trans; -#endif - - u64 last_begin_time; - unsigned long last_begin_ip; - unsigned long last_restarted_ip; -#ifdef CONFIG_BCACHEFS_DEBUG - bch_stacktrace last_restarted_trace; -#endif - unsigned long last_unlock_ip; - unsigned long srcu_lock_time; - - const char *fn; - struct btree_bkey_cached_common *locking; - struct six_lock_waiter locking_wait; - int srcu_idx; - - /* update path: */ - struct btree_trans_subbuf journal_entries; - struct btree_trans_subbuf accounting; - - struct btree_trans_commit_hook *hooks; - struct journal_entry_pin *journal_pin; - - struct journal_res journal_res; - u64 *journal_seq; - struct disk_reservation
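/*
 * journal_entries and accounting above are sub-buffers: slices bump-
 * allocated out of the transaction's memory pool, tracked as
 * (base, used, size) in units of u64. A minimal model of such a
 * sub-buffer (illustrative names, not the real helpers):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	struct subbuf { uint16_t base, u64s, size; };
 *
 *	static uint64_t *subbuf_alloc(uint64_t *pool, struct subbuf *b,
 *				      uint16_t n)
 *	{
 *		if (b->u64s + n > b->size)
 *			return NULL;		// caller must grow the buffer
 *		uint64_t *p = pool + b->base + b->u64s;
 *		b->u64s += n;
 *		return p;
 *	}
 */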
*disk_res; - - struct bch_fs_usage_base fs_usage_delta; - - unsigned journal_u64s; - unsigned extra_disk_res; /* XXX kill */ - - __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif - /* Entries before this are zeroed out on every bch2_trans_get() call */ - - struct list_head list; - struct closure ref; - - unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)]; - struct btree_trans_paths trans_paths; - struct btree_path _paths[BTREE_ITER_INITIAL]; - btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4]; - struct btree_insert_entry _updates[BTREE_ITER_INITIAL]; -}; - -static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter) -{ - return trans->paths + iter->path; -} - -static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter) -{ - return iter->key_cache_path - ? trans->paths + iter->key_cache_path - : NULL; -} - -#define BCH_BTREE_WRITE_TYPES() \ - x(initial, 0) \ - x(init_next_bset, 1) \ - x(cache_reclaim, 2) \ - x(journal_reclaim, 3) \ - x(interior, 4) - -enum btree_write_type { -#define x(t, n) BTREE_WRITE_##t, - BCH_BTREE_WRITE_TYPES() -#undef x - BTREE_WRITE_TYPE_NR, -}; - -#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) -#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR)) - -#define BTREE_FLAGS() \ - x(read_in_flight) \ - x(read_error) \ - x(dirty) \ - x(need_write) \ - x(write_blocked) \ - x(will_make_reachable) \ - x(noevict) \ - x(write_idx) \ - x(accessed) \ - x(write_in_flight) \ - x(write_in_flight_inner) \ - x(just_written) \ - x(dying) \ - x(fake) \ - x(need_rewrite) \ - x(need_rewrite_error) \ - x(need_rewrite_degraded) \ - x(need_rewrite_ptr_written_zero) \ - x(never_write) \ - x(pinned) - -enum btree_flags { - /* First bits for btree node write type */ - BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1, -#define x(flag) BTREE_NODE_##flag, - BTREE_FLAGS() -#undef x -}; - -#define x(flag) \ -static inline bool btree_node_ ## flag(struct btree *b) \ -{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ - \ -static inline void set_btree_node_ ## flag(struct btree *b) \ -{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ - \ -static inline void clear_btree_node_ ## flag(struct btree *b) \ -{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } - -BTREE_FLAGS() -#undef x - -#define BTREE_NODE_REWRITE_REASON() \ - x(none) \ - x(unknown) \ - x(error) \ - x(degraded) \ - x(ptr_written_zero) - -enum btree_node_rewrite_reason { -#define x(n) BTREE_NODE_REWRITE_##n, - BTREE_NODE_REWRITE_REASON() -#undef x -}; - -static inline enum btree_node_rewrite_reason btree_node_rewrite_reason(struct btree *b) -{ - if (btree_node_need_rewrite_ptr_written_zero(b)) - return BTREE_NODE_REWRITE_ptr_written_zero; - if (btree_node_need_rewrite_degraded(b)) - return BTREE_NODE_REWRITE_degraded; - if (btree_node_need_rewrite_error(b)) - return BTREE_NODE_REWRITE_error; - if (btree_node_need_rewrite(b)) - return BTREE_NODE_REWRITE_unknown; - return BTREE_NODE_REWRITE_none; -} - -static inline struct btree_write *btree_current_write(struct btree *b) -{ - return b->writes + btree_node_write_idx(b); -} - -static inline struct btree_write *btree_prev_write(struct btree *b) -{ - return b->writes + (btree_node_write_idx(b) ^ 1); -} - -static inline struct bset_tree *bset_tree_last(struct btree *b) -{ - EBUG_ON(!b->nsets); - return b->set + b->nsets - 1; -} - -static inline void * 
-__btree_node_offset_to_ptr(const struct btree *b, u16 offset) -{ - return (void *) ((u64 *) b->data + offset); -} - -static inline u16 -__btree_node_ptr_to_offset(const struct btree *b, const void *p) -{ - u16 ret = (u64 *) p - (u64 *) b->data; - - EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); - return ret; -} - -static inline struct bset *bset(const struct btree *b, - const struct bset_tree *t) -{ - return __btree_node_offset_to_ptr(b, t->data_offset); -} - -static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) -{ - t->end_offset = - __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); -} - -static inline void set_btree_bset(struct btree *b, struct bset_tree *t, - const struct bset *i) -{ - t->data_offset = __btree_node_ptr_to_offset(b, i); - set_btree_bset_end(b, t); -} - -static inline struct bset *btree_bset_first(struct btree *b) -{ - return bset(b, b->set); -} - -static inline struct bset *btree_bset_last(struct btree *b) -{ - return bset(b, bset_tree_last(b)); -} - -static inline u16 -__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) -{ - return __btree_node_ptr_to_offset(b, k); -} - -static inline struct bkey_packed * -__btree_node_offset_to_key(const struct btree *b, u16 k) -{ - return __btree_node_offset_to_ptr(b, k); -} - -static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) -{ - return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); -} - -#define btree_bkey_first(_b, _t) \ -({ \ - EBUG_ON(bset(_b, _t)->start != \ - __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ - \ - bset(_b, _t)->start; \ -}) - -#define btree_bkey_last(_b, _t) \ -({ \ - EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ - vstruct_last(bset(_b, _t))); \ - \ - __btree_node_offset_to_key(_b, (_t)->end_offset); \ -}) - -static inline unsigned bset_u64s(struct bset_tree *t) -{ - return t->end_offset - t->data_offset - - sizeof(struct bset) / sizeof(u64); -} - -static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) -{ - return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; -} - -static inline unsigned bset_byte_offset(struct btree *b, void *i) -{ - return i - (void *) b->data; -} - -enum btree_node_type { - BKEY_TYPE_btree, -#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1, - BCH_BTREE_IDS() -#undef x - BKEY_TYPE_NR -}; - -/* Type of a key in btree @id at level @level: */ -static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) -{ - return level ? 
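/*
 * Storing u16 offsets in units of u64 (above) instead of raw pointers
 * keeps per-bset bookkeeping to two bytes a field, and a u16 count of
 * u64s spans just under 512KiB, which is why it suffices for a btree
 * node here. The conversion pair in self-contained form (illustrative
 * names):
 *
 *	#include <stdint.h>
 *
 *	static inline void *off_to_ptr(uint64_t *base, uint16_t off)
 *	{
 *		return base + off;
 *	}
 *
 *	static inline uint16_t ptr_to_off(uint64_t *base, void *p)
 *	{
 *		return (uint16_t)((uint64_t *) p - base);
 *	}
 */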
BKEY_TYPE_btree : (unsigned) id + 1; -} - -/* Type of keys @b contains: */ -static inline enum btree_node_type btree_node_type(struct btree *b) -{ - return __btree_node_type(b->c.level, b->c.btree_id); -} - -const char *bch2_btree_node_type_str(enum btree_node_type); - -#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ - (BIT_ULL(BKEY_TYPE_extents)| \ - BIT_ULL(BKEY_TYPE_alloc)| \ - BIT_ULL(BKEY_TYPE_inodes)| \ - BIT_ULL(BKEY_TYPE_stripes)| \ - BIT_ULL(BKEY_TYPE_reflink)| \ - BIT_ULL(BKEY_TYPE_subvolumes)| \ - BIT_ULL(BKEY_TYPE_btree)) - -#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ - (BIT_ULL(BKEY_TYPE_alloc)| \ - BIT_ULL(BKEY_TYPE_inodes)| \ - BIT_ULL(BKEY_TYPE_stripes)| \ - BIT_ULL(BKEY_TYPE_snapshots)) - -#define BTREE_NODE_TYPE_HAS_TRIGGERS \ - (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ - BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS) - -static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type) -{ - return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS; -} - -static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type) -{ - return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS; -} - -static inline bool btree_node_type_has_triggers(enum btree_node_type type) -{ - return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS; -} - -static inline bool btree_id_is_extents(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline bool btree_node_type_is_extents(enum btree_node_type type) -{ - return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1); -} - -static inline bool btree_type_has_snapshots(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline bool btree_type_has_snapshot_field(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline bool btree_type_has_ptrs(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline bool btree_type_uses_write_buffer(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) 
|((!!((flags) & BTREE_IS_write_buffer)) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline u8 btree_trigger_order(enum btree_id btree) -{ - switch (btree) { - case BTREE_ID_alloc: - return U8_MAX; - case BTREE_ID_stripes: - return U8_MAX - 1; - default: - return btree; - } -} - -struct btree_root { - struct btree *b; - - /* On disk root - see async splits: */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); - u8 level; - u8 alive; - s16 error; -}; - -enum btree_gc_coalesce_fail_reason { - BTREE_GC_COALESCE_FAIL_RESERVE_GET, - BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, - BTREE_GC_COALESCE_FAIL_FORMAT_FITS, -}; - -enum btree_node_sibling { - btree_prev_sib, - btree_next_sib, -}; - -struct get_locks_fail { - unsigned l; - struct btree *b; -}; - -#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c deleted file mode 100644 index ee657b9f4b96..000000000000 --- a/fs/bcachefs/btree_update.c +++ /dev/null @@ -1,916 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_update.h" -#include "btree_iter.h" -#include "btree_journal_iter.h" -#include "btree_locking.h" -#include "buckets.h" -#include "debug.h" -#include "errcode.h" -#include "error.h" -#include "extents.h" -#include "keylist.h" -#include "snapshot.h" -#include "trace.h" - -#include <linux/string_helpers.h> - -static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, - const struct btree_insert_entry *r) -{ - return cmp_int(l->sort_order, r->sort_order) ?: - cmp_int(l->cached, r->cached) ?: - -cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p); -} - -static int __must_check -bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t, - struct bkey_i *, enum btree_iter_update_trigger_flags, - unsigned long ip); - -static noinline int extent_front_merge(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bkey_i **insert, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_i *update; - int ret; - - if (unlikely(trans->journal_replay_not_finished)) - return 0; - - update = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - return ret; - - if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) - return 0; - - ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?: - bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p); - if (ret < 0) - return ret; - if (ret) - return 0; - - ret = bch2_btree_delete_at(trans, iter, flags); - if (ret) - return ret; - - *insert = update; - return 0; -} - -static noinline int extent_back_merge(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - int ret; - - if (unlikely(trans->journal_replay_not_finished)) - return 0; - - ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: - bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); - if (ret < 0) - return ret; - if (ret) - return 0; - - bch2_bkey_merge(c, bkey_i_to_s(insert), k); - return 0; -} - -/* - * When deleting, check if we need to emit a whiteout (because we're overwriting - * something in an ancestor snapshot) - */ -static int need_whiteout_for_snapshot(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; - u32 snapshot = pos.snapshot; - int ret; - - if 
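/*
 * Every btree_type_has_*() predicate above follows one pattern: an x-macro
 * pass over the btree ID list folds a per-btree flag into a compile-time
 * constant bitmask, and the query is a single BIT_ULL(id) & mask test.
 * The pattern in isolation (IDS() and its flag column are illustrative):
 *
 *	#include <stdbool.h>
 *
 *	#define IDS() x(extents, 0, 1) x(inodes, 1, 0)
 *
 *	static bool id_has_feature(unsigned id)
 *	{
 *		const unsigned long long mask = 0
 *	#define x(name, nr, feat) | ((unsigned long long) (feat) << (nr))
 *		IDS()
 *	#undef x
 *		;
 *
 *		return (1ULL << id) & mask;
 *	}
 */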
(!bch2_snapshot_parent(trans->c, pos.snapshot)) - return 0; - - pos.snapshot++; - - for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_all_snapshots| - BTREE_ITER_nopreserve, k, ret) { - if (!bkey_eq(k.k->p, pos)) - break; - - if (bch2_snapshot_is_ancestor(trans->c, snapshot, - k.k->p.snapshot)) { - ret = !bkey_whiteout(k.k); - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id btree, struct bpos pos, - snapshot_id_list *s) -{ - int ret = 0; - - darray_for_each(*s, id) { - pos.snapshot = *id; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos, - BTREE_ITER_not_extents| - BTREE_ITER_intent); - ret = bkey_err(k); - if (ret) - break; - - if (k.k->type == KEY_TYPE_deleted) { - struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); - ret = PTR_ERR_OR_ZERO(update); - if (ret) { - bch2_trans_iter_exit(trans, &iter); - break; - } - - bkey_init(&update->k); - update->k.p = pos; - update->k.type = KEY_TYPE_whiteout; - - ret = bch2_trans_update(trans, &iter, update, - BTREE_UPDATE_internal_snapshot_node); - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - break; - } - - darray_exit(s); - return ret; -} - -int bch2_trans_update_extent_overwrite(struct btree_trans *trans, - struct btree_iter *iter, - enum btree_iter_update_trigger_flags flags, - struct bkey_s_c old, - struct bkey_s_c new) -{ - enum btree_id btree_id = iter->btree_id; - struct bkey_i *update; - struct bpos new_start = bkey_start_pos(new.k); - unsigned front_split = bkey_lt(bkey_start_pos(old.k), new_start); - unsigned back_split = bkey_gt(old.k->p, new.k->p); - unsigned middle_split = (front_split || back_split) && - old.k->p.snapshot != new.k->p.snapshot; - unsigned nr_splits = front_split + back_split + middle_split; - int ret = 0, compressed_sectors; - - /* - * If we're going to be splitting a compressed extent, note it - * so that __bch2_trans_commit() can increase our disk - * reservation: - */ - if (nr_splits > 1 && - (compressed_sectors = bch2_bkey_sectors_compressed(old))) - trans->extra_disk_res += compressed_sectors * (nr_splits - 1); - - if (front_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_back(new_start, update); - - ret = bch2_insert_snapshot_whiteouts(trans, btree_id, - old.k->p, update->k.p) ?: - bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_internal_snapshot_node|flags); - if (ret) - return ret; - } - - /* If we're overwriting in a different snapshot - middle split: */ - if (middle_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_front(new_start, update); - bch2_cut_back(new.k->p, update); - - ret = bch2_insert_snapshot_whiteouts(trans, btree_id, - old.k->p, update->k.p) ?: - bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_internal_snapshot_node|flags); - if (ret) - return ret; - } - - if (bkey_le(old.k->p, new.k->p)) { - update = bch2_trans_kmalloc(trans, sizeof(*update)); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bkey_init(&update->k); - update->k.p = old.k->p; - update->k.p.snapshot = new.k->p.snapshot; - - if (new.k->p.snapshot != old.k->p.snapshot) { - update->k.type = KEY_TYPE_whiteout; - } else if (btree_type_has_snapshots(btree_id)) { - ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); - if (ret < 
0) - return ret; - if (ret) - update->k.type = KEY_TYPE_whiteout; - } - - ret = bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_internal_snapshot_node|flags); - if (ret) - return ret; - } - - if (back_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_front(new.k->p, update); - - ret = bch2_trans_update_by_path(trans, iter->path, update, - BTREE_UPDATE_internal_snapshot_node| - flags, _RET_IP_); - if (ret) - return ret; - } - - return 0; -} - -static int bch2_trans_update_extent(struct btree_trans *trans, - struct btree_iter *orig_iter, - struct bkey_i *insert, - enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - struct bkey_s_c k; - enum btree_id btree_id = orig_iter->btree_id; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), - BTREE_ITER_intent| - BTREE_ITER_with_updates| - BTREE_ITER_not_extents); - k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) - goto out; - - if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { - if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { - ret = extent_front_merge(trans, &iter, k, &insert, flags); - if (ret) - goto err; - } - - goto next; - } - - while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { - bool done = bkey_lt(insert->k.p, k.k->p); - - ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); - if (ret) - goto err; - - if (done) - goto out; -next: - bch2_btree_iter_advance(trans, &iter); - k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) - goto out; - } - - if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { - ret = extent_back_merge(trans, &iter, insert, k); - if (ret) - goto err; - } -out: - if (!bkey_deleted(&insert->k)) - ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); -err: - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static noinline int flush_new_cached_update(struct btree_trans *trans, - struct btree_insert_entry *i, - enum btree_iter_update_trigger_flags flags, - unsigned long ip) -{ - struct bkey k; - int ret; - - btree_path_idx_t path_idx = - bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, - BTREE_ITER_intent, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, path_idx, 0); - if (ret) - goto out; - - struct btree_path *btree_path = trans->paths + path_idx; - - /* - * The old key in the insert entry might actually refer to an existing - * key in the btree that has been deleted from cache and not yet - * flushed. Check for this and skip the flush so we don't run triggers - * against a stale key. 
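
The split logic in bch2_trans_update_extent_overwrite() above comes down to interval arithmetic on half-open extents. A minimal standalone C sketch of that rule (hypothetical types and names, not the kernel implementation):

#include <stdio.h>

/* Half-open extent [start, end), standing in for bkey_start_pos(k)..k.p. */
struct ext {
	unsigned long start, end;
};

/*
 * Overwriting 'old' with 'new' leaves a front fragment when old starts
 * first and a back fragment when old ends last - the same front_split /
 * back_split conditions computed above.
 */
static void overwrite(struct ext old, struct ext new)
{
	if (old.start < new.start)
		printf("front fragment: [%lu, %lu)\n", old.start, new.start);
	if (old.end > new.end)
		printf("back fragment:  [%lu, %lu)\n", new.end, old.end);
}

int main(void)
{
	/* Overwriting the middle of [0, 100) with [20, 50): */
	overwrite((struct ext){ 0, 100 }, (struct ext){ 20, 50 });
	return 0;
}
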
- */ - bch2_btree_path_peek_slot_exact(btree_path, &k); - if (!bkey_deleted(&k)) - goto out; - - i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_norun; - - btree_path_set_should_be_locked(trans, btree_path); - ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); -out: - bch2_path_put(trans, path_idx, true); - return ret; -} - -static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags, - unsigned long ip) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i, n; - int cmp; - - struct btree_path *path = trans->paths + path_idx; - EBUG_ON(!path->should_be_locked); - EBUG_ON(trans->nr_updates >= trans->nr_paths); - EBUG_ON(!bpos_eq(k->k.p, path->pos)); - - n = (struct btree_insert_entry) { - .flags = flags, - .sort_order = btree_trigger_order(path->btree_id), - .bkey_type = __btree_node_type(path->level, path->btree_id), - .btree_id = path->btree_id, - .level = path->level, - .cached = path->cached, - .path = path_idx, - .k = k, - .ip_allocated = ip, - }; - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(i != trans->updates && - btree_insert_entry_cmp(i - 1, i) >= 0); -#endif - - /* - * Pending updates are kept sorted: first, find position of new update, - * then delete/trim any updates the new update overwrites: - */ - for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) { - cmp = btree_insert_entry_cmp(&n, i); - if (cmp <= 0) - break; - } - - bool overwrite = !cmp && i < trans->updates + trans->nr_updates; - - if (overwrite) { - EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); - - bch2_path_put(trans, i->path, true); - i->flags = n.flags; - i->cached = n.cached; - i->k = n.k; - i->path = n.path; - i->ip_allocated = n.ip_allocated; - } else { - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, n); - - i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; - i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); - - if (j_k) { - i->old_k = j_k->k; - i->old_v = &j_k->v; - } - } - } - - __btree_path_get(trans, trans->paths + i->path, true); - - trace_update_by_path(trans, path, i, overwrite); - - /* - * If a key is present in the key cache, it must also exist in the - * btree - this is necessary for cache coherency. 
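
The invariant just stated can be phrased as: membership in the key cache implies membership in the btree. A toy standalone C model of that check (editorial sketch, invented names):

#include <assert.h>
#include <stdbool.h>

#define NR_KEYS 64

static bool in_btree[NR_KEYS];
static bool in_cache[NR_KEYS];

/*
 * A key may only enter the cache if it already exists in the btree -
 * the property that flush_new_cached_update() restores for brand new
 * cached keys.
 */
static void cache_insert(unsigned key)
{
	assert(in_btree[key]);
	in_cache[key] = true;
}

int main(void)
{
	in_btree[7] = true;
	cache_insert(7);
	return 0;
}
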
When iterating over - * a btree that's cached in the key cache, the btree iter code checks - * the key cache - but the key has to exist in the btree for that to - * work: - */ - if (path->cached && !i->old_btree_u64s) - return flush_new_cached_update(trans, i, flags, ip); - - return 0; -} - -static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, - struct btree_iter *iter, - struct btree_path *path) -{ - struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter); - - if (!key_cache_path || - !key_cache_path->should_be_locked || - !bpos_eq(key_cache_path->pos, iter->pos)) { - struct bkey_cached *ck; - int ret; - - if (!iter->key_cache_path) - iter->key_cache_path = - bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_intent| - BTREE_ITER_cached, _THIS_IP_); - - iter->key_cache_path = - bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, - iter->flags & BTREE_ITER_intent, - _THIS_IP_); - - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached); - if (unlikely(ret)) - return ret; - - ck = (void *) trans->paths[iter->key_cache_path].l[0].b; - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); - } - - btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); - } - - return 0; -} - -int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags, - unsigned long ip) -{ - kmsan_check_memory(k, bkey_bytes(&k->k)); - - btree_path_idx_t path_idx = iter->update_path ?: iter->path; - int ret; - - if (iter->flags & BTREE_ITER_is_extents) - return bch2_trans_update_extent(trans, iter, k, flags); - - if (bkey_deleted(&k->k) && - !(flags & BTREE_UPDATE_key_cache_reclaim) && - (iter->flags & BTREE_ITER_filter_snapshots)) { - ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); - if (unlikely(ret < 0)) - return ret; - - if (ret) - k->k.type = KEY_TYPE_whiteout; - } - - /* - * Ensure that updates to cached btrees go to the key cache: - */ - struct btree_path *path = trans->paths + path_idx; - if (!(flags & BTREE_UPDATE_key_cache_reclaim) && - !path->cached && - !path->level && - btree_id_cached(trans->c, path->btree_id)) { - ret = bch2_trans_update_get_key_cache(trans, iter, path); - if (ret) - return ret; - - path_idx = iter->key_cache_path; - } - - return bch2_trans_update_by_path(trans, path_idx, k, flags, ip); -} - -int bch2_btree_insert_clone_trans(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) -{ - struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k)); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - bkey_copy(n, k); - return bch2_btree_insert_trans(trans, btree, n, 0); -} - -void *__bch2_trans_subbuf_alloc(struct btree_trans *trans, - struct btree_trans_subbuf *buf, - unsigned u64s) -{ - unsigned new_top = buf->u64s + u64s; - unsigned new_size = buf->size; - - BUG_ON(roundup_pow_of_two(new_top) > U16_MAX); - - if (new_top > new_size) - new_size = roundup_pow_of_two(new_top); - - void *n = bch2_trans_kmalloc_nomemzero(trans, new_size * sizeof(u64)); - if (IS_ERR(n)) - return n; - - unsigned offset = (u64 *) n - (u64 *) trans->mem; - BUG_ON(offset > U16_MAX); - - if (buf->u64s) - memcpy(n, - btree_trans_subbuf_base(trans, buf), - buf->size * sizeof(u64)); - buf->base = (u64 *) n - (u64 *) 
trans->mem; - buf->size = new_size; - - void *p = btree_trans_subbuf_top(trans, buf); - buf->u64s = new_top; - return p; -} - -int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, - enum btree_id btree, struct bpos end) -{ - bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); - struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter); - int ret = bkey_err(k); - if (ret) - goto err; - - bch2_btree_iter_advance(trans, iter); - k = bch2_btree_iter_peek_slot(trans, iter); - ret = bkey_err(k); - if (ret) - goto err; - - BUG_ON(k.k->type != KEY_TYPE_deleted); - - if (bkey_gt(k.k->p, end)) { - ret = bch_err_throw(trans->c, ENOSPC_btree_slot); - goto err; - } - - return 0; -err: - bch2_trans_iter_exit(trans, iter); - return ret; -} - -void bch2_trans_commit_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *h) -{ - h->next = trans->hooks; - trans->hooks = h; -} - -int bch2_btree_insert_nonextent(struct btree_trans *trans, - enum btree_id btree, struct bkey_i *k, - enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, btree, k->k.p, - BTREE_ITER_cached| - BTREE_ITER_not_extents| - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), - BTREE_ITER_intent|flags); - int ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/** - * bch2_btree_insert - insert keys into the extent btree - * @c: pointer to struct bch_fs - * @id: btree to insert into - * @k: key to insert - * @disk_res: must be non-NULL whenever inserting or potentially - * splitting data extents - * @flags: transaction commit flags - * @iter_flags: btree iter update trigger flags - * - * Returns: 0 on success, error code on failure - */ -int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, - struct disk_reservation *disk_res, int flags, - enum btree_iter_update_trigger_flags iter_flags) -{ - return bch2_trans_commit_do(c, disk_res, NULL, flags, - bch2_btree_insert_trans(trans, id, k, iter_flags)); -} - -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned update_flags) -{ - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); - int ret = PTR_ERR_OR_ZERO(k); - if (ret) - return ret; - - bkey_init(&k->k); - k->k.p = iter->pos; - return bch2_trans_update(trans, iter, k, update_flags); -} - -int bch2_btree_delete(struct btree_trans *trans, - enum btree_id btree, struct bpos pos, - unsigned update_flags) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, btree, pos, - BTREE_ITER_cached| - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, update_flags); - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, - struct bpos start, struct bpos end, - unsigned update_flags, - u64 *journal_seq) -{ - u32 restart_count = trans->restart_count; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, id, start, 
BTREE_ITER_intent); - while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(trans->c, 0); - struct bkey_i delete; - - ret = bkey_err(k); - if (ret) - goto err; - - bkey_init(&delete.k); - - /* - * This could probably be more efficient for extents: - */ - - /* - * For extents, iter.pos won't necessarily be the same as - * bkey_start_pos(k.k) (for non extents they always will be the - * same). It's important that we delete starting from iter.pos - * because the range we want to delete could start in the middle - * of k. - * - * (bch2_btree_iter_peek() does guarantee that iter.pos >= - * bkey_start_pos(k.k)). - */ - delete.k.p = iter.pos; - - if (iter.flags & BTREE_ITER_is_extents) - bch2_key_resize(&delete.k, - bpos_min(end, k.k->p).offset - - iter.pos.offset); - - ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: - bch2_trans_commit(trans, &disk_res, journal_seq, - BCH_TRANS_COMMIT_no_enospc); - bch2_disk_reservation_put(trans->c, &disk_res); -err: - /* - * the bch2_trans_begin() call is in a weird place because we - * need to call it after every transaction commit, to avoid path - * overflow, but don't want to call it if the delete operation - * is a no-op and we have no work to do: - */ - bch2_trans_begin(trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - return ret ?: trans_was_restarted(trans, restart_count); -} - -/* - * bch_btree_delete_range - delete everything within a given range - * - * Range is a half open interval - [start, end) - */ -int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, - struct bpos start, struct bpos end, - unsigned update_flags, - u64 *journal_seq) -{ - int ret = bch2_trans_run(c, - bch2_btree_delete_range_trans(trans, id, start, end, - update_flags, journal_seq)); - if (ret == -BCH_ERR_transaction_restart_nested) - ret = 0; - return ret; -} - -int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set) -{ - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); - int ret = PTR_ERR_OR_ZERO(k); - if (ret) - return ret; - - bkey_init(&k->k); - k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; - k->k.p = iter->pos; - if (iter->flags & BTREE_ITER_is_extents) - bch2_key_resize(&k->k, 1); - - return bch2_trans_update(trans, iter, k, 0); -} - -int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) -{ - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - - int ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_bit_mod_iter(trans, &iter, set); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) -{ - struct bkey_i k; - - bkey_init(&k.k); - k.k.type = set ? 
KEY_TYPE_set : KEY_TYPE_deleted; - k.k.p = pos; - - return bch2_trans_update_buffered(trans, btree, &k); -} - -static int __bch2_trans_log_str(struct btree_trans *trans, const char *str, unsigned len) -{ - unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); - - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); - int ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); - journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy_and_pad(l->d, u64s * sizeof(u64), str, len, 0); - return 0; -} - -int bch2_trans_log_str(struct btree_trans *trans, const char *str) -{ - return __bch2_trans_log_str(trans, str, strlen(str)); -} - -int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) -{ - int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - return ret; - - return __bch2_trans_log_str(trans, buf->buf, buf->pos); -} - -int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree, - unsigned level, struct bkey_i *k) -{ - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); - int ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - journal_entry_init(e, BCH_JSET_ENTRY_log_bkey, btree, level, k->k.u64s); - bkey_copy(e->start, k); - return 0; -} - -__printf(3, 0) -static int -__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, - va_list args) -{ - struct printbuf buf = PRINTBUF; - prt_vprintf(&buf, fmt, args); - - unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - - int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - goto err; - - if (!test_bit(JOURNAL_running, &c->journal.flags)) { - ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s)); - if (ret) - goto err; - - struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); - journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0); - c->journal.early_journal_entries.nr += jset_u64s(u64s); - } else { - ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, - bch2_trans_log_msg(trans, &buf)); - } -err: - printbuf_exit(&buf); - return ret; -} - -__printf(2, 3) -int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) -{ - va_list args; - int ret; - - va_start(args, fmt); - ret = __bch2_fs_log_msg(c, 0, fmt, args); - va_end(args); - return ret; -} - -/* - * Use for logging messages during recovery to enable reserved space and avoid - * blocking. - */ -__printf(2, 3) -int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
-{ - va_list args; - int ret; - - va_start(args, fmt); - ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); - va_end(args); - return ret; -} diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h deleted file mode 100644 index 0b98ab959719..000000000000 --- a/fs/bcachefs/btree_update.h +++ /dev/null @@ -1,429 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_UPDATE_H -#define _BCACHEFS_BTREE_UPDATE_H - -#include "btree_iter.h" -#include "journal.h" -#include "snapshot.h" - -struct bch_fs; -struct btree; - -void bch2_btree_node_prep_for_write(struct btree_trans *, - struct btree_path *, struct btree *); -bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, - struct btree *, struct btree_node_iter *, - struct bkey_i *); - -int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64); -int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64); -void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); - -void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, - struct bkey_i *, u64); - -#define BCH_TRANS_COMMIT_FLAGS() \ - x(no_enospc, "don't check for enospc") \ - x(no_check_rw, "don't attempt to take a ref on c->writes") \ - x(no_journal_res, "don't take a journal reservation, instead " \ - "pin journal entry referred to by trans->journal_res.seq") \ - x(journal_reclaim, "operation required for journal reclaim; may return error" \ - "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\ - x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied") - -enum __bch_trans_commit_flags { - /* First bits for bch_watermark: */ - __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS, -#define x(n, ...) __BCH_TRANS_COMMIT_##n, - BCH_TRANS_COMMIT_FLAGS() -#undef x -}; - -enum bch_trans_commit_flags { -#define x(n, ...) 
BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n), - BCH_TRANS_COMMIT_FLAGS() -#undef x -}; - -void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); - -int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); -int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); - -int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, - struct bkey_i *, enum btree_iter_update_trigger_flags); - -int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, - enum btree_iter_update_trigger_flags); -int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct - disk_reservation *, int flags, enum - btree_iter_update_trigger_flags iter_flags); - -int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, - struct bpos, struct bpos, unsigned, u64 *); -int bch2_btree_delete_range(struct bch_fs *, enum btree_id, - struct bpos, struct bpos, unsigned, u64 *); - -int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool); -int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); -int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); - -static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, - enum btree_id btree, struct bpos pos) -{ - return bch2_btree_bit_mod_buffered(trans, btree, pos, false); -} - -int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, - struct bpos, snapshot_id_list *); - -/* - * For use when splitting extents in existing snapshots: - * - * If @old_pos is an interior snapshot node, iterate over descendent snapshot - * nodes: for every descendent snapshot in whiche @old_pos is overwritten and - * not visible, emit a whiteout at @new_pos. - */ -static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id btree, - struct bpos old_pos, - struct bpos new_pos) -{ - BUG_ON(old_pos.snapshot != new_pos.snapshot); - - if (!btree_type_has_snapshots(btree) || - bkey_eq(old_pos, new_pos)) - return 0; - - snapshot_id_list s; - int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s); - if (ret) - return ret; - - return s.nr - ? 
__bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s) - : 0; -} - -int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, - enum btree_iter_update_trigger_flags, - struct bkey_s_c, struct bkey_s_c); - -int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, - enum btree_id, struct bpos); - -int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_iter_update_trigger_flags, - unsigned long); - -static inline int __must_check -bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags) -{ - return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_); -} - -static inline void *btree_trans_subbuf_base(struct btree_trans *trans, - struct btree_trans_subbuf *buf) -{ - return (u64 *) trans->mem + buf->base; -} - -static inline void *btree_trans_subbuf_top(struct btree_trans *trans, - struct btree_trans_subbuf *buf) -{ - return (u64 *) trans->mem + buf->base + buf->u64s; -} - -void *__bch2_trans_subbuf_alloc(struct btree_trans *, - struct btree_trans_subbuf *, - unsigned); - -static inline void * -bch2_trans_subbuf_alloc(struct btree_trans *trans, - struct btree_trans_subbuf *buf, - unsigned u64s) -{ - if (buf->u64s + u64s > buf->size) - return __bch2_trans_subbuf_alloc(trans, buf, u64s); - - void *p = btree_trans_subbuf_top(trans, buf); - buf->u64s += u64s; - return p; -} - -static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans) -{ - return btree_trans_subbuf_base(trans, &trans->journal_entries); -} - -static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans) -{ - return btree_trans_subbuf_top(trans, &trans->journal_entries); -} - -static inline struct jset_entry * -bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) -{ - return bch2_trans_subbuf_alloc(trans, &trans->journal_entries, u64s); -} - -int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); - -int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bkey_i *); - -static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) -{ - kmsan_check_memory(k, bkey_bytes(&k->k)); - - EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); - - if (unlikely(!btree_type_uses_write_buffer(btree))) { - int ret = bch2_btree_write_buffer_insert_err(trans->c, btree, k); - dump_stack(); - return ret; - } - /* - * Most updates skip the btree write buffer until journal replay is - * finished because synchronization with journal replay relies on having - * a btree node locked - if we're overwriting a key in the journal that - * journal replay hasn't yet replayed, we have to mark it as - * overwritten. 
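
The contrast drawn in this comment - overwriting keys versus accounting deltas - is easy to make concrete: replay needs only the newest value of an overwriting key, but must apply every delta. A standalone C sketch (hypothetical names):

#include <stdio.h>

enum upd_type { UPD_OVERWRITE, UPD_DELTA };

struct upd {
	enum upd_type type;
	long v;
};

static long replay(const struct upd *log, int n)
{
	long state = 0;

	for (int i = 0; i < n; i++)
		state = log[i].type == UPD_OVERWRITE
			? log[i].v		/* older values are dead */
			: state + log[i].v;	/* every delta matters */
	return state;
}

int main(void)
{
	struct upd log[] = {
		{ UPD_OVERWRITE, 10 },
		{ UPD_DELTA, 5 },
		{ UPD_DELTA, -2 },
	};

	printf("%ld\n", replay(log, 3));	/* prints 13 */
	return 0;
}
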
- * - * But accounting updates don't overwrite, they're deltas, and they have - * to be flushed to the btree strictly in order for journal replay to be - * able to tell which updates need to be applied: - */ - if (k->k.type != KEY_TYPE_accounting && - unlikely(trans->journal_replay_not_finished)) - return bch2_btree_insert_clone_trans(trans, btree, k); - - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); - int ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s); - bkey_copy(e->start, k); - return 0; -} - -void bch2_trans_commit_hook(struct btree_trans *, - struct btree_trans_commit_hook *); -int __bch2_trans_commit(struct btree_trans *, unsigned); - -int bch2_trans_log_str(struct btree_trans *, const char *); -int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); -int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *); - -__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); -__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); - -/** - * bch2_trans_commit - insert keys at given iterator positions - * - * This is main entry point for btree updates. - * - * Return values: - * -EROFS: filesystem read only - * -EIO: journal or btree node IO error - */ -static inline int bch2_trans_commit(struct btree_trans *trans, - struct disk_reservation *disk_res, - u64 *journal_seq, - unsigned flags) -{ - trans->disk_res = disk_res; - trans->journal_seq = journal_seq; - - return __bch2_trans_commit(trans, flags); -} - -#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ - lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_flags))) - -#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ - nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_flags))) - -#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \ - bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) - -#define trans_for_each_update(_trans, _i) \ - for (struct btree_insert_entry *_i = (_trans)->updates; \ - (_i) < (_trans)->updates + (_trans)->nr_updates; \ - (_i)++) - -static inline void bch2_trans_reset_updates(struct btree_trans *trans) -{ - trans_for_each_update(trans, i) - bch2_path_put(trans, i->path, true); - - trans->nr_updates = 0; - trans->journal_entries.u64s = 0; - trans->journal_entries.size = 0; - trans->accounting.u64s = 0; - trans->accounting.size = 0; - trans->hooks = NULL; - trans->extra_disk_res = 0; -} - -static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, - unsigned type, unsigned min_bytes) -{ - unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); - struct bkey_i *mut; - - if (type && k.k->type != type) - return ERR_PTR(-ENOENT); - - /* extra padding for varint_decode_fast... 
*/ - mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8); - if (!IS_ERR(mut)) { - bkey_reassemble(mut, k); - - if (unlikely(bytes > bkey_bytes(k.k))) { - memset((void *) mut + bkey_bytes(k.k), 0, - bytes - bkey_bytes(k.k)); - mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64)); - } - } - return mut; -} - -static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) -{ - return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); -} - -#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \ - bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \ - KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) - -static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k, - enum btree_iter_update_trigger_flags flags, - unsigned type, unsigned min_bytes) -{ - struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); - int ret; - - if (IS_ERR(mut)) - return mut; - - ret = bch2_trans_update(trans, iter, mut, flags); - if (ret) - return ERR_PTR(ret); - - *k = bkey_i_to_s_c(mut); - return mut; -} - -static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, - struct btree_iter *iter, struct bkey_s_c *k, - enum btree_iter_update_trigger_flags flags) -{ - return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); -} - -#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \ - bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\ - KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) - -static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags, - unsigned type, unsigned min_bytes) -{ - struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, - btree_id, pos, flags|BTREE_ITER_intent, type); - struct bkey_i *ret = IS_ERR(k.k) - ? 
ERR_CAST(k.k) - : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); - if (IS_ERR(ret)) - bch2_trans_iter_exit(trans, iter); - return ret; -} - -static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags) -{ - return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); -} - -static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags, - unsigned type, unsigned min_bytes) -{ - struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, - btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); - int ret; - - if (IS_ERR(mut)) - return mut; - - ret = bch2_trans_update(trans, iter, mut, flags); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ERR_PTR(ret); - } - - return mut; -} - -static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags, - unsigned min_bytes) -{ - return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); -} - -static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags) -{ - return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); -} - -#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ - bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \ - _btree_id, _pos, _flags, \ - KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) - -static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, - enum btree_iter_update_trigger_flags flags, - unsigned type, unsigned val_size) -{ - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); - int ret; - - if (IS_ERR(k)) - return k; - - bkey_init(&k->k); - k->k.p = iter->pos; - k->k.type = type; - set_bkey_val_bytes(&k->k, val_size); - - ret = bch2_trans_update(trans, iter, k, flags); - if (unlikely(ret)) - return ERR_PTR(ret); - return k; -} - -#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \ - bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \ - KEY_TYPE_##_type, sizeof(struct bch_##_type))) - -#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c deleted file mode 100644 index 553059b33bfd..000000000000 --- a/fs/bcachefs/btree_update_interior.c +++ /dev/null @@ -1,2854 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_gc.h" -#include "btree_journal_iter.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "buckets.h" -#include "clock.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "io_write.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "keylist.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-members.h" -#include "super-io.h" -#include "trace.h" - -#include <linux/random.h> - -static const char * const bch2_btree_update_modes[] = { -#define x(t) #t, - BTREE_UPDATE_MODES() -#undef x - 
NULL -}; - -static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *); - -static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - btree_path_idx_t, struct btree *, struct keylist *); -static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); - -/* - * Verify that child nodes correctly span parent node's range: - */ -int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) -{ - struct bch_fs *c = trans->c; - struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2 - ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key - : b->data->min_key; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - struct bkey_buf prev; - int ret = 0; - - BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, - b->data->min_key)); - - bch2_bkey_buf_init(&prev); - bkey_init(&prev.k->k); - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - - if (b == btree_node_root(c, b)) { - if (!bpos_eq(b->data->min_key, POS_MIN)) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "btree root with incorrect min_key: "); - bch2_bpos_to_text(&buf, b->data->min_key); - prt_newline(&buf); - - bch2_count_fsck_err(c, btree_root_bad_min_key, &buf); - goto err; - } - - if (!bpos_eq(b->data->max_key, SPOS_MAX)) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "btree root with incorrect max_key: "); - bch2_bpos_to_text(&buf, b->data->max_key); - prt_newline(&buf); - - bch2_count_fsck_err(c, btree_root_bad_max_key, &buf); - goto err; - } - } - - if (!b->c.level) - goto out; - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - if (k.k->type != KEY_TYPE_btree_ptr_v2) - goto out; - - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - - struct bpos expected_min = bkey_deleted(&prev.k->k) - ? 
node_min - : bpos_successor(prev.k->k.p); - - if (!bpos_eq(expected_min, bp.v->min_key)) { - prt_str(&buf, "end of prev node doesn't match start of next node"); - prt_str(&buf, "\nprev "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - prt_str(&buf, "\nnext "); - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf); - goto err; - } - - bch2_bkey_buf_reassemble(&prev, c, k); - bch2_btree_and_journal_iter_advance(&iter); - } - - if (bkey_deleted(&prev.k->k)) { - prt_printf(&buf, "empty interior node\n"); - bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf); - goto err; - } - - if (!bpos_eq(prev.k->k.p, b->key.k.p)) { - prt_str(&buf, "last child node doesn't end at end of parent node\nchild: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - prt_newline(&buf); - - bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf); - goto err; - } -out: - bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&prev, c); - printbuf_exit(&buf); - return ret; -err: - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_newline(&buf); - - ret = __bch2_topology_error(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - BUG_ON(!ret); - goto out; -} - -/* Calculate ideal packed bkey format for new btree nodes: */ - -static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) -{ - struct bkey_packed *k; - struct bkey uk; - - for_each_bset(b, t) - bset_tree_for_each_key(b, t, k) - if (!bkey_deleted(k)) { - uk = bkey_unpack_key(b, k); - bch2_bkey_format_add_key(s, &uk); - } -} - -static struct bkey_format bch2_btree_calc_format(struct btree *b) -{ - struct bkey_format_state s; - - bch2_bkey_format_init(&s); - bch2_bkey_format_add_pos(&s, b->data->min_key); - bch2_bkey_format_add_pos(&s, b->data->max_key); - __bch2_btree_calc_format(&s, b); - - return bch2_bkey_format_done(&s); -} - -static size_t btree_node_u64s_with_format(struct btree_nr_keys nr, - struct bkey_format *old_f, - struct bkey_format *new_f) -{ - /* stupid integer promotion rules */ - ssize_t delta = - (((int) new_f->key_u64s - old_f->key_u64s) * - (int) nr.packed_keys) + - (((int) new_f->key_u64s - BKEY_U64s) * - (int) nr.unpacked_keys); - - BUG_ON(delta + nr.live_u64s < 0); - - return nr.live_u64s + delta; -} - -/** - * bch2_btree_node_format_fits - check if we could rewrite node with a new format - * - * @c: filesystem handle - * @b: btree node to rewrite - * @nr: number of keys for new node (i.e. b->nr) - * @new_f: bkey format to translate keys to - * - * Returns: true if all re-packed keys will be able to fit in a new node. - * - * Assumes all keys will successfully pack with the new format. 
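
The calculation in btree_node_u64s_with_format() above is plain integer arithmetic; here is a standalone C restatement (editorial sketch - BKEY_U64s is given an assumed value purely for illustration):

#include <stdio.h>

#define BKEY_U64s 5	/* assumed size of an unpacked key header, in u64s */

static long u64s_with_format(long live_u64s, long packed, long unpacked,
			     int old_key_u64s, int new_key_u64s)
{
	/* Each packed key changes by the format delta, each unpacked key
	 * by the difference from the unpacked header size: */
	long delta = (long)(new_key_u64s - old_key_u64s) * packed +
		     (long)(new_key_u64s - BKEY_U64s) * unpacked;

	return live_u64s + delta;
}

int main(void)
{
	/* 100 packed keys shrinking from 6 to 4 u64s frees 200 u64s: */
	printf("%ld\n", u64s_with_format(1000, 100, 0, 6, 4));	/* prints 800 */
	return 0;
}
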
- */ -static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, - struct btree_nr_keys nr, - struct bkey_format *new_f) -{ - size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f); - - return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b); -} - -/* Btree node freeing/allocation: */ - -static void __btree_node_free(struct btree_trans *trans, struct btree *b) -{ - struct bch_fs *c = trans->c; - - trace_and_count(c, btree_node_free, trans, b); - - BUG_ON(btree_node_write_blocked(b)); - BUG_ON(btree_node_dirty(b)); - BUG_ON(btree_node_need_write(b)); - BUG_ON(b == btree_node_root(c, b)); - BUG_ON(b->ob.nr); - BUG_ON(!list_empty(&b->write_blocked)); - BUG_ON(b->will_make_reachable); - - clear_btree_node_noevict(b); -} - -static void bch2_btree_node_free_inmem(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - struct bch_fs *c = trans->c; - - bch2_btree_node_lock_write_nofail(trans, path, &b->c); - - __btree_node_free(trans, b); - - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - - six_unlock_write(&b->c.lock); - mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - - bch2_trans_node_drop(trans, b); -} - -static void bch2_btree_node_free_never_used(struct btree_update *as, - struct btree_trans *trans, - struct btree *b) -{ - struct bch_fs *c = as->c; - struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; - - BUG_ON(!list_empty(&b->write_blocked)); - BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); - - b->will_make_reachable = 0; - closure_put(&as->cl); - - clear_btree_node_will_make_reachable(b); - clear_btree_node_accessed(b); - clear_btree_node_dirty_acct(c, b); - clear_btree_node_need_write(b); - - mutex_lock(&c->btree_cache.lock); - __bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - - BUG_ON(p->nr >= ARRAY_SIZE(p->b)); - p->b[p->nr++] = b; - - six_unlock_intent(&b->c.lock); - - bch2_trans_node_drop(trans, b); -} - -static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, - struct disk_reservation *res, - struct closure *cl, - bool interior_node, - unsigned target, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct write_point *wp; - struct btree *b; - BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct open_buckets obs = { .nr = 0 }; - struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim - ? BTREE_NODE_RESERVE - : 0; - int ret; - - b = bch2_btree_node_mem_alloc(trans, interior_node); - if (IS_ERR(b)) - return b; - - BUG_ON(b->ob.nr); - - mutex_lock(&c->btree_reserve_cache_lock); - if (c->btree_reserve_cache_nr > nr_reserve) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - obs = a->ob; - bkey_copy(&tmp.k, &a->k); - mutex_unlock(&c->btree_reserve_cache_lock); - goto out; - } - mutex_unlock(&c->btree_reserve_cache_lock); -retry: - ret = bch2_alloc_sectors_start_trans(trans, - target ?: - c->opts.metadata_target ?: - c->opts.foreground_target, - 0, - writepoint_ptr(&c->btree_write_point), - &devs_have, - res->nr_replicas, - min(res->nr_replicas, - c->opts.metadata_replicas_required), - watermark, - target ? 
BCH_WRITE_only_specified_devs : 0, - cl, &wp); - if (unlikely(ret)) - goto err; - - if (wp->sectors_free < btree_sectors(c)) { - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ob->sectors_free < btree_sectors(c)) - ob->sectors_free = 0; - - bch2_alloc_sectors_done(c, wp); - goto retry; - } - - bkey_btree_ptr_v2_init(&tmp.k); - bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); - - bch2_open_bucket_get(c, wp, &obs); - bch2_alloc_sectors_done(c, wp); -out: - bkey_copy(&b->key, &tmp.k); - b->ob = obs; - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - return b; -err: - bch2_btree_node_to_freelist(c, b); - return ERR_PTR(ret); -} - -static struct btree *bch2_btree_node_alloc(struct btree_update *as, - struct btree_trans *trans, - unsigned level) -{ - struct bch_fs *c = as->c; - struct btree *b; - struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; - int ret; - - BUG_ON(level >= BTREE_MAX_DEPTH); - BUG_ON(!p->nr); - - b = p->b[--p->nr]; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - - set_btree_node_accessed(b); - set_btree_node_dirty_acct(c, b); - set_btree_node_need_write(b); - - bch2_bset_init_first(b, &b->data->keys); - b->c.level = level; - b->c.btree_id = as->btree_id; - b->version_ondisk = c->sb.version; - - memset(&b->nr, 0, sizeof(b->nr)); - b->data->magic = cpu_to_le64(bset_magic(c)); - memset(&b->data->_ptr, 0, sizeof(b->data->_ptr)); - b->data->flags = 0; - SET_BTREE_NODE_ID(b->data, as->btree_id); - SET_BTREE_NODE_LEVEL(b->data, level); - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); - - bp->v.mem_ptr = 0; - bp->v.seq = b->data->keys.seq; - bp->v.sectors_written = 0; - } - - SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); - - bch2_btree_build_aux_trees(b); - - ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); - BUG_ON(ret); - - trace_and_count(c, btree_node_alloc, trans, b); - bch2_increment_clock(c, btree_sectors(c), WRITE); - return b; -} - -static void btree_set_min(struct btree *b, struct bpos pos) -{ - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; - b->data->min_key = pos; -} - -static void btree_set_max(struct btree *b, struct bpos pos) -{ - b->key.k.p = pos; - b->data->max_key = pos; -} - -static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, - struct btree_trans *trans, - struct btree *b) -{ - struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level); - struct bkey_format format = bch2_btree_calc_format(b); - - /* - * The keys might expand with the new format - if they wouldn't fit in - * the btree node anymore, use the old format for now: - */ - if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format)) - format = b->format; - - SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); - - btree_set_min(n, b->data->min_key); - btree_set_max(n, b->data->max_key); - - n->data->format = format; - btree_node_set_format(n, format); - - bch2_btree_sort_into(as->c, n, b); - - btree_node_reset_sib_u64s(n); - return n; -} - -static struct btree *__btree_root_alloc(struct btree_update *as, - struct btree_trans *trans, unsigned level) -{ - struct btree *b = bch2_btree_node_alloc(as, trans, level); - - btree_set_min(b, POS_MIN); - btree_set_max(b, SPOS_MAX); - b->data->format = bch2_btree_calc_format(b); - - btree_node_set_format(b, 
b->data->format); - bch2_btree_build_aux_trees(b); - - return b; -} - -static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans) -{ - struct bch_fs *c = as->c; - struct prealloc_nodes *p; - - for (p = as->prealloc_nodes; - p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); - p++) { - while (p->nr) { - struct btree *b = p->b[--p->nr]; - - mutex_lock(&c->btree_reserve_cache_lock); - - if (c->btree_reserve_cache_nr < - ARRAY_SIZE(c->btree_reserve_cache)) { - struct btree_alloc *a = - &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; - - a->ob = b->ob; - b->ob.nr = 0; - bkey_copy(&a->k, &b->key); - } else { - bch2_open_buckets_put(c, &b->ob); - } - - mutex_unlock(&c->btree_reserve_cache_lock); - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - __btree_node_free(trans, b); - bch2_btree_node_to_freelist(c, b); - } - } -} - -static int bch2_btree_reserve_get(struct btree_trans *trans, - struct btree_update *as, - unsigned nr_nodes[2], - unsigned target, - unsigned flags, - struct closure *cl) -{ - struct btree *b; - unsigned interior; - int ret = 0; - - BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); - - /* - * Protects reaping from the btree node cache and using the btree node - * open bucket reserve: - */ - ret = bch2_btree_cache_cannibalize_lock(trans, cl); - if (ret) - return ret; - - for (interior = 0; interior < 2; interior++) { - struct prealloc_nodes *p = as->prealloc_nodes + interior; - - while (p->nr < nr_nodes[interior]) { - b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, - interior, target, flags); - if (IS_ERR(b)) { - ret = PTR_ERR(b); - goto err; - } - - p->b[p->nr++] = b; - } - } -err: - bch2_btree_cache_cannibalize_unlock(trans); - return ret; -} - -/* Asynchronous interior node update machinery */ - -static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans) -{ - struct bch_fs *c = as->c; - - if (as->took_gc_lock) - up_read(&c->gc_lock); - as->took_gc_lock = false; - - bch2_journal_pin_drop(&c->journal, &as->journal); - bch2_journal_pin_flush(&c->journal, &as->journal); - bch2_disk_reservation_put(c, &as->disk_res); - bch2_btree_reserve_put(as, trans); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], - as->start_time); - - mutex_lock(&c->btree_interior_update_lock); - list_del(&as->unwritten_list); - list_del(&as->list); - - closure_debug_destroy(&as->cl); - mempool_free(as, &c->btree_interior_update_pool); - - /* - * Have to do the wakeup with btree_interior_update_lock still held, - * since being on btree_interior_update_list is our ref on @c: - */ - closure_wake_up(&c->btree_interior_update_wait); - - mutex_unlock(&c->btree_interior_update_lock); -} - -static void btree_update_add_key(struct btree_update *as, - struct keylist *keys, struct btree *b) -{ - struct bkey_i *k = &b->key; - - BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s > - ARRAY_SIZE(as->_old_keys)); - - bkey_copy(keys->top, k); - bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1; - - bch2_keylist_push(keys); -} - -static bool btree_update_new_nodes_marked_sb(struct btree_update *as) -{ - for_each_keylist_key(&as->new_keys, k) - if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k))) - return false; - return true; -} - -static void btree_update_new_nodes_mark_sb(struct btree_update *as) -{ - struct bch_fs *c = as->c; - - mutex_lock(&c->sb_lock); - for_each_keylist_key(&as->new_keys, k) - bch2_dev_btree_bitmap_mark(c, 
bkey_i_to_s_c(k)); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); -} - -/* - * The transactional part of an interior btree node update, where we journal the - * update we did to the interior node and update alloc info: - */ -static int btree_update_nodes_written_trans(struct btree_trans *trans, - struct btree_update *as) -{ - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s); - int ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64)); - - trans->journal_pin = &as->journal; - - for_each_keylist_key(&as->old_keys, k) { - unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - - ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k), - BTREE_TRIGGER_transactional); - if (ret) - return ret; - } - - for_each_keylist_key(&as->new_keys, k) { - unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - - ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k), - BTREE_TRIGGER_transactional); - if (ret) - return ret; - } - - return 0; -} - -/* If the node has been reused, we might be reading uninitialized memory - that's fine: */ -static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq) -{ - struct btree_node *b_data = READ_ONCE(b->data); - - return (b_data ? b_data->keys.seq : 0) == seq; -} - -static void btree_update_nodes_written(struct btree_update *as) -{ - struct bch_fs *c = as->c; - struct btree *b; - struct btree_trans *trans = bch2_trans_get(c); - u64 journal_seq = 0; - unsigned i; - int ret; - - /* - * If we're already in an error state, it might be because a btree node - * was never written, and we might be trying to free that same btree - * node here, but it won't have been marked as allocated and we'll see - * spurious disk usage inconsistencies in the transactional part below - * if we don't skip it: - */ - ret = bch2_journal_error(&c->journal); - if (ret) - goto err; - - if (!btree_update_new_nodes_marked_sb(as)) - btree_update_new_nodes_mark_sb(as); - - /* - * Wait for any in flight writes to finish before we free the old nodes - * on disk. But we haven't pinned those old nodes in the btree cache, - * they might have already been evicted. - * - * The update we're completing deleted references to those nodes from the - * btree, so we know if they've been evicted they can't be pulled back in. - * We just have to check if the nodes we have pointers to are still those - * old nodes, and haven't been reused. - * - * This can't be done locklessly because the data buffer might have been - * vmalloc allocated, and they're not RCU freed. We also need the - * __no_kmsan_checks annotation because even with the btree node read - * lock, nothing tells us that the data buffer has been initialized (if - * the btree node has been reused for a different node, and the data - * buffer swapped for a new data buffer). - */ - for (i = 0; i < as->nr_old_nodes; i++) { - b = as->old_nodes[i]; - - bch2_trans_begin(trans); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]); - six_unlock_read(&b->c.lock); - bch2_trans_unlock_long(trans); - - if (seq_matches) - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, - TASK_UNINTERRUPTIBLE); - } - - /* - * We did an update to a parent node where the pointers we added pointed - * to child nodes that weren't written yet: now, the child nodes have - * been written so we can write out the update to the interior node. 
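
The reuse detection above (btree_node_seq_matches()) reduces to comparing a remembered sequence number against whatever currently occupies the node's memory. A minimal C sketch (invented structures):

#include <stdbool.h>
#include <stdint.h>

struct node {
	uint64_t seq;	/* regenerated whenever the buffer is reused */
};

/*
 * Only wait on writes for a node whose sequence number still matches the
 * one recorded when the update started; a mismatch means the memory now
 * backs a different node, which this update no longer references.
 */
static bool still_same_node(const struct node *b, uint64_t seq_at_start)
{
	return b->seq == seq_at_start;
}

int main(void)
{
	struct node b = { .seq = 42 };

	return still_same_node(&b, 42) ? 0 : 1;
}
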
- */ - - /* - * We can't call into journal reclaim here: we'd block on the journal - * reclaim lock, but we may need to release the open buckets we have - * pinned in order for other btree updates to make forward progress, and - * journal reclaim does btree updates when flushing bkey_cached entries, - * which may require allocations as well. - */ - ret = commit_do(trans, &as->disk_res, &journal_seq, - BCH_WATERMARK_interior_updates| - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_journal_reclaim, - btree_update_nodes_written_trans(trans, as)); - bch2_trans_unlock(trans); - - bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, - "%s", bch2_err_str(ret)); -err: - /* - * Ensure transaction is unlocked before using btree_node_lock_nopath() - * (the use of which is always suspect, we need to work on removing this - * in the future) - * - * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get() - * calls bch2_path_upgrade(), before we call path_make_mut(), so we may - * rarely end up with a locked path besides the one we have here: - */ - bch2_trans_unlock(trans); - bch2_trans_begin(trans); - - /* - * We have to be careful because another thread might be getting ready - * to free as->b and calling btree_update_reparent() on us - we'll - * recheck under btree_update_lock below: - */ - b = READ_ONCE(as->b); - if (b) { - /* - * @b is the node we did the final insert into: - * - * On failure to get a journal reservation, we still have to - * unblock the write and allow most of the write path to happen - * so that shutdown works, but the i->journal_seq mechanism - * won't work to prevent the btree write from being visible (we - * didn't get a journal sequence number) - instead - * __bch2_btree_node_write() doesn't do the actual write if - * we're in journal error state: - */ - - btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans, - as->btree_id, b->c.level, b->key.k.p); - struct btree_path *path = trans->paths + path_idx; - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); - mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED); - path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); - path->l[b->c.level].b = b; - - bch2_btree_node_lock_write_nofail(trans, path, &b->c); - - mutex_lock(&c->btree_interior_update_lock); - - list_del(&as->write_blocked_list); - if (list_empty(&b->write_blocked)) - clear_btree_node_write_blocked(b); - - /* - * Node might have been freed, recheck under - * btree_interior_update_lock: - */ - if (as->b == b) { - BUG_ON(!b->c.level); - BUG_ON(!btree_node_dirty(b)); - - if (!ret) { - struct bset *last = btree_bset_last(b); - - last->journal_seq = cpu_to_le64( - max(journal_seq, - le64_to_cpu(last->journal_seq))); - - bch2_btree_add_journal_pin(c, b, journal_seq); - } else { - /* - * If we didn't get a journal sequence number we - * can't write this btree node, because recovery - * won't know to ignore this write: - */ - set_btree_node_never_write(b); - } - } - - mutex_unlock(&c->btree_interior_update_lock); - - mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - six_unlock_write(&b->c.lock); - - btree_node_write_if_need(trans, b, SIX_LOCK_intent); - btree_node_unlock(trans, path, b->c.level); - bch2_path_put(trans, path_idx, true); - } - - bch2_journal_pin_drop(&c->journal, &as->journal); - - mutex_lock(&c->btree_interior_update_lock); - for (i = 0; i < as->nr_new_nodes; i++) { - b = as->new_nodes[i]; - - BUG_ON(b->will_make_reachable != (unsigned 
long) as); - b->will_make_reachable = 0; - clear_btree_node_will_make_reachable(b); - } - mutex_unlock(&c->btree_interior_update_lock); - - for (i = 0; i < as->nr_new_nodes; i++) { - b = as->new_nodes[i]; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - btree_node_write_if_need(trans, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - } - - for (i = 0; i < as->nr_open_buckets; i++) - bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); - - bch2_btree_update_free(as, trans); - bch2_trans_put(trans); -} - -static void btree_interior_update_work(struct work_struct *work) -{ - struct bch_fs *c = - container_of(work, struct bch_fs, btree_interior_update_work); - struct btree_update *as; - - while (1) { - mutex_lock(&c->btree_interior_update_lock); - as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, - struct btree_update, unwritten_list); - if (as && !as->nodes_written) - as = NULL; - mutex_unlock(&c->btree_interior_update_lock); - - if (!as) - break; - - btree_update_nodes_written(as); - } -} - -static CLOSURE_CALLBACK(btree_update_set_nodes_written) -{ - closure_type(as, struct btree_update, cl); - struct bch_fs *c = as->c; - - mutex_lock(&c->btree_interior_update_lock); - as->nodes_written = true; - mutex_unlock(&c->btree_interior_update_lock); - - queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); -} - -/* - * We're updating @b with pointers to nodes that haven't finished writing yet: - * block @b from being written until @as completes - */ -static void btree_update_updated_node(struct btree_update *as, struct btree *b) -{ - struct bch_fs *c = as->c; - - BUG_ON(as->mode != BTREE_UPDATE_none); - BUG_ON(as->update_level_end < b->c.level); - BUG_ON(!btree_node_dirty(b)); - BUG_ON(!b->c.level); - - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - - as->mode = BTREE_UPDATE_node; - as->b = b; - as->update_level_end = b->c.level; - - set_btree_node_write_blocked(b); - list_add(&as->write_blocked_list, &b->write_blocked); - - mutex_unlock(&c->btree_interior_update_lock); -} - -static int bch2_update_reparent_journal_pin_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) -{ - return 0; -} - -static void btree_update_reparent(struct btree_update *as, - struct btree_update *child) -{ - struct bch_fs *c = as->c; - - lockdep_assert_held(&c->btree_interior_update_lock); - - child->b = NULL; - child->mode = BTREE_UPDATE_update; - - bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, - bch2_update_reparent_journal_pin_flush); -} - -static void btree_update_updated_root(struct btree_update *as, struct btree *b) -{ - struct bkey_i *insert = &b->key; - struct bch_fs *c = as->c; - - BUG_ON(as->mode != BTREE_UPDATE_none); - - BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > - ARRAY_SIZE(as->journal_entries)); - - as->journal_u64s += - journal_entry_set((void *) &as->journal_entries[as->journal_u64s], - BCH_JSET_ENTRY_btree_root, - b->c.btree_id, b->c.level, - insert, insert->k.u64s); - - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - - as->mode = BTREE_UPDATE_root; - mutex_unlock(&c->btree_interior_update_lock); -} - -/* - * bch2_btree_update_add_new_node: - * - * This causes @as to wait on @b to be written, before it gets to - * bch2_btree_update_nodes_written - * - * Additionally, it sets b->will_make_reachable to prevent any additional writes - * to @b from 
happening besides the first until @b is reachable on disk - * - * And it adds @b to the list of @as's new nodes, so that we can update sector - * counts in bch2_btree_update_nodes_written: - */ -static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) -{ - struct bch_fs *c = as->c; - - closure_get(&as->cl); - - mutex_lock(&c->btree_interior_update_lock); - BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); - BUG_ON(b->will_make_reachable); - - as->new_nodes[as->nr_new_nodes++] = b; - b->will_make_reachable = 1UL|(unsigned long) as; - set_btree_node_will_make_reachable(b); - - mutex_unlock(&c->btree_interior_update_lock); - - btree_update_add_key(as, &as->new_keys, b); - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data; - unsigned sectors = round_up(bytes, block_bytes(c)) >> 9; - - bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = - cpu_to_le16(sectors); - } -} - -/* - * If @b was a new node of some btree_update, remove it from that update's - * list of new nodes and drop the closure ref it held: - */ -static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) -{ - struct btree_update *as; - unsigned long v; - unsigned i; - - mutex_lock(&c->btree_interior_update_lock); - /* - * When b->will_make_reachable != 0, it owns a ref on as->cl that's - * dropped when it gets written by bch2_btree_complete_write - the - * xchg() is for synchronization with bch2_btree_complete_write: - */ - v = xchg(&b->will_make_reachable, 0); - clear_btree_node_will_make_reachable(b); - as = (struct btree_update *) (v & ~1UL); - - if (!as) { - mutex_unlock(&c->btree_interior_update_lock); - return; - } - - for (i = 0; i < as->nr_new_nodes; i++) - if (as->new_nodes[i] == b) - goto found; - - BUG(); -found: - array_remove_item(as->new_nodes, as->nr_new_nodes, i); - mutex_unlock(&c->btree_interior_update_lock); - - if (v & 1) - closure_put(&as->cl); -} - -static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) -{ - while (b->ob.nr) - as->open_buckets[as->nr_open_buckets++] = - b->ob.v[--b->ob.nr]; -} - -static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) -{ - return 0; -} - -/* - * @b is being split/rewritten: it may have pointers to not-yet-written btree - * nodes and thus outstanding btree_updates - redirect @b's - * btree_updates to point to this btree_update: - */ -static void bch2_btree_interior_update_will_free_node(struct btree_update *as, - struct btree *b) -{ - struct bch_fs *c = as->c; - struct btree_update *p, *n; - struct btree_write *w; - - set_btree_node_dying(b); - - if (btree_node_fake(b)) - return; - - mutex_lock(&c->btree_interior_update_lock); - - /* - * Does this node have any btree_update operations preventing - * it from being written? - * - * If so, redirect them to point to this btree_update: we can - * write out our new nodes, but we won't make them visible until those - * operations complete - */ - list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { - list_del_init(&p->write_blocked_list); - btree_update_reparent(as, p); - - /* - * for flush_held_btree_writes() waiting on updates to flush or - * nodes to be writeable: - */ - closure_wake_up(&c->btree_interior_update_wait); - } - - clear_btree_node_dirty_acct(c, b); - clear_btree_node_need_write(b); - clear_btree_node_write_blocked(b); - - /* - * Does this node have unwritten data that has a pin on the journal?
- * - * If so, transfer that pin to the btree_update operation - - * note that if we're freeing multiple nodes, we only need to keep the - * oldest pin of any of the nodes we're freeing. We'll release the pin - * when the new nodes are persistent and reachable on disk: - */ - w = btree_current_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, - bch2_btree_update_will_free_node_journal_pin_flush); - bch2_journal_pin_drop(&c->journal, &w->journal); - - w = btree_prev_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, - bch2_btree_update_will_free_node_journal_pin_flush); - bch2_journal_pin_drop(&c->journal, &w->journal); - - mutex_unlock(&c->btree_interior_update_lock); - - /* - * Is this a node that isn't reachable on disk yet? - * - * Nodes that aren't reachable yet have writes blocked until they're - * reachable - now that we've cancelled any pending writes and moved - * things waiting on that write to wait on this update, we can drop this - * node from the list of nodes that the other update is making - * reachable, prior to freeing it: - */ - btree_update_drop_new_node(c, b); - - btree_update_add_key(as, &as->old_keys, b); - - as->old_nodes[as->nr_old_nodes] = b; - as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; - as->nr_old_nodes++; -} - -static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans) -{ - struct bch_fs *c = as->c; - u64 start_time = as->start_time; - - BUG_ON(as->mode == BTREE_UPDATE_none); - - if (as->took_gc_lock) - up_read(&as->c->gc_lock); - as->took_gc_lock = false; - - bch2_btree_reserve_put(as, trans); - - continue_at(&as->cl, btree_update_set_nodes_written, - as->c->btree_interior_update_worker); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], - start_time); -} - -static const char * const btree_node_reawrite_reason_strs[] = { -#define x(n) #n, - BTREE_NODE_REWRITE_REASON() -#undef x - NULL, -}; - -static struct btree_update * -bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level_start, bool split, - unsigned target, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_update *as; - u64 start_time = local_clock(); - int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) - ? BCH_DISK_RESERVATION_NOFAIL : 0; - unsigned nr_nodes[2] = { 0, 0 }; - unsigned level_end = level_start; - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - int ret = 0; - u32 restart_count = trans->restart_count; - - BUG_ON(!path->should_be_locked); - - if (watermark == BCH_WATERMARK_copygc) - watermark = BCH_WATERMARK_btree_copygc; - if (watermark < BCH_WATERMARK_btree) - watermark = BCH_WATERMARK_btree; - - flags &= ~BCH_WATERMARK_MASK; - flags |= watermark; - - if (watermark < BCH_WATERMARK_reclaim && - test_bit(JOURNAL_space_low, &c->journal.flags)) { - if (flags & BCH_TRANS_COMMIT_journal_reclaim) - return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock); - - ret = drop_locks_do(trans, - ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; })); - if (ret) - return ERR_PTR(ret); - } - - while (1) { - nr_nodes[!!level_end] += 1 + split; - level_end++; - - ret = bch2_btree_path_upgrade(trans, path, level_end + 1); - if (ret) - return ERR_PTR(ret); - - if (!btree_path_node(path, level_end)) { - /* Allocating new root? 
*/ - nr_nodes[1] += split; - level_end = BTREE_MAX_DEPTH; - break; - } - - /* - * Always check for space for two keys, even if we won't have to - * split at prior level - it might have been a merge instead: - */ - if (bch2_btree_node_insert_fits(path->l[level_end].b, - BKEY_BTREE_PTR_U64s_MAX * 2)) - break; - - split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); - } - - if (!down_read_trylock(&c->gc_lock)) { - ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); - if (ret) { - up_read(&c->gc_lock); - return ERR_PTR(ret); - } - } - - as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); - memset(as, 0, sizeof(*as)); - closure_init(&as->cl, NULL); - as->c = c; - as->start_time = start_time; - as->ip_started = _RET_IP_; - as->mode = BTREE_UPDATE_none; - as->flags = flags; - as->took_gc_lock = true; - as->btree_id = path->btree_id; - as->update_level_start = level_start; - as->update_level_end = level_end; - INIT_LIST_HEAD(&as->list); - INIT_LIST_HEAD(&as->unwritten_list); - INIT_LIST_HEAD(&as->write_blocked_list); - bch2_keylist_init(&as->old_keys, as->_old_keys); - bch2_keylist_init(&as->new_keys, as->_new_keys); - bch2_keylist_init(&as->parent_keys, as->inline_keys); - - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->list, &c->btree_interior_update_list); - mutex_unlock(&c->btree_interior_update_lock); - - struct btree *b = btree_path_node(path, path->level); - as->node_start = b->data->min_key; - as->node_end = b->data->max_key; - as->node_needed_rewrite = btree_node_rewrite_reason(b); - as->node_written = b->written; - as->node_sectors = btree_buf_bytes(b) >> 9; - as->node_remaining = __bch2_btree_u64s_remaining(b, - btree_bkey_last(b, bset_tree_last(b))); - - /* - * We don't want to allocate if we're in an error state, that can cause - * deadlock on emergency shutdown due to open buckets getting stuck in - * the btree_reserve_cache after allocator shutdown has cleared it out. - * This check needs to come after adding us to the btree_interior_update - * list but before calling bch2_btree_reserve_get, to synchronize with - * __bch2_fs_read_only(). 
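Earlier in bch2_btree_update_start() the caller's watermark is clamped by masking it out of the flags word and or-ing a raised value back in. A hedged sketch of that encoding, with the mask width and enum values invented for illustration:

#include <stdint.h>

#define WATERMARK_MASK  0xfu    /* watermark lives in the low bits of flags */

enum watermark { WM_normal, WM_copygc, WM_btree, WM_reclaim, WM_interior };

static uint32_t clamp_watermark(uint32_t flags, enum watermark floor)
{
        enum watermark wm = flags & WATERMARK_MASK;

        if (wm < floor)
                wm = floor;

        return (flags & ~WATERMARK_MASK) | wm;  /* rewrite the low bits */
}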
- */ - ret = bch2_journal_error(&c->journal); - if (ret) - goto err; - - ret = bch2_disk_reservation_get(c, &as->disk_res, - (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), - READ_ONCE(c->opts.metadata_replicas), - disk_res_flags); - if (ret) - goto err; - - ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL); - if (bch2_err_matches(ret, ENOSPC) || - bch2_err_matches(ret, ENOMEM)) { - struct closure cl; - - /* - * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK - * flag - */ - if (bch2_err_matches(ret, ENOSPC) && - (flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < BCH_WATERMARK_reclaim) { - ret = bch_err_throw(c, journal_reclaim_would_deadlock); - goto err; - } - - closure_init_stack(&cl); - - do { - ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); - if (!bch2_err_matches(ret, BCH_ERR_operation_blocked)) - break; - bch2_trans_unlock(trans); - bch2_wait_on_allocator(c, &cl); - } while (1); - } - - if (ret) { - trace_and_count(c, btree_reserve_get_fail, trans->fn, - _RET_IP_, nr_nodes[0] + nr_nodes[1], ret); - goto err; - } - - ret = bch2_trans_relock(trans); - if (ret) - goto err; - - bch2_trans_verify_not_restarted(trans, restart_count); - return as; -err: - bch2_btree_update_free(as, trans); - if (!bch2_err_matches(ret, ENOSPC) && - !bch2_err_matches(ret, EROFS) && - ret != -BCH_ERR_journal_reclaim_would_deadlock && - ret != -BCH_ERR_journal_shutdown) - bch_err_fn_ratelimited(c, ret); - return ERR_PTR(ret); -} - -/* Btree root updates: */ - -static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) -{ - /* Root nodes cannot be reaped */ - mutex_lock(&c->btree_cache.lock); - list_del_init(&b->list); - mutex_unlock(&c->btree_cache.lock); - - mutex_lock(&c->btree_root_lock); - bch2_btree_id_root(c, b->c.btree_id)->b = b; - mutex_unlock(&c->btree_root_lock); - - bch2_recalc_btree_reserve(c); -} - -static int bch2_btree_set_root(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - bool nofail) -{ - struct bch_fs *c = as->c; - - trace_and_count(c, btree_node_set_root, trans, b); - - struct btree *old = btree_node_root(c, b); - - /* - * Ensure no one is using the old root while we switch to the - * new root: - */ - if (nofail) { - bch2_btree_node_lock_write_nofail(trans, path, &old->c); - } else { - int ret = bch2_btree_node_lock_write(trans, path, &old->c); - if (ret) - return ret; - } - - bch2_btree_set_root_inmem(c, b); - - btree_update_updated_root(as, b); - - /* - * Unlock old root after new root is visible: - * - * The new root isn't persistent, but that's ok: we still have - * an intent lock on the new root, and any updates that would - * depend on the new root would have to update the new root. 
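In other words: publish the new root while the old root's write lock is still held, so no traversal can observe a window with no root. Schematically, with the six-lock machinery reduced to a plain mutex and a release store (an illustration, not the real locking):

#include <pthread.h>

struct tree {
        pthread_mutex_t lock;   /* stands in for the old root's write lock */
        void *root;             /* traversals start here */
};

static void set_root(struct tree *t, void *new_root)
{
        pthread_mutex_lock(&t->lock);
        /*
         * Readers that start now go through new_root; in the real code we
         * also still hold an intent lock on new_root, so updates that
         * depend on it are serialized against us:
         */
        __atomic_store_n(&t->root, new_root, __ATOMIC_RELEASE);
        pthread_mutex_unlock(&t->lock);
}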
- */ - bch2_btree_node_unlock_write(trans, path, old); - return 0; -} - -/* Interior node updates: */ - -static void bch2_insert_fixup_btree_ptr(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter *node_iter, - struct bkey_i *insert) -{ - struct bch_fs *c = as->c; - struct bkey_packed *k; - struct printbuf buf = PRINTBUF; - unsigned long old, new; - - BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && - !btree_ptr_sectors_written(bkey_i_to_s_c(insert))); - - if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) - bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - - struct bkey_validate_context from = (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level, - .btree = b->c.btree_id, - .flags = BCH_VALIDATE_commit, - }; - if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?: - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) { - bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__); - dump_stack(); - } - - BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > - ARRAY_SIZE(as->journal_entries)); - - as->journal_u64s += - journal_entry_set((void *) &as->journal_entries[as->journal_u64s], - BCH_JSET_ENTRY_btree_keys, - b->c.btree_id, b->c.level, - insert, insert->k.u64s); - - while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && - bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) - bch2_btree_node_iter_advance(node_iter, b); - - bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); - set_btree_node_dirty_acct(c, b); - - old = READ_ONCE(b->flags); - do { - new = old; - - new &= ~BTREE_WRITE_TYPE_MASK; - new |= BTREE_WRITE_interior; - new |= 1 << BTREE_NODE_need_write; - } while (!try_cmpxchg(&b->flags, &old, new)); - - printbuf_exit(&buf); -} - -static int -bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter node_iter, - struct keylist *keys) -{ - struct bkey_i *insert = bch2_keylist_front(keys); - struct bkey_packed *k; - - BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); - - while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && - (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) - ; - - for (; - insert != keys->top && bpos_le(insert->k.p, b->key.k.p); - insert = bkey_next(insert)) - bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); - - int ret = bch2_btree_node_check_topology(trans, b); - if (ret) { - struct printbuf buf = PRINTBUF; - - for (struct bkey_i *k = keys->keys; - k != insert; - k = bkey_next(k)) { - bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); - prt_newline(&buf); - } - - bch2_fs_fatal_error(as->c, "%ps -> %s(): check_topology error %s: inserted keys\n%s", - (void *) _RET_IP_, __func__, bch2_err_str(ret), buf.buf); - dump_stack(); - return ret; - } - - memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data); - keys->top_p -= insert->_data - keys->keys_p; - return 0; -} - -static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) -{ - if (insert_keys) - for_each_keylist_key(insert_keys, k) - if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos)) - return true; - return false; -} - -/* - * Move keys from n1 (original replacement node, now lower node) to n2 (higher - * node) - */ -static void __btree_split_node(struct btree_update *as, - struct btree_trans *trans, - struct btree *b, - struct btree *n[2], - struct keylist *insert_keys) -{ - struct 
bkey_packed *k; - struct bpos n1_pos = POS_MIN; - struct btree_node_iter iter; - struct bset *bsets[2]; - struct bkey_format_state format[2]; - struct bkey_packed *out[2]; - struct bkey uk; - unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5; - struct { unsigned nr_keys, val_u64s; } nr_keys[2]; - int i; - - memset(&nr_keys, 0, sizeof(nr_keys)); - - for (i = 0; i < 2; i++) { - BUG_ON(n[i]->nsets != 1); - - bsets[i] = btree_bset_first(n[i]); - out[i] = bsets[i]->start; - - SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1); - bch2_bkey_format_init(&format[i]); - } - - u64s = 0; - for_each_btree_node_key(b, k, &iter) { - if (bkey_deleted(k)) - continue; - - uk = bkey_unpack_key(b, k); - - if (b->c.level && - u64s < n1_u64s && - u64s + k->u64s >= n1_u64s && - (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) || - key_deleted_in_insert(insert_keys, uk.p))) - n1_u64s += k->u64s; - - i = u64s >= n1_u64s; - u64s += k->u64s; - if (!i) - n1_pos = uk.p; - bch2_bkey_format_add_key(&format[i], &uk); - - nr_keys[i].nr_keys++; - nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k); - } - - btree_set_min(n[0], b->data->min_key); - btree_set_max(n[0], n1_pos); - btree_set_min(n[1], bpos_successor(n1_pos)); - btree_set_max(n[1], b->data->max_key); - - for (i = 0; i < 2; i++) { - bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key); - bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key); - - n[i]->data->format = bch2_bkey_format_done(&format[i]); - - unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s + - nr_keys[i].val_u64s; - if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b)) - n[i]->data->format = b->format; - - btree_node_set_format(n[i], n[i]->data->format); - } - - u64s = 0; - for_each_btree_node_key(b, k, &iter) { - if (bkey_deleted(k)) - continue; - - i = u64s >= n1_u64s; - u64s += k->u64s; - - if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k) - ? &b->format: &bch2_bkey_format_current, k)) - out[i]->format = KEY_FORMAT_LOCAL_BTREE; - else - bch2_bkey_unpack(b, (void *) out[i], k); - - out[i]->needs_whiteout = false; - - btree_keys_account_key_add(&n[i]->nr, 0, out[i]); - out[i] = bkey_p_next(out[i]); - } - - for (i = 0; i < 2; i++) { - bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data); - - BUG_ON(!bsets[i]->u64s); - - set_btree_bset_end(n[i], n[i]->set); - - btree_node_reset_sib_u64s(n[i]); - - bch2_verify_btree_nr_keys(n[i]); - - BUG_ON(bch2_btree_node_check_topology(trans, n[i])); - } -} - -/* - * For updates to interior nodes, we've got to do the insert before we split - * because the stuff we're inserting has to be inserted atomically. Post split, - * the keys might have to go in different nodes and the split would no longer be - * atomic. 
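__btree_split_node() above biases the pivot so that roughly 60% of the live u64s land in the lower node (n1_u64s = live_u64s * 3 / 5), leaving slack in the node that keeps absorbing appends. A standalone sketch of that pivot choice - the first key whose running total has already reached the 3/5 mark starts the second node:

#include <stddef.h>

/* returns the index of the first key that belongs in the second node */
static size_t pick_pivot(const unsigned *key_u64s, size_t nr, unsigned live_u64s)
{
        unsigned target = live_u64s * 3 / 5, sum = 0;
        size_t i;

        for (i = 0; i < nr; i++) {
                if (sum >= target)
                        break;          /* this key and the rest go right */
                sum += key_u64s[i];
        }
        return i;
}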
- * - * Worse, if the insert is from btree node coalescing, if we do the insert after - * we do the split (and pick the pivot) - the pivot we pick might be between - * nodes that were coalesced, and thus in the middle of a child node post - * coalescing: - */ -static int btree_split_insert_keys(struct btree_update *as, - struct btree_trans *trans, - btree_path_idx_t path_idx, - struct btree *b, - struct keylist *keys) -{ - struct btree_path *path = trans->paths + path_idx; - - if (!bch2_keylist_empty(keys) && - bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) { - struct btree_node_iter node_iter; - - bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); - - int ret = bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); - if (ret) - return ret; - } - - return 0; -} - -static int btree_split(struct btree_update *as, struct btree_trans *trans, - btree_path_idx_t path, struct btree *b, - struct keylist *keys) -{ - struct bch_fs *c = as->c; - struct btree *parent = btree_node_parent(trans->paths + path, b); - struct btree *n1, *n2 = NULL, *n3 = NULL; - btree_path_idx_t path1 = 0, path2 = 0; - u64 start_time = local_clock(); - int ret = 0; - - bch2_verify_btree_nr_keys(b); - BUG_ON(!parent && (b != btree_node_root(c, b))); - BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); - - ret = bch2_btree_node_check_topology(trans, b); - if (ret) - return ret; - - if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { - struct btree *n[2]; - - trace_and_count(c, btree_node_split, trans, b); - - n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); - n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); - - __btree_split_node(as, trans, b, n, keys); - - if (keys) { - ret = btree_split_insert_keys(as, trans, path, n1, keys) ?: - btree_split_insert_keys(as, trans, path, n2, keys); - if (ret) - goto err; - BUG_ON(!bch2_keylist_empty(keys)); - } - - bch2_btree_build_aux_trees(n2); - bch2_btree_build_aux_trees(n1); - - bch2_btree_update_add_new_node(as, n1); - bch2_btree_update_add_new_node(as, n2); - six_unlock_write(&n2->c.lock); - six_unlock_write(&n1->c.lock); - - path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); - six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, trans->paths + path1, n1); - - path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p); - six_lock_increment(&n2->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, trans->paths + path2, n2); - - /* - * Note that on recursive parent_keys == keys, so we - * can't start adding new keys to parent_keys before emptying it - * out (which we did with btree_split_insert_keys() above) - */ - bch2_keylist_add(&as->parent_keys, &n1->key); - bch2_keylist_add(&as->parent_keys, &n2->key); - - if (!parent) { - /* Depth increases, make a new root */ - n3 = __btree_root_alloc(as, trans, b->c.level + 1); - - bch2_btree_update_add_new_node(as, n3); - six_unlock_write(&n3->c.lock); - - trans->paths[path2].locks_want++; - BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level)); - six_lock_increment(&n3->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, trans->paths + path2, n3); - - 
n3->sib_u64s[0] = U16_MAX; - n3->sib_u64s[1] = U16_MAX; - - ret = btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); - if (ret) - goto err; - } - } else { - trace_and_count(c, btree_node_compact, trans, b); - - n1 = bch2_btree_node_alloc_replacement(as, trans, b); - - if (keys) { - ret = btree_split_insert_keys(as, trans, path, n1, keys); - if (ret) - goto err; - BUG_ON(!bch2_keylist_empty(keys)); - } - - bch2_btree_build_aux_trees(n1); - bch2_btree_update_add_new_node(as, n1); - six_unlock_write(&n1->c.lock); - - path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); - six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, trans->paths + path1, n1); - - if (parent) - bch2_keylist_add(&as->parent_keys, &n1->key); - } - - /* New nodes all written, now make them visible: */ - - if (parent) { - /* Split a non root node */ - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); - } else if (n3) { - ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false); - } else { - /* Root filled up but didn't need to be split */ - ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false); - } - - if (ret) - goto err; - - bch2_btree_interior_update_will_free_node(as, b); - - if (n3) { - bch2_btree_update_get_open_buckets(as, n3); - bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0); - } - if (n2) { - bch2_btree_update_get_open_buckets(as, n2); - bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0); - } - bch2_btree_update_get_open_buckets(as, n1); - bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0); - - /* - * The old node must be freed (in memory) _before_ unlocking the new - * nodes - else another thread could re-acquire a read lock on the old - * node after another thread has locked and updated the new node, thus - * seeing stale data: - */ - bch2_btree_node_free_inmem(trans, trans->paths + path, b); - - if (n3) - bch2_trans_node_add(trans, trans->paths + path, n3); - if (n2) - bch2_trans_node_add(trans, trans->paths + path2, n2); - bch2_trans_node_add(trans, trans->paths + path1, n1); - - if (n3) - six_unlock_intent(&n3->c.lock); - if (n2) - six_unlock_intent(&n2->c.lock); - six_unlock_intent(&n1->c.lock); -out: - if (path2) { - __bch2_btree_path_unlock(trans, trans->paths + path2); - bch2_path_put(trans, path2, true); - } - if (path1) { - __bch2_btree_path_unlock(trans, trans->paths + path1); - bch2_path_put(trans, path1, true); - } - - bch2_trans_verify_locks(trans); - - bch2_time_stats_update(&c->times[n2 - ? BCH_TIME_btree_node_split - : BCH_TIME_btree_node_compact], - start_time); - return ret; -err: - if (n3) - bch2_btree_node_free_never_used(as, trans, n3); - if (n2) - bch2_btree_node_free_never_used(as, trans, n2); - bch2_btree_node_free_never_used(as, trans, n1); - goto out; -} - -/** - * bch2_btree_insert_node - insert bkeys into a given btree node - * - * @as: btree_update object - * @trans: btree_trans object - * @path_idx: path that points to current node - * @b: node to insert keys into - * @keys: list of keys to insert - * - * Returns: 0 on success, typically transaction restart error on failure - * - * Inserts as many keys as it can into a given btree node, splitting it if full. - * If a split occurred, this function will return early. This can only happen - * for leaf nodes -- inserts into interior nodes have to be atomic. 
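Stripped of locking and journaling, the control flow described here comes down to three cases: the keys fit, the keys don't fit but the update reserved deep enough to split, or the split races above the reservation and the transaction must restart. A compilable sketch with stub types standing in for the bcachefs ones:

struct node   { unsigned level, free_u64s; };
struct update { unsigned update_level_end; };

#define RESTART_SPLIT_RACE 1    /* stand-in for the private restart errno */

static int do_insert(struct node *b) { (void)b; return 0; }
static int do_split(struct node *b)  { (void)b; return 0; }

static int insert_or_split(struct update *as, struct node *b, unsigned u64s)
{
        if (u64s <= b->free_u64s)
                return do_insert(b);            /* common case */

        if (b->level >= as->update_level_end)
                return -RESTART_SPLIT_RACE;     /* reservation too shallow */

        return do_split(b);                     /* may recurse into the parent */
}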
- */ -static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, - btree_path_idx_t path_idx, struct btree *b, - struct keylist *keys) -{ - struct bch_fs *c = as->c; - struct btree_path *path = trans->paths + path_idx, *linked; - unsigned i; - int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); - int old_live_u64s = b->nr.live_u64s; - int live_u64s_added, u64s_added; - int ret; - - lockdep_assert_held(&c->gc_lock); - BUG_ON(!b->c.level); - BUG_ON(!as || as->b); - bch2_verify_keylist_sorted(keys); - - if (!btree_node_intent_locked(path, b->c.level)) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "%s(): node not locked at level %u\n", - __func__, b->c.level); - bch2_btree_update_to_text(&buf, as); - bch2_btree_path_to_text(&buf, trans, path_idx); - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return -EIO; - } - - ret = bch2_btree_node_lock_write(trans, path, &b->c); - if (ret) - return ret; - - bch2_btree_node_prep_for_write(trans, path, b); - - if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) { - bch2_btree_node_unlock_write(trans, path, b); - goto split; - } - - - ret = bch2_btree_node_check_topology(trans, b) ?: - bch2_btree_insert_keys_interior(as, trans, path, b, - path->l[b->c.level].iter, keys); - if (ret) { - bch2_btree_node_unlock_write(trans, path, b); - return ret; - } - - trans_for_each_path_with_node(trans, b, linked, i) - bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - - bch2_trans_verify_paths(trans); - - live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; - u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; - - if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); - if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); - - if (u64s_added > live_u64s_added && - bch2_maybe_compact_whiteouts(c, b)) - bch2_trans_node_reinit_iter(trans, b); - - btree_update_updated_node(as, b); - bch2_btree_node_unlock_write(trans, path, b); - return 0; -split: - /* - * We could attempt to avoid the transaction restart, by calling - * bch2_btree_path_upgrade() and allocating more nodes: - */ - if (b->c.level >= as->update_level_end) { - trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); - } - - return btree_split(as, trans, path_idx, b, keys); -} - -int bch2_btree_split_leaf(struct btree_trans *trans, - btree_path_idx_t path, - unsigned flags) -{ - /* btree_split & merge may both cause paths array to be reallocated */ - struct btree *b = path_l(trans->paths + path)->b; - struct btree_update *as; - unsigned l; - int ret = 0; - - as = bch2_btree_update_start(trans, trans->paths + path, - trans->paths[path].level, - true, 0, flags); - if (IS_ERR(as)) - return PTR_ERR(as); - - ret = btree_split(as, trans, path, b, NULL); - if (ret) { - bch2_btree_update_free(as, trans); - return ret; - } - - bch2_btree_update_done(as, trans); - - for (l = trans->paths[path].level + 1; - btree_node_intent_locked(&trans->paths[path], l) && !ret; - l++) - ret = bch2_foreground_maybe_merge(trans, path, l, flags); - - return ret; -} - -static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans, - btree_path_idx_t path_idx) -{ - struct bch_fs *c = as->c; - struct btree_path *path = trans->paths + 
path_idx; - struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b; - - BUG_ON(!btree_node_locked(path, b->c.level)); - - n = __btree_root_alloc(as, trans, b->c.level + 1); - - bch2_btree_update_add_new_node(as, n); - six_unlock_write(&n->c.lock); - - path->locks_want++; - BUG_ON(btree_node_locked(path, n->c.level)); - six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path, n); - - n->sib_u64s[0] = U16_MAX; - n->sib_u64s[1] = U16_MAX; - - bch2_keylist_add(&as->parent_keys, &b->key); - btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys); - - int ret = bch2_btree_set_root(as, trans, path, n, true); - BUG_ON(ret); - - bch2_btree_update_get_open_buckets(as, n); - bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); - bch2_trans_node_add(trans, path, n); - six_unlock_intent(&n->c.lock); - - mutex_lock(&c->btree_cache.lock); - list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list); - mutex_unlock(&c->btree_cache.lock); - - bch2_trans_verify_locks(trans); -} - -int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; - - if (btree_node_fake(b)) - return bch2_btree_split_leaf(trans, path, flags); - - struct btree_update *as = - bch2_btree_update_start(trans, trans->paths + path, b->c.level, - true, 0, flags); - if (IS_ERR(as)) - return PTR_ERR(as); - - __btree_increase_depth(as, trans, path); - bch2_btree_update_done(as, trans); - return 0; -} - -int __bch2_foreground_maybe_merge(struct btree_trans *trans, - btree_path_idx_t path, - unsigned level, - unsigned flags, - enum btree_node_sibling sib) -{ - struct bch_fs *c = trans->c; - struct btree_update *as; - struct bkey_format_state new_s; - struct bkey_format new_f; - struct bkey_i delete; - struct btree *b, *m, *n, *prev, *next, *parent; - struct bpos sib_pos; - size_t sib_u64s; - enum btree_id btree = trans->paths[path].btree_id; - btree_path_idx_t sib_path = 0, new_path = 0; - u64 start_time = local_clock(); - int ret = 0; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - BUG_ON(!trans->paths[path].should_be_locked); - BUG_ON(!btree_node_locked(&trans->paths[path], level)); - - /* - * Work around a deadlock caused by the btree write buffer not doing - * merges and leaving tons of merges for us to do - we really don't need - * to be doing merges at all from the interior update path, and if the - * interior update path is generating too many new interior updates we - * deadlock: - */ - if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates) - return 0; - - if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) { - flags &= ~BCH_WATERMARK_MASK; - flags |= BCH_WATERMARK_btree; - flags |= BCH_TRANS_COMMIT_journal_reclaim; - } - - b = trans->paths[path].l[level].b; - - if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || - (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) { - b->sib_u64s[sib] = U16_MAX; - return 0; - } - - sib_pos = sib == btree_prev_sib - ? 
bpos_predecessor(b->data->min_key) - : bpos_successor(b->data->max_key); - - sib_path = bch2_path_get(trans, btree, sib_pos, - U8_MAX, level, BTREE_ITER_intent, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, sib_path, false); - if (ret) - goto err; - - btree_path_set_should_be_locked(trans, trans->paths + sib_path); - - m = trans->paths[sib_path].l[level].b; - - if (btree_node_parent(trans->paths + path, b) != - btree_node_parent(trans->paths + sib_path, m)) { - b->sib_u64s[sib] = U16_MAX; - goto out; - } - - if (sib == btree_prev_sib) { - prev = m; - next = b; - } else { - prev = b; - next = m; - } - - if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) { - struct printbuf buf = PRINTBUF; - - printbuf_indent_add_nextline(&buf, 2); - prt_printf(&buf, "%s(): ", __func__); - ret = __bch2_topology_error(c, &buf); - prt_newline(&buf); - - prt_printf(&buf, "prev ends at "); - bch2_bpos_to_text(&buf, prev->data->max_key); - prt_newline(&buf); - - prt_printf(&buf, "next starts at "); - bch2_bpos_to_text(&buf, next->data->min_key); - - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - goto err; - } - - bch2_bkey_format_init(&new_s); - bch2_bkey_format_add_pos(&new_s, prev->data->min_key); - __bch2_btree_calc_format(&new_s, prev); - __bch2_btree_calc_format(&new_s, next); - bch2_bkey_format_add_pos(&new_s, next->data->max_key); - new_f = bch2_bkey_format_done(&new_s); - - sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) + - btree_node_u64s_with_format(m->nr, &m->format, &new_f); - - if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { - sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); - sib_u64s /= 2; - sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); - } - - sib_u64s = min(sib_u64s, btree_max_u64s(c)); - sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); - b->sib_u64s[sib] = sib_u64s; - - if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) - goto out; - - parent = btree_node_parent(trans->paths + path, b); - as = bch2_btree_update_start(trans, trans->paths + path, level, false, - 0, BCH_TRANS_COMMIT_no_enospc|flags); - ret = PTR_ERR_OR_ZERO(as); - if (ret) - goto err; - - as->node_start = prev->data->min_key; - as->node_end = next->data->max_key; - - trace_and_count(c, btree_node_merge, trans, b); - - n = bch2_btree_node_alloc(as, trans, b->c.level); - - SET_BTREE_NODE_SEQ(n->data, - max(BTREE_NODE_SEQ(b->data), - BTREE_NODE_SEQ(m->data)) + 1); - - btree_set_min(n, prev->data->min_key); - btree_set_max(n, next->data->max_key); - - n->data->format = new_f; - btree_node_set_format(n, new_f); - - bch2_btree_sort_into(c, n, prev); - bch2_btree_sort_into(c, n, next); - - bch2_btree_build_aux_trees(n); - bch2_btree_update_add_new_node(as, n); - six_unlock_write(&n->c.lock); - - new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p); - six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, trans->paths + new_path, n); - - bkey_init(&delete.k); - delete.k.p = prev->key.k.p; - bch2_keylist_add(&as->parent_keys, &delete); - bch2_keylist_add(&as->parent_keys, &n->key); - - bch2_trans_verify_paths(trans); - - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); - if (ret) - goto err_free_update; - - bch2_btree_interior_update_will_free_node(as, b); - bch2_btree_interior_update_will_free_node(as, m); - - bch2_trans_verify_paths(trans); - - bch2_btree_update_get_open_buckets(as, n); - 
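The sib_u64s hysteresis computed above halves only the part of the merged-size estimate that exceeds BTREE_FOREGROUND_MERGE_HYSTERESIS, pulling the cached value halfway back toward that knee so merge decisions don't flap: with a hysteresis of 1000 u64s, an estimate of 1400 is stored as 1000 + (1400 - 1000) / 2 = 1200. As a tiny function:

/* damp_toward(1400, 1000) == 1200; values at or below the knee pass through */
static unsigned damp_toward(unsigned estimate, unsigned hysteresis)
{
        if (estimate > hysteresis)
                estimate = hysteresis + (estimate - hysteresis) / 2;
        return estimate;
}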
bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); - - bch2_btree_node_free_inmem(trans, trans->paths + path, b); - bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m); - - bch2_trans_node_add(trans, trans->paths + path, n); - - bch2_trans_verify_paths(trans); - - six_unlock_intent(&n->c.lock); - - bch2_btree_update_done(as, trans); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); -out: -err: - if (new_path) - bch2_path_put(trans, new_path, true); - bch2_path_put(trans, sib_path, true); - bch2_trans_verify_locks(trans); - if (ret == -BCH_ERR_journal_reclaim_would_deadlock) - ret = 0; - if (!ret) - ret = bch2_trans_relock(trans); - return ret; -err_free_update: - bch2_btree_node_free_never_used(as, trans, n); - bch2_btree_update_free(as, trans); - goto out; -} - -static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, - struct btree *b) -{ - bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p, - BTREE_MAX_DEPTH, b->c.level, - BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(trans, iter); - if (ret) - goto err; - - /* has node been freed? */ - if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { - /* node has been freed: */ - BUG_ON(!btree_node_dying(b)); - ret = bch_err_throw(trans->c, btree_node_dying); - goto err; - } - - BUG_ON(!btree_node_hashed(b)); - return 0; -err: - bch2_trans_iter_exit(trans, iter); - return ret; -} - -int bch2_btree_node_rewrite(struct btree_trans *trans, - struct btree_iter *iter, - struct btree *b, - unsigned target, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree *n, *parent; - struct btree_update *as; - btree_path_idx_t new_path = 0; - int ret; - - flags |= BCH_TRANS_COMMIT_no_enospc; - - struct btree_path *path = btree_iter_path(trans, iter); - parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, b->c.level, - false, target, flags); - ret = PTR_ERR_OR_ZERO(as); - if (ret) - goto out; - - n = bch2_btree_node_alloc_replacement(as, trans, b); - - bch2_btree_build_aux_trees(n); - bch2_btree_update_add_new_node(as, n); - six_unlock_write(&n->c.lock); - - new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p); - six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, trans->paths + new_path, n); - - trace_and_count(c, btree_node_rewrite, trans, b); - - if (parent) { - bch2_keylist_add(&as->parent_keys, &n->key); - ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); - } else { - ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false); - } - - if (ret) - goto err; - - bch2_btree_interior_update_will_free_node(as, b); - - bch2_btree_update_get_open_buckets(as, n); - bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); - - bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b); - - bch2_trans_node_add(trans, trans->paths + iter->path, n); - six_unlock_intent(&n->c.lock); - - bch2_btree_update_done(as, trans); -out: - if (new_path) - bch2_path_put(trans, new_path, true); - bch2_trans_downgrade(trans); - return ret; -err: - bch2_btree_node_free_never_used(as, trans, n); - bch2_btree_update_free(as, trans); - goto out; -} - -int bch2_btree_node_rewrite_key(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_i *k, unsigned flags) -{ - struct btree_iter iter; - 
bch2_trans_node_iter_init(trans, &iter, - btree, k->k.p, - BTREE_MAX_DEPTH, level, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto out; - - bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); - ret = found - ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags) - : -ENOENT; -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_btree_node_rewrite_pos(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bpos pos, - unsigned target, - unsigned flags) -{ - BUG_ON(!level); - - /* Traverse one depth lower to get a pointer to the node itself: */ - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err; - - ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, - struct btree *b, unsigned flags) -{ - struct btree_iter iter; - int ret = get_iter_to_node(trans, &iter, b); - if (ret) - return ret == -BCH_ERR_btree_node_dying ? 0 : ret; - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -struct async_btree_rewrite { - struct bch_fs *c; - struct work_struct work; - struct list_head list; - enum btree_id btree_id; - unsigned level; - struct bkey_buf key; -}; - -static void async_btree_node_rewrite_work(struct work_struct *work) -{ - struct async_btree_rewrite *a = - container_of(work, struct async_btree_rewrite, work); - struct bch_fs *c = a->c; - - int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans, - a->btree_id, a->level, a->key.k, 0)); - if (!bch2_err_matches(ret, ENOENT) && - !bch2_err_matches(ret, EROFS)) - bch_err_fn_ratelimited(c, ret); - - spin_lock(&c->btree_node_rewrites_lock); - list_del(&a->list); - spin_unlock(&c->btree_node_rewrites_lock); - - closure_wake_up(&c->btree_node_rewrites_wait); - - bch2_bkey_buf_exit(&a->key, c); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite); - kfree(a); -} - -void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) -{ - struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS); - if (!a) - return; - - a->c = c; - a->btree_id = b->c.btree_id; - a->level = b->c.level; - INIT_WORK(&a->work, async_btree_node_rewrite_work); - - bch2_bkey_buf_init(&a->key); - bch2_bkey_buf_copy(&a->key, c, &b->key); - - bool now = false, pending = false; - - spin_lock(&c->btree_node_rewrites_lock); - if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) && - enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) { - list_add(&a->list, &c->btree_node_rewrites); - now = true; - } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { - list_add(&a->list, &c->btree_node_rewrites_pending); - pending = true; - } - spin_unlock(&c->btree_node_rewrites_lock); - - if (now) { - queue_work(c->btree_node_rewrite_worker, &a->work); - } else if (pending) { - /* bch2_do_pending_node_rewrites will execute */ - } else { - bch2_bkey_buf_exit(&a->key, c); - kfree(a); - } -} - -void bch2_async_btree_node_rewrites_flush(struct bch_fs *c) -{ - closure_wait_event(&c->btree_node_rewrites_wait, - list_empty(&c->btree_node_rewrites)); -} - -void bch2_do_pending_node_rewrites(struct bch_fs *c) -{ - while (1) { - spin_lock(&c->btree_node_rewrites_lock); - 
struct async_btree_rewrite *a = - list_pop_entry(&c->btree_node_rewrites_pending, - struct async_btree_rewrite, list); - if (a) - list_add(&a->list, &c->btree_node_rewrites); - spin_unlock(&c->btree_node_rewrites_lock); - - if (!a) - break; - - enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite); - queue_work(c->btree_node_rewrite_worker, &a->work); - } -} - -void bch2_free_pending_node_rewrites(struct bch_fs *c) -{ - while (1) { - spin_lock(&c->btree_node_rewrites_lock); - struct async_btree_rewrite *a = - list_pop_entry(&c->btree_node_rewrites_pending, - struct async_btree_rewrite, list); - spin_unlock(&c->btree_node_rewrites_lock); - - if (!a) - break; - - bch2_bkey_buf_exit(&a->key, c); - kfree(a); - } -} - -static int __bch2_btree_node_update_key(struct btree_trans *trans, - struct btree_iter *iter, - struct btree *b, struct btree *new_hash, - struct bkey_i *new_key, - unsigned commit_flags, - bool skip_triggers) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter2 = {}; - struct btree *parent; - int ret; - - if (!skip_triggers) { - ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1, - bkey_i_to_s_c(&b->key), - BTREE_TRIGGER_transactional) ?: - bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1, - bkey_i_to_s(new_key), - BTREE_TRIGGER_transactional); - if (ret) - return ret; - } - - if (new_hash) { - bkey_copy(&new_hash->key, new_key); - ret = bch2_btree_node_hash_insert(&c->btree_cache, - new_hash, b->c.level, b->c.btree_id); - BUG_ON(ret); - } - - parent = btree_node_parent(btree_iter_path(trans, iter), b); - if (parent) { - bch2_trans_copy_iter(trans, &iter2, iter); - - iter2.path = bch2_btree_path_make_mut(trans, iter2.path, - iter2.flags & BTREE_ITER_intent, - _THIS_IP_); - - struct btree_path *path2 = btree_iter_path(trans, &iter2); - BUG_ON(path2->level != b->c.level); - BUG_ON(!bpos_eq(path2->pos, new_key->k.p)); - - btree_path_set_level_up(trans, path2); - - trans->paths_sorted = false; - - ret = bch2_btree_iter_traverse(trans, &iter2) ?: - bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun); - if (ret) - goto err; - } else { - BUG_ON(btree_node_root(c, b) != b); - - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, - jset_u64s(new_key->k.u64s)); - ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - journal_entry_set(e, - BCH_JSET_ENTRY_btree_root, - b->c.btree_id, b->c.level, - new_key, new_key->k.u64s); - } - - ret = bch2_trans_commit(trans, NULL, NULL, commit_flags); - if (ret) - goto err; - - bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c); - - if (new_hash) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, new_hash); - - __bch2_btree_node_hash_remove(&c->btree_cache, b); - - bkey_copy(&b->key, new_key); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - mutex_unlock(&c->btree_cache.lock); - } else { - bkey_copy(&b->key, new_key); - } - - bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b); -out: - bch2_trans_iter_exit(trans, &iter2); - return ret; -err: - if (new_hash) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - } - goto out; -} - -int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, - struct btree *b, struct bkey_i *new_key, - unsigned commit_flags, bool skip_triggers) -{ - struct bch_fs *c = trans->c; - struct btree *new_hash = NULL; - struct btree_path *path = btree_iter_path(trans, iter); - 
struct closure cl; - int ret = 0; - - ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1); - if (ret) - return ret; - - closure_init_stack(&cl); - - /* - * check btree_ptr_hash_val() after @b is locked by - * btree_iter_traverse(): - */ - if (btree_ptr_hash_val(new_key) != b->hash_val) { - ret = bch2_btree_cache_cannibalize_lock(trans, &cl); - if (ret) { - ret = drop_locks_do(trans, (closure_sync(&cl), 0)); - if (ret) - return ret; - } - - new_hash = bch2_btree_node_mem_alloc(trans, false); - ret = PTR_ERR_OR_ZERO(new_hash); - if (ret) - goto err; - } - - path->intent_ref++; - ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key, - commit_flags, skip_triggers); - --path->intent_ref; - - if (new_hash) - bch2_btree_node_to_freelist(c, new_hash); -err: - closure_sync(&cl); - bch2_btree_cache_cannibalize_unlock(trans); - return ret; -} - -int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, - struct btree *b, struct bkey_i *new_key, - unsigned commit_flags, bool skip_triggers) -{ - struct btree_iter iter; - int ret = get_iter_to_node(trans, &iter, b); - if (ret) - return ret == -BCH_ERR_btree_node_dying ? 0 : ret; - - bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, - !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); - - ret = bch2_btree_node_update_key(trans, &iter, b, new_key, - commit_flags, skip_triggers); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* Init code: */ - -/* - * Only for filesystem bringup, when first reading the btree roots or allocating - * btree roots when initializing a new filesystem: - */ -void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) -{ - BUG_ON(btree_node_root(c, b)); - - bch2_btree_set_root_inmem(c, b); -} - -int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level) -{ - struct bch_fs *c = trans->c; - struct closure cl; - struct btree *b; - int ret; - - closure_init_stack(&cl); - - do { - ret = bch2_btree_cache_cannibalize_lock(trans, &cl); - closure_sync(&cl); - } while (ret); - - b = bch2_btree_node_mem_alloc(trans, false); - bch2_btree_cache_cannibalize_unlock(trans); - - ret = PTR_ERR_OR_ZERO(b); - if (ret) - return ret; - - set_btree_node_fake(b); - set_btree_node_need_rewrite(b); - b->c.level = level; - b->c.btree_id = id; - - bkey_btree_ptr_init(&b->key); - b->key.k.p = SPOS_MAX; - *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; - - bch2_bset_init_first(b, &b->data->keys); - bch2_btree_build_aux_trees(b); - - b->data->flags = 0; - btree_set_min(b, POS_MIN); - btree_set_max(b, SPOS_MAX); - b->data->format = bch2_btree_calc_format(b); - btree_node_set_format(b, b->data->format); - - ret = bch2_btree_node_hash_insert(&c->btree_cache, b, - b->c.level, b->c.btree_id); - BUG_ON(ret); - - bch2_btree_set_root_inmem(c, b); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - return 0; -} - -void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) -{ - bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level))); -} - -static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) -{ - prt_printf(out, "%ps: ", (void *) as->ip_started); - bch2_trans_commit_flags_to_text(out, as->flags); - - prt_str(out, " "); - bch2_btree_id_to_text(out, as->btree_id); - prt_printf(out, " l=%u-%u ", - as->update_level_start, - as->update_level_end); - bch2_bpos_to_text(out, as->node_start); - prt_char(out, ' '); - bch2_bpos_to_text(out, as->node_end); - 
prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s", - as->node_written, - as->node_sectors, - as->node_remaining, - btree_node_reawrite_reason_strs[as->node_needed_rewrite]); - - prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", - bch2_btree_update_modes[as->mode], - as->nodes_written, - closure_nr_remaining(&as->cl), - as->journal.seq); -} - -void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct btree_update *as; - - mutex_lock(&c->btree_interior_update_lock); - list_for_each_entry(as, &c->btree_interior_update_list, list) - bch2_btree_update_to_text(out, as); - mutex_unlock(&c->btree_interior_update_lock); -} - -static bool bch2_btree_interior_updates_pending(struct bch_fs *c) -{ - bool ret; - - mutex_lock(&c->btree_interior_update_lock); - ret = !list_empty(&c->btree_interior_update_list); - mutex_unlock(&c->btree_interior_update_lock); - - return ret; -} - -bool bch2_btree_interior_updates_flush(struct bch_fs *c) -{ - bool ret = bch2_btree_interior_updates_pending(c); - - if (ret) - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_pending(c)); - return ret; -} - -void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry) -{ - struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); - - mutex_lock(&c->btree_root_lock); - - r->level = entry->level; - r->alive = true; - bkey_copy(&r->key, (struct bkey_i *) entry->start); - - mutex_unlock(&c->btree_root_lock); -} - -struct jset_entry * -bch2_btree_roots_to_journal_entries(struct bch_fs *c, - struct jset_entry *end, - unsigned long skip) -{ - unsigned i; - - mutex_lock(&c->btree_root_lock); - - for (i = 0; i < btree_id_nr_alive(c); i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (r->alive && !test_bit(i, &skip)) { - journal_entry_set(end, BCH_JSET_ENTRY_btree_root, - i, r->level, &r->key, r->key.k.u64s); - end = vstruct_next(end); - } - } - - mutex_unlock(&c->btree_root_lock); - - return end; -} - -static void bch2_btree_alloc_to_text(struct printbuf *out, - struct bch_fs *c, - struct btree_alloc *a) -{ - printbuf_indent_add(out, 2); - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k)); - prt_newline(out); - - struct open_bucket *ob; - unsigned i; - open_bucket_for_each(c, &a->ob, ob, i) - bch2_open_bucket_to_text(out, c, ob); - - printbuf_indent_sub(out, 2); -} - -void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c) -{ - for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++) - bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]); -} - -void bch2_fs_btree_interior_update_exit(struct bch_fs *c) -{ - WARN_ON(!list_empty(&c->btree_node_rewrites)); - WARN_ON(!list_empty(&c->btree_node_rewrites_pending)); - - if (c->btree_node_rewrite_worker) - destroy_workqueue(c->btree_node_rewrite_worker); - if (c->btree_interior_update_worker) - destroy_workqueue(c->btree_interior_update_worker); - mempool_exit(&c->btree_interior_update_pool); -} - -void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) -{ - mutex_init(&c->btree_reserve_cache_lock); - INIT_LIST_HEAD(&c->btree_interior_update_list); - INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); - mutex_init(&c->btree_interior_update_lock); - INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); - - INIT_LIST_HEAD(&c->btree_node_rewrites); - INIT_LIST_HEAD(&c->btree_node_rewrites_pending); - spin_lock_init(&c->btree_node_rewrites_lock); -} - -int 
bch2_fs_btree_interior_update_init(struct bch_fs *c) -{ - c->btree_interior_update_worker = - alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); - if (!c->btree_interior_update_worker) - return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); - - c->btree_node_rewrite_worker = - alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND); - if (!c->btree_node_rewrite_worker) - return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); - - if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, - sizeof(struct btree_update))) - return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init); - - return 0; -} diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h deleted file mode 100644 index ac04e45a8515..000000000000 --- a/fs/bcachefs/btree_update_interior.h +++ /dev/null @@ -1,364 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H -#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H - -#include "btree_cache.h" -#include "btree_locking.h" -#include "btree_update.h" - -#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) - -#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) - -int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); - -#define BTREE_UPDATE_MODES() \ - x(none) \ - x(node) \ - x(root) \ - x(update) - -enum btree_update_mode { -#define x(n) BTREE_UPDATE_##n, - BTREE_UPDATE_MODES() -#undef x -}; - -/* - * Tracks an in progress split/rewrite of a btree node and the update to the - * parent node: - * - * When we split/rewrite a node, we do all the updates in memory without - * waiting for any writes to complete - we allocate the new node(s) and update - * the parent node, possibly recursively up to the root. - * - * The end result is that we have one or more new nodes being written - - * possibly several, if there were multiple splits - and then a write (updating - * an interior node) which will make all these new nodes visible. - * - * Additionally, as we split/rewrite nodes we free the old nodes - but the old - * nodes can't be freed (their space on disk can't be reclaimed) until the - * update to the interior node that makes the new node visible completes - - * until then, the old nodes are still reachable on disk. - * - */ -struct btree_update { - struct closure cl; - struct bch_fs *c; - u64 start_time; - unsigned long ip_started; - - struct list_head list; - struct list_head unwritten_list; - - enum btree_update_mode mode; - enum bch_trans_commit_flags flags; - unsigned nodes_written:1; - unsigned took_gc_lock:1; - - enum btree_id btree_id; - struct bpos node_start; - struct bpos node_end; - enum btree_node_rewrite_reason node_needed_rewrite; - u16 node_written; - u16 node_sectors; - u16 node_remaining; - - unsigned update_level_start; - unsigned update_level_end; - - struct disk_reservation disk_res; - - /* - * BTREE_UPDATE_node: - * The update that made the new nodes visible was a regular update to an - * existing interior node - @b. 
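BTREE_UPDATE_MODES() above is an x-macro: expanding the same list once with x() as an enumerator and once with x() as a string literal keeps an enum and its name table in lockstep, which is presumably how the bch2_btree_update_modes[] table printed by bch2_btree_update_to_text() earlier in this diff stays in sync with the enum. The pattern in isolation:

#define BTREE_UPDATE_MODES()    \
        x(none)                 \
        x(node)                 \
        x(root)                 \
        x(update)

enum btree_update_mode {
#define x(n)    BTREE_UPDATE_##n,
        BTREE_UPDATE_MODES()
#undef x
};

static const char * const btree_update_mode_strs[] = {
#define x(n)    #n,
        BTREE_UPDATE_MODES()
#undef x
        NULL,
};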
We can't write out the update to @b - * until the new nodes we created are finished writing, so we block @b - * from writing by putting this btree_interior update on the - * @b->write_blocked list with @write_blocked_list: - */ - struct btree *b; - struct list_head write_blocked_list; - - /* - * We may be freeing nodes that were dirty, and thus had journal entries - * pinned: we need to transfer the oldest of those pins to the - * btree_update operation, and release it when the new node(s) - * are all persistent and reachable: - */ - struct journal_entry_pin journal; - - /* Preallocated nodes we reserve when we start the update: */ - struct prealloc_nodes { - struct btree *b[BTREE_UPDATE_NODES_MAX]; - unsigned nr; - } prealloc_nodes[2]; - - /* Nodes being freed: */ - struct keylist old_keys; - u64 _old_keys[BTREE_UPDATE_NODES_MAX * - BKEY_BTREE_PTR_U64s_MAX]; - - /* Nodes being added: */ - struct keylist new_keys; - u64 _new_keys[BTREE_UPDATE_NODES_MAX * - BKEY_BTREE_PTR_U64s_MAX]; - - /* New nodes, that will be made reachable by this update: */ - struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; - unsigned nr_new_nodes; - - struct btree *old_nodes[BTREE_UPDATE_NODES_MAX]; - __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX]; - unsigned nr_old_nodes; - - open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * - BCH_REPLICAS_MAX]; - open_bucket_idx_t nr_open_buckets; - - unsigned journal_u64s; - u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; - - /* Only here to reduce stack usage on recursive splits: */ - struct keylist parent_keys; - /* - * Enough room for btree_split's keys without realloc - btree node - * pointers never have crc/compression info, so we only need to acount - * for the pointers for three keys - */ - u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; -}; - -struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, - struct btree_trans *, - struct btree *, - struct bkey_format); - -int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned); - -int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned); - -int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t, - unsigned, unsigned, enum btree_node_sibling); - -static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, - btree_path_idx_t path_idx, - unsigned level, unsigned flags, - enum btree_node_sibling sib) -{ - struct btree_path *path = trans->paths + path_idx; - struct btree *b; - - EBUG_ON(!btree_node_locked(path, level)); - - if (static_branch_unlikely(&bch2_btree_node_merging_disabled)) - return 0; - - b = path->l[level].b; - if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) - return 0; - - return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib); -} - -static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, - btree_path_idx_t path, - unsigned level, - unsigned flags) -{ - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, - btree_prev_sib) ?: - bch2_foreground_maybe_merge_sibling(trans, path, level, flags, - btree_next_sib); -} - -int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, - struct btree *, unsigned, unsigned); -int bch2_btree_node_rewrite_key(struct btree_trans *, - enum btree_id, unsigned, - struct bkey_i *, unsigned); -int bch2_btree_node_rewrite_pos(struct btree_trans *, - enum btree_id, unsigned, - struct bpos, unsigned, unsigned); -int bch2_btree_node_rewrite_key_get_iter(struct 
btree_trans *, - struct btree *, unsigned); - -void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); - -int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, - struct btree *, struct bkey_i *, - unsigned, bool); -int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, - struct bkey_i *, unsigned, bool); - -void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); - -int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned); -void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned); - -static inline unsigned btree_update_reserve_required(struct bch_fs *c, - struct btree *b) -{ - unsigned depth = btree_node_root(c, b)->c.level + 1; - - /* - * Number of nodes we might have to allocate in a worst case btree - * split operation - we split all the way up to the root, then allocate - * a new root, unless we're already at max depth: - */ - if (depth < BTREE_MAX_DEPTH) - return (depth - b->c.level) * 2 + 1; - else - return (depth - b->c.level) * 2 - 1; -} - -static inline void btree_node_reset_sib_u64s(struct btree *b) -{ - b->sib_u64s[0] = b->nr.live_u64s; - b->sib_u64s[1] = b->nr.live_u64s; -} - -static inline void *btree_data_end(struct btree *b) -{ - return (void *) b->data + btree_buf_bytes(b); -} - -static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b) -{ - return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s); -} - -static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b) -{ - return btree_data_end(b); -} - -static inline void *write_block(struct btree *b) -{ - return (void *) b->data + (b->written << 9); -} - -static inline bool __btree_addr_written(struct btree *b, void *p) -{ - return p < write_block(b); -} - -static inline bool bset_written(struct btree *b, struct bset *i) -{ - return __btree_addr_written(b, i); -} - -static inline bool bkey_written(struct btree *b, struct bkey_packed *k) -{ - return __btree_addr_written(b, k); -} - -static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end) -{ - ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + - b->whiteout_u64s; - ssize_t total = btree_buf_bytes(b) >> 3; - - /* Always leave one extra u64 for bch2_varint_decode: */ - used++; - - return total - used; -} - -static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b) -{ - ssize_t remaining = __bch2_btree_u64s_remaining(b, - btree_bkey_last(b, bset_tree_last(b))); - - BUG_ON(remaining < 0); - - if (bset_written(b, btree_bset_last(b))) - return 0; - - return remaining; -} - -#define BTREE_WRITE_SET_U64s_BITS 9 - -static inline unsigned btree_write_set_buffer(struct btree *b) -{ - /* - * Could buffer up larger amounts of keys for btrees with larger keys, - * pending benchmarking: - */ - return 8 << BTREE_WRITE_SET_U64s_BITS; -} - -static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b) -{ - struct bset_tree *t = bset_tree_last(b); - struct btree_node_entry *bne = max(write_block(b), - (void *) btree_bkey_last(b, t)); - ssize_t remaining_space = - __bch2_btree_u64s_remaining(b, bne->keys.start); - - if (unlikely(bset_written(b, bset(b, t)))) { - if (b->written + block_sectors(c) <= btree_sectors(c)) - return bne; - } else { - if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && - remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) - return bne; - } - - return NULL; -} - -static inline void push_whiteout(struct btree *b, struct bpos pos) -{ - 
struct bkey_packed k; - - BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s); - EBUG_ON(btree_node_just_written(b)); - - if (!bkey_pack_pos(&k, pos, b)) { - struct bkey *u = (void *) &k; - - bkey_init(u); - u->p = pos; - } - - k.needs_whiteout = true; - - b->whiteout_u64s += k.u64s; - bkey_p_copy(unwritten_whiteouts_start(b), &k); -} - -/* - * write lock must be held on @b (else the dirty bset that we were going to - * insert into could be written out from under us) - */ -static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s) -{ - if (unlikely(btree_node_need_rewrite(b))) - return false; - - return u64s <= bch2_btree_keys_u64s_remaining(b); -} - -void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); - -bool bch2_btree_interior_updates_flush(struct bch_fs *); - -void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); -struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, - struct jset_entry *, unsigned long); - -void bch2_async_btree_node_rewrites_flush(struct bch_fs *); -void bch2_do_pending_node_rewrites(struct bch_fs *); -void bch2_free_pending_node_rewrites(struct bch_fs *); - -void bch2_btree_reserve_cache_to_text(struct printbuf *, struct bch_fs *); - -void bch2_fs_btree_interior_update_exit(struct bch_fs *); -void bch2_fs_btree_interior_update_init_early(struct bch_fs *); -int bch2_fs_btree_interior_update_init(struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c deleted file mode 100644 index 4b095235a0d2..000000000000 --- a/fs/bcachefs/btree_write_buffer.c +++ /dev/null @@ -1,893 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_buf.h" -#include "btree_locking.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "disk_accounting.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" - -#include <linux/prefetch.h> -#include <linux/sort.h> - -static int bch2_btree_write_buffer_journal_flush(struct journal *, - struct journal_entry_pin *, u64); - -static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) -{ - return (cmp_int(l->hi, r->hi) ?: - cmp_int(l->mi, r->mi) ?: - cmp_int(l->lo, r->lo)) >= 0; -} - -static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) -{ -#ifdef CONFIG_X86_64 - int cmp; - - asm("mov (%[l]), %%rax;" - "sub (%[r]), %%rax;" - "mov 8(%[l]), %%rax;" - "sbb 8(%[r]), %%rax;" - "mov 16(%[l]), %%rax;" - "sbb 16(%[r]), %%rax;" - : "=@ccae" (cmp) - : [l] "r" (l), [r] "r" (r) - : "rax", "cc"); - - EBUG_ON(cmp != __wb_key_ref_cmp(l, r)); - return cmp; -#else - return __wb_key_ref_cmp(l, r); -#endif -} - -static int wb_key_seq_cmp(const void *_l, const void *_r) -{ - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; - - return cmp_int(l->journal_seq, r->journal_seq); -} - -/* Compare excluding idx, the low 24 bits: */ -static inline bool wb_key_eq(const void *_l, const void *_r) -{ - const struct wb_key_ref *l = _l; - const struct wb_key_ref *r = _r; - - return !((l->hi ^ r->hi)| - (l->mi ^ r->mi)| - ((l->lo >> 24) ^ (r->lo >> 24))); -} - -static noinline void wb_sort(struct wb_key_ref *base, size_t num) -{ - size_t n = num, a = num / 2; - - if (!a) /* num < 2 || size == 0 */ - return; - - for (;;) { - 
size_t b, c, d; - - if (a) /* Building heap: sift down --a */ - --a; - else if (--n) /* Sorting: Extract root to --n */ - swap(base[0], base[n]); - else /* Sort complete */ - break; - - /* - * Sift element at "a" down into heap. This is the - * "bottom-up" variant, which significantly reduces - * calls to cmp_func(): we find the sift-down path all - * the way to the leaves (one compare per level), then - * backtrack to find where to insert the target element. - * - * Because elements tend to sift down close to the leaves, - * this uses fewer compares than doing two per level - * on the way down. (A bit more than half as many on - * average, 3/4 worst-case.) - */ - for (b = a; c = 2*b + 1, (d = c + 1) < n;) - b = wb_key_ref_cmp(base + c, base + d) ? c : d; - if (d == n) /* Special case last leaf with no sibling */ - b = c; - - /* Now backtrack from "b" to the correct location for "a" */ - while (b != a && wb_key_ref_cmp(base + a, base + b)) - b = (b - 1) / 2; - c = b; /* Where "a" belongs */ - while (b != a) { /* Shift it into place */ - b = (b - 1) / 2; - swap(base[b], base[c]); - } - } -} - -static noinline int wb_flush_one_slowpath(struct btree_trans *trans, - struct btree_iter *iter, - struct btree_write_buffered_key *wb) -{ - struct btree_path *path = btree_iter_path(trans, iter); - - bch2_btree_node_unlock_write(trans, path, path->l[0].b); - - trans->journal_res.seq = wb->journal_seq; - - return bch2_trans_update(trans, iter, &wb->k, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_journal_res| - BCH_TRANS_COMMIT_journal_reclaim); -} - -static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter, - struct btree_write_buffered_key *wb, - bool *write_locked, - bool *accounting_accumulated, - size_t *fast) -{ - struct btree_path *path; - int ret; - - EBUG_ON(!wb->journal_seq); - EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); - EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); - - ret = bch2_btree_iter_traverse(trans, iter); - if (ret) - return ret; - - if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) { - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u); - - if (k.k->type == KEY_TYPE_accounting) - bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k), - bkey_s_c_to_accounting(k)); - } - *accounting_accumulated = true; - - /* - * We can't clone a path that has write locks: unshare it now, before - * set_pos and traverse(): - */ - if (btree_iter_path(trans, iter)->ref > 1) - iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_); - - path = btree_iter_path(trans, iter); - - if (!*write_locked) { - ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c); - if (ret) - return ret; - - bch2_btree_node_prep_for_write(trans, path, path->l[0].b); - *write_locked = true; - } - - if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) { - *write_locked = false; - return wb_flush_one_slowpath(trans, iter, wb); - } - - EBUG_ON(!bpos_eq(wb->k.k.p, path->pos)); - - bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); - (*fast)++; - return 0; -} - -/* - * Update a btree with a write buffered key using the journal seq of the - * original write buffer insert. - * - * It is not safe to rejournal the key once it has been inserted into the write - * buffer because that may break recovery ordering. 
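(Illustrative aside: wb_sort() above open-codes the kernel's "bottom-up" heapsort. Here is the same control flow transplanted onto a plain array of unsigned longs — element type and the >= comparison are simplifications for the sketch; the descend-to-leaf-then-backtrack structure mirrors the original.)

        #include <stdio.h>

        static void swap_ul(unsigned long *x, unsigned long *y)
        {
                unsigned long t = *x; *x = *y; *y = t;
        }

        static void bottom_up_heapsort(unsigned long *base, size_t num)
        {
                size_t n = num, a = num / 2;

                if (!a)                         /* num < 2 */
                        return;

                for (;;) {
                        size_t b, c, d;

                        if (a)                  /* building heap: sift down --a */
                                --a;
                        else if (--n)           /* sorting: extract root to --n */
                                swap_ul(&base[0], &base[n]);
                        else                    /* sort complete */
                                break;

                        /* Descend along the larger child all the way to a leaf
                         * (one compare per level)... */
                        for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
                                b = base[c] >= base[d] ? c : d;
                        if (d == n)             /* last leaf has no sibling */
                                b = c;

                        /* ...backtrack to where base[a] belongs... */
                        while (b != a && base[a] >= base[b])
                                b = (b - 1) / 2;

                        /* ...and rotate it into place along that path. */
                        c = b;
                        while (b != a) {
                                b = (b - 1) / 2;
                                swap_ul(&base[b], &base[c]);
                        }
                }
        }

        int main(void)
        {
                unsigned long v[] = { 5, 1, 4, 1, 5, 9, 2, 6 };
                bottom_up_heapsort(v, 8);
                for (int i = 0; i < 8; i++)
                        printf("%lu ", v[i]);
                printf("\n");
                return 0;
        }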
For example, the key may - * have already been modified in the active write buffer in a seq that comes - * before the current transaction. If we were to journal this key again and - * crash, recovery would process updates in the wrong order. - */ -static int -btree_write_buffered_insert(struct btree_trans *trans, - struct btree_write_buffered_key *wb) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), - BTREE_ITER_cached|BTREE_ITER_intent); - - trans->journal_res.seq = wb->journal_seq; - - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, &wb->k, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) -{ - struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer); - struct journal *j = &c->journal; - - if (!wb->inc.keys.nr) - return; - - bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin, - bch2_btree_write_buffer_journal_flush); - - darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr)); - darray_resize(&wb->sorted, wb->flushing.keys.size); - - if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) { - swap(wb->flushing.keys, wb->inc.keys); - goto out; - } - - size_t nr = min(darray_room(wb->flushing.keys), - wb->sorted.size - wb->flushing.keys.nr); - nr = min(nr, wb->inc.keys.nr); - - memcpy(&darray_top(wb->flushing.keys), - wb->inc.keys.data, - sizeof(wb->inc.keys.data[0]) * nr); - - memmove(wb->inc.keys.data, - wb->inc.keys.data + nr, - sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr)); - - wb->flushing.keys.nr += nr; - wb->inc.keys.nr -= nr; -out: - if (!wb->inc.keys.nr) - bch2_journal_pin_drop(j, &wb->inc.pin); - else - bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin, - bch2_btree_write_buffer_journal_flush); - - if (j->watermark) { - spin_lock(&j->lock); - bch2_journal_set_watermark(j); - spin_unlock(&j->lock); - } - - BUG_ON(wb->sorted.size < wb->flushing.keys.nr); -} - -int bch2_btree_write_buffer_insert_err(struct bch_fs *c, - enum btree_id btree, struct bkey_i *k) -{ - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "attempting to do write buffer update on non wb btree="); - bch2_btree_id_to_text(&buf, btree); - prt_str(&buf, "\n"); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - - bch2_fs_inconsistent(c, "%s", buf.buf); - printbuf_exit(&buf); - return -EROFS; -} - -static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_iter iter = {}; - size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; - bool write_locked = false; - bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); - int ret = 0; - - ret = bch2_journal_error(&c->journal); - if (ret) - return ret; - - bch2_trans_unlock(trans); - bch2_trans_begin(trans); - - mutex_lock(&wb->inc.lock); - move_keys_from_inc_to_flushing(wb); - mutex_unlock(&wb->inc.lock); - - for (size_t i = 0; i < wb->flushing.keys.nr; i++) { - wb->sorted.data[i].idx = i; - wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree; - memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos)); - } - wb->sorted.nr = wb->flushing.keys.nr; - - /* - * We first sort so that we can detect and 
skip redundant updates, and - * then we attempt to flush in sorted btree order, as this is most - * efficient. - * - * However, since we're not flushing in the order they appear in the - * journal we won't be able to drop our journal pin until everything is - * flushed - which means this could deadlock the journal if we weren't - * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail - * if it would block taking a journal reservation. - * - * If that happens, simply skip the key so we can optimistically insert - * as many keys as possible in the fast path. - */ - wb_sort(wb->sorted.data, wb->sorted.nr); - - darray_for_each(wb->sorted, i) { - struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; - - if (unlikely(!btree_type_uses_write_buffer(k->btree))) { - ret = bch2_btree_write_buffer_insert_err(trans->c, k->btree, &k->k); - goto err; - } - - for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) - prefetch(&wb->flushing.keys.data[n->idx]); - - BUG_ON(!k->journal_seq); - - if (!accounting_replay_done && - k->k.k.type == KEY_TYPE_accounting) { - slowpath++; - continue; - } - - if (i + 1 < &darray_top(wb->sorted) && - wb_key_eq(i, i + 1)) { - struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; - - if (k->k.k.type == KEY_TYPE_accounting && - n->k.k.type == KEY_TYPE_accounting) - bch2_accounting_accumulate(bkey_i_to_accounting(&n->k), - bkey_i_to_s_c_accounting(&k->k)); - - overwritten++; - n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); - k->journal_seq = 0; - continue; - } - - if (write_locked) { - struct btree_path *path = btree_iter_path(trans, &iter); - - if (path->btree_id != i->btree || - bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { - bch2_btree_node_unlock_write(trans, path, path->l[0].b); - write_locked = false; - - ret = lockrestart_do(trans, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_foreground_maybe_merge(trans, iter.path, 0, - BCH_WATERMARK_reclaim| - BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc)); - if (ret) - goto err; - } - } - - if (!iter.path || iter.btree_id != k->btree) { - bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, - BTREE_ITER_intent|BTREE_ITER_all_snapshots); - } - - bch2_btree_iter_set_pos(trans, &iter, k->k.k.p); - btree_iter_path(trans, &iter)->preserve = false; - - bool accounting_accumulated = false; - do { - if (race_fault()) { - ret = bch_err_throw(c, journal_reclaim_would_deadlock); - break; - } - - ret = wb_flush_one(trans, &iter, k, &write_locked, - &accounting_accumulated, &fast); - if (!write_locked) - bch2_trans_begin(trans); - } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); - - if (!ret) { - k->journal_seq = 0; - } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { - slowpath++; - ret = 0; - } else - break; - } - - if (write_locked) { - struct btree_path *path = btree_iter_path(trans, &iter); - bch2_btree_node_unlock_write(trans, path, path->l[0].b); - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - goto err; - - if (slowpath) { - /* - * Flush in the order they were present in the journal, so that - * we can release journal pins: - * The fastpath zapped the seq of keys that were successfully flushed so - * we can skip those here. 
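(Illustrative aside: the overwrite/dedup step in the flush loop above is easy to miss inside the full function. A reduced model of just that invariant, with simplified types and a hypothetical coalesce() helper: after sorting, only the newest entry for a given key is flushed, but it must inherit the *oldest* duplicate's journal_seq so the journal pin is not released before the surviving update reaches the btree.)

        #include <stdint.h>
        #include <stdio.h>

        struct wb_entry {
                uint64_t key;           /* stands in for (btree, pos) */
                uint64_t journal_seq;   /* 0 means "already handled" */
        };

        /* Assumes e[] is sorted by key, newest duplicate last, as wb_sort()
         * leaves it (idx, i.e. insertion order, is the low-order tiebreak). */
        static void coalesce(struct wb_entry *e, size_t nr)
        {
                for (size_t i = 0; i + 1 < nr; i++) {
                        if (e[i].key != e[i + 1].key)
                                continue;
                        /* Later entry wins, but keeps the older pin: */
                        if (e[i].journal_seq < e[i + 1].journal_seq)
                                e[i + 1].journal_seq = e[i].journal_seq;
                        e[i].journal_seq = 0;   /* overwritten: skip in flush */
                }
        }

        int main(void)
        {
                struct wb_entry e[] = { {1, 10}, {1, 12}, {2, 11} };

                coalesce(e, 3);
                for (size_t i = 0; i < 3; i++)
                        printf("key %llu seq %llu\n",
                               (unsigned long long) e[i].key,
                               (unsigned long long) e[i].journal_seq);
                return 0;
        }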
- */ - trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); - - sort_nonatomic(wb->flushing.keys.data, - wb->flushing.keys.nr, - sizeof(wb->flushing.keys.data[0]), - wb_key_seq_cmp, NULL); - - darray_for_each(wb->flushing.keys, i) { - if (!i->journal_seq) - continue; - - if (!accounting_replay_done && - i->k.k.type == KEY_TYPE_accounting) { - could_not_insert++; - continue; - } - - if (!could_not_insert) - bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, - bch2_btree_write_buffer_journal_flush); - - bch2_trans_begin(trans); - - ret = commit_do(trans, NULL, NULL, - BCH_WATERMARK_reclaim| - BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_journal_res , - btree_write_buffered_insert(trans, i)); - if (ret) - goto err; - - i->journal_seq = 0; - } - - /* - * If journal replay hasn't finished with accounting keys we - * can't flush accounting keys at all - condense them and leave - * them for next time. - * - * Q: Can the write buffer overflow? - * A Shouldn't be any actual risk. It's just new accounting - * updates that the write buffer can't flush, and those are only - * going to be generated by interior btree node updates as - * journal replay has to split/rewrite nodes to make room for - * its updates. - * - * And for those new acounting updates, updates to the same - * counters get accumulated as they're flushed from the journal - * to the write buffer - see the patch for eytzingcer tree - * accumulated. So we could only overflow if the number of - * distinct counters touched somehow was very large. - */ - if (could_not_insert) { - struct btree_write_buffered_key *dst = wb->flushing.keys.data; - - darray_for_each(wb->flushing.keys, i) - if (i->journal_seq) - *dst++ = *i; - wb->flushing.keys.nr = dst - wb->flushing.keys.data; - } - } -err: - if (ret || !could_not_insert) { - bch2_journal_pin_drop(j, &wb->flushing.pin); - wb->flushing.keys.nr = 0; - } - - bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret)); - trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0); - return ret; -} - -static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) -{ - struct journal_keys_to_wb dst; - int ret = 0; - - bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); - - for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { - jset_entry_for_each_key(entry, k) { - ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); - if (ret) - goto out; - } - - entry->type = BCH_JSET_ENTRY_btree_keys; - } -out: - ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; - return ret; -} - -static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) -{ - struct journal *j = &c->journal; - struct journal_buf *buf; - bool blocked; - int ret = 0; - - while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) { - ret = bch2_journal_keys_to_write_buffer(c, buf); - - if (!blocked && !ret) { - spin_lock(&j->lock); - buf->need_flush_to_write_buffer = false; - spin_unlock(&j->lock); - } - - mutex_unlock(&j->buf_lock); - - if (blocked) { - bch2_journal_unblock(j); - break; - } - } - - return ret; -} - -static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq, - bool *did_work) -{ - struct bch_fs *c = trans->c; - struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret = 0, fetch_from_journal_err; - - do { - bch2_trans_unlock(trans); - - 
fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq); - - *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr; - - /* - * On memory allocation failure, bch2_btree_write_buffer_flush_locked() - * is not guaranteed to empty wb->inc: - */ - mutex_lock(&wb->flushing.lock); - ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&wb->flushing.lock); - } while (!ret && - (fetch_from_journal_err || - (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) || - (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq))); - - return ret; -} - -static int bch2_btree_write_buffer_journal_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - bool did_work = false; - - return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work)); -} - -int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - bool did_work = false; - - trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); - - return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work); -} - -/* - * The write buffer requires flushing when going RO: keys in the journal for the - * write buffer don't have a journal pin yet - */ -bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c) -{ - if (bch2_journal_error(&c->journal)) - return false; - - bool did_work = false; - bch2_trans_run(c, btree_write_buffer_flush_seq(trans, - journal_cur_seq(&c->journal), &did_work)); - return did_work; -} - -int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret = 0; - - if (mutex_trylock(&wb->flushing.lock)) { - ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&wb->flushing.lock); - } - - return ret; -} - -int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer)) - return bch_err_throw(c, erofs_no_writes); - - int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); - return ret; -} - -/* - * In check and repair code, when checking references to write buffer btrees we - * need to issue a flush before we have a definitive error: this issues a flush - * if this is a key we haven't yet checked. - */ -int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, - struct bkey_s_c referring_k, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct bkey_buf tmp; - int ret = 0; - - bch2_bkey_buf_init(&tmp); - - if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { - if (trace_write_buffer_maybe_flush_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, referring_k); - trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf); - printbuf_exit(&buf); - } - - bch2_bkey_buf_reassemble(&tmp, c, referring_k); - - if (bkey_is_btree_ptr(referring_k.k)) { - bch2_trans_unlock(trans); - bch2_btree_interior_updates_flush(c); - } - - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - - bch2_bkey_buf_copy(last_flushed, c, tmp.k); - - /* can we avoid the unconditional restart? 
*/ - trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_); - ret = bch_err_throw(c, transaction_restart_write_buffer_flush); - } -err: - bch2_bkey_buf_exit(&tmp, c); - return ret; -} - -static void bch2_btree_write_buffer_flush_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); - struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret; - - mutex_lock(&wb->flushing.lock); - do { - ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); - } while (!ret && bch2_btree_write_buffer_should_flush(c)); - mutex_unlock(&wb->flushing.lock); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); -} - -static void wb_accounting_sort(struct btree_write_buffer *wb) -{ - eytzinger0_sort(wb->accounting.data, wb->accounting.nr, - sizeof(wb->accounting.data[0]), - wb_key_cmp, NULL); -} - -int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree, - struct bkey_i_accounting *k) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_write_buffered_key new = { .btree = btree }; - - bkey_copy(&new.k, &k->k_i); - - int ret = darray_push(&wb->accounting, new); - if (ret) - return ret; - - wb_accounting_sort(wb); - return 0; -} - -int bch2_journal_key_to_wb_slowpath(struct bch_fs *c, - struct journal_keys_to_wb *dst, - enum btree_id btree, struct bkey_i *k) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret; -retry: - ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL); - if (!ret && dst->wb == &wb->flushing) - ret = darray_resize(&wb->sorted, wb->flushing.keys.size); - - if (unlikely(ret)) { - if (dst->wb == &c->btree_write_buffer.flushing) { - mutex_unlock(&dst->wb->lock); - dst->wb = &c->btree_write_buffer.inc; - bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin, - bch2_btree_write_buffer_journal_flush); - goto retry; - } - - return ret; - } - - dst->room = darray_room(dst->wb->keys); - if (dst->wb == &wb->flushing) - dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); - BUG_ON(!dst->room); - BUG_ON(!dst->seq); - - struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); - wb_k->journal_seq = dst->seq; - wb_k->btree = btree; - bkey_copy(&wb_k->k, k); - dst->wb->keys.nr++; - dst->room--; - return 0; -} - -void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - if (mutex_trylock(&wb->flushing.lock)) { - mutex_lock(&wb->inc.lock); - move_keys_from_inc_to_flushing(wb); - - /* - * Attempt to skip wb->inc, and add keys directly to - * wb->flushing, saving us a copy later: - */ - - if (!wb->inc.keys.nr) { - dst->wb = &wb->flushing; - } else { - mutex_unlock(&wb->flushing.lock); - dst->wb = &wb->inc; - } - } else { - mutex_lock(&wb->inc.lock); - dst->wb = &wb->inc; - } - - dst->room = darray_room(dst->wb->keys); - if (dst->wb == &wb->flushing) - dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); - dst->seq = seq; - - bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, - bch2_btree_write_buffer_journal_flush); - - darray_for_each(wb->accounting, i) - memset(&i->k.v, 0, bkey_val_bytes(&i->k.k)); -} - -int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - unsigned live_accounting_keys = 0; - int ret = 0; - - darray_for_each(wb->accounting, i) - if 
(!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) { - i->journal_seq = dst->seq; - live_accounting_keys++; - ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k); - if (ret) - break; - } - - if (live_accounting_keys * 2 < wb->accounting.nr) { - struct btree_write_buffered_key *dst = wb->accounting.data; - - darray_for_each(wb->accounting, src) - if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k))) - *dst++ = *src; - wb->accounting.nr = dst - wb->accounting.data; - wb_accounting_sort(wb); - } - - if (!dst->wb->keys.nr) - bch2_journal_pin_drop(&c->journal, &dst->wb->pin); - - if (bch2_btree_write_buffer_should_flush(c) && - __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) && - !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); - - if (dst->wb == &wb->flushing) - mutex_unlock(&wb->flushing.lock); - mutex_unlock(&wb->inc.lock); - - return ret; -} - -static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) -{ - if (wb->keys.size >= new_size) - return 0; - - if (!mutex_trylock(&wb->lock)) - return -EINTR; - - int ret = darray_resize(&wb->keys, new_size); - mutex_unlock(&wb->lock); - return ret; -} - -int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - return wb_keys_resize(&wb->flushing, new_size) ?: - wb_keys_resize(&wb->inc, new_size); -} - -void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && - !bch2_journal_error(&c->journal)); - - darray_exit(&wb->accounting); - darray_exit(&wb->sorted); - darray_exit(&wb->flushing.keys); - darray_exit(&wb->inc.keys); -} - -void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - mutex_init(&wb->inc.lock); - mutex_init(&wb->flushing.lock); - INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); -} - -int bch2_fs_btree_write_buffer_init(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - /* Will be resized by journal as needed: */ - unsigned initial_size = 1 << 16; - - return darray_make_room(&wb->inc.keys, initial_size) ?: - darray_make_room(&wb->flushing.keys, initial_size) ?: - darray_make_room(&wb->sorted, initial_size); -} diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h deleted file mode 100644 index c351d21aca0b..000000000000 --- a/fs/bcachefs/btree_write_buffer.h +++ /dev/null @@ -1,113 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H -#define _BCACHEFS_BTREE_WRITE_BUFFER_H - -#include "bkey.h" -#include "disk_accounting.h" - -static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4; -} - -static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4; -} - -struct btree_trans; -int bch2_btree_write_buffer_flush_sync(struct btree_trans *); -bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *); -int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); -int bch2_btree_write_buffer_tryflush(struct btree_trans *); - -struct 
bkey_buf; -int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *); - -struct journal_keys_to_wb { - struct btree_write_buffer_keys *wb; - size_t room; - u64 seq; -}; - -static inline int wb_key_cmp(const void *_l, const void *_r) -{ - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; - - return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p); -} - -int bch2_accounting_key_to_wb_slowpath(struct bch_fs *, - enum btree_id, struct bkey_i_accounting *); - -static inline int bch2_accounting_key_to_wb(struct bch_fs *c, - enum btree_id btree, struct bkey_i_accounting *k) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_write_buffered_key search; - search.btree = btree; - search.k.k.p = k->k.p; - - unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr, - sizeof(wb->accounting.data[0]), - wb_key_cmp, &search); - - if (idx >= wb->accounting.nr) - return bch2_accounting_key_to_wb_slowpath(c, btree, k); - - struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k); - bch2_accounting_accumulate(dst, accounting_i_to_s_c(k)); - return 0; -} - -int bch2_journal_key_to_wb_slowpath(struct bch_fs *, - struct journal_keys_to_wb *, - enum btree_id, struct bkey_i *); - -static inline int __bch2_journal_key_to_wb(struct bch_fs *c, - struct journal_keys_to_wb *dst, - enum btree_id btree, struct bkey_i *k) -{ - if (unlikely(!dst->room)) - return bch2_journal_key_to_wb_slowpath(c, dst, btree, k); - - struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); - wb_k->journal_seq = dst->seq; - wb_k->btree = btree; - bkey_copy(&wb_k->k, k); - dst->wb->keys.nr++; - dst->room--; - return 0; -} - -static inline int bch2_journal_key_to_wb(struct bch_fs *c, - struct journal_keys_to_wb *dst, - enum btree_id btree, struct bkey_i *k) -{ - if (unlikely(!btree_type_uses_write_buffer(btree))) { - int ret = bch2_btree_write_buffer_insert_err(c, btree, k); - dump_stack(); - return ret; - } - - EBUG_ON(!dst->seq); - - return k->k.type == KEY_TYPE_accounting - ? 
bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k)) - : __bch2_journal_key_to_wb(c, dst, btree, k); -} - -void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64); -int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); - -int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); -void bch2_fs_btree_write_buffer_exit(struct bch_fs *); -void bch2_fs_btree_write_buffer_init_early(struct bch_fs *); -int bch2_fs_btree_write_buffer_init(struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h deleted file mode 100644 index e9e76e20f43b..000000000000 --- a/fs/bcachefs/btree_write_buffer_types.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H -#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H - -#include "darray.h" -#include "journal_types.h" - -#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 -#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) - -struct wb_key_ref { -union { - struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - unsigned idx:24; - u8 pos[sizeof(struct bpos)]; - enum btree_id btree:8; -#else - enum btree_id btree:8; - u8 pos[sizeof(struct bpos)]; - unsigned idx:24; -#endif - } __packed; - struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - u64 lo; - u64 mi; - u64 hi; -#else - u64 hi; - u64 mi; - u64 lo; -#endif - }; -}; -}; - -struct btree_write_buffered_key { - enum btree_id btree:8; - u64 journal_seq:56; - __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); -}; - -struct btree_write_buffer_keys { - DARRAY(struct btree_write_buffered_key) keys; - struct journal_entry_pin pin; - struct mutex lock; -}; - -struct btree_write_buffer { - DARRAY(struct wb_key_ref) sorted; - struct btree_write_buffer_keys inc; - struct btree_write_buffer_keys flushing; - struct work_struct flush_work; - - DARRAY(struct btree_write_buffered_key) accounting; -}; - -#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c deleted file mode 100644 index f25903c10e8a..000000000000 --- a/fs/bcachefs/buckets.c +++ /dev/null @@ -1,1395 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Code for manipulating bucket marks for garbage collection. - * - * Copyright 2014 Datera, Inc. 
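(Illustrative aside: the eytzinger0_find() lookup used by bch2_accounting_key_to_wb() above searches an implicit binary tree stored in breadth-first "Eytzinger" order — children of slot i live at 2i+1 and 2i+2, which is considerably more cache-friendly than binary search on a sorted array. A toy version over bare u64 keys, exact match only; the real helper compares full accounting positions and likewise returns an index >= nr on a miss, sending the caller to the slowpath.)

        #include <stdint.h>
        #include <stdio.h>

        static size_t eytzinger0_find_u64(const uint64_t *a, size_t nr,
                                          uint64_t search)
        {
                size_t i = 0;

                while (i < nr) {
                        if (a[i] == search)
                                return i;
                        /* go left on smaller, right on larger: */
                        i = 2 * i + 1 + (search > a[i]);
                }
                return i;       /* >= nr: not found */
        }

        int main(void)
        {
                /* Eytzinger layout of the sorted set {1..7}: */
                const uint64_t tree[] = { 4, 2, 6, 1, 3, 5, 7 };

                printf("%zu\n", eytzinger0_find_u64(tree, 7, 5)); /* 5: hit */
                printf("%zu\n", eytzinger0_find_u64(tree, 7, 9)); /* >= 7: miss */
                return 0;
        }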
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "backpointers.h" -#include "bset.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "buckets.h" -#include "buckets_waiting_for_journal.h" -#include "disk_accounting.h" -#include "ec.h" -#include "error.h" -#include "inode.h" -#include "movinggc.h" -#include "rebalance.h" -#include "recovery.h" -#include "recovery_passes.h" -#include "reflink.h" -#include "replicas.h" -#include "subvolume.h" -#include "trace.h" - -#include <linux/preempt.h> - -void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) -{ - for (unsigned i = 0; i < BCH_DATA_NR; i++) - usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets); -} - -void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage) -{ - memset(usage, 0, sizeof(*usage)); - acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, - sizeof(struct bch_dev_usage_full) / sizeof(u64)); -} - -static u64 reserve_factor(u64 r) -{ - return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); -} - -static struct bch_fs_usage_short -__bch2_fs_usage_read_short(struct bch_fs *c) -{ - struct bch_fs_usage_short ret; - u64 data, reserved; - - ret.capacity = c->capacity - - percpu_u64_get(&c->usage->hidden); - - data = percpu_u64_get(&c->usage->data) + - percpu_u64_get(&c->usage->btree); - reserved = percpu_u64_get(&c->usage->reserved) + - percpu_u64_get(c->online_reserved); - - ret.used = min(ret.capacity, data + reserve_factor(reserved)); - ret.free = ret.capacity - ret.used; - - ret.nr_inodes = percpu_u64_get(&c->usage->nr_inodes); - - return ret; -} - -struct bch_fs_usage_short -bch2_fs_usage_read_short(struct bch_fs *c) -{ - struct bch_fs_usage_short ret; - - percpu_down_read(&c->mark_lock); - ret = __bch2_fs_usage_read_short(c); - percpu_up_read(&c->mark_lock); - - return ret; -} - -void bch2_dev_usage_to_text(struct printbuf *out, - struct bch_dev *ca, - struct bch_dev_usage_full *usage) -{ - if (out->nr_tabstops < 5) { - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 12); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - } - - prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); - - for (unsigned i = 0; i < BCH_DATA_NR; i++) { - bch2_prt_data_type(out, i); - prt_printf(out, "\t%llu\r%llu\r%llu\r\n", - usage->d[i].buckets, - usage->d[i].sectors, - usage->d[i].fragmented); - } - - prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets); -} - -static int bch2_check_fix_ptr(struct btree_trans *trans, - struct bkey_s_c k, - struct extent_ptr_decoded p, - const union bch_extent_entry *entry, - bool *do_update) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); - if (!ca) { - if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID, - trans, ptr_to_invalid_device, - "pointer to missing device %u\n" - "while marking %s", - p.ptr.dev, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - return 0; - } - - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - if (!g) { - if (fsck_err(trans, ptr_to_invalid_device, - "pointer to invalid bucket on device %u\n" - "while marking %s", - p.ptr.dev, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - goto out; - } - - enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); - - if (fsck_err_on(!g->gen_valid, - trans, 
ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - } else { - /* this pointer will be dropped */ - *do_update = true; - goto out; - } - } - - /* g->gen_valid == true */ - - if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, - trans, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached && - (g->data_type != BCH_DATA_btree || - data_type == BCH_DATA_btree)) { - g->data_type = data_type; - g->stripe_sectors = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } - - *do_update = true; - } - - if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, - trans, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - - if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, - trans, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - - if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) - goto out; - - if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), - trans, ptr_bucket_data_type_mismatch, - "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached && - data_type == BCH_DATA_btree) { - switch (g->data_type) { - case BCH_DATA_sb: - bch_err(c, "btree and superblock in the same bucket - cannot repair"); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto out; - case BCH_DATA_journal: - ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr)); - bch_err_msg(c, ret, "error deleting journal bucket %zu", - PTR_BUCKET_NR(ca, &p.ptr)); - if (ret) - goto out; - break; - } - - g->data_type = data_type; - g->stripe_sectors = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } else { - *do_update = true; - } - } - - if (p.has_ec) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); - - if (fsck_err_on(!m || !m->alive, - trans, ptr_to_missing_stripe, - "pointer to nonexistent stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - - if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), - trans, ptr_to_incorrect_stripe, - "pointer does not match stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - } -out: -fsck_err: 
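(Illustrative aside: the gen checks above all rely on wraparound-safe comparison of 8-bit bucket generation numbers — gen_cmp() is, in essence, a subtraction cast to signed, the same trick as TCP sequence-number comparison. A standalone sketch of why that works, valid as long as the two gens are within half the number space of each other:)

        #include <stdint.h>
        #include <stdio.h>

        static inline int8_t gen_cmp_sketch(uint8_t a, uint8_t b)
        {
                return (int8_t) (a - b);        /* >0: a newer, <0: a older */
        }

        int main(void)
        {
                /* 2 is "newer" than 250 despite 2 < 250 numerically,
                 * because the u8 gen counter has wrapped: */
                printf("%d\n", gen_cmp_sketch(2, 250) > 0);     /* prints 1 */
                printf("%d\n", gen_cmp_sketch(250, 2) > 0);     /* prints 0 */
                return 0;
        }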
- bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -int bch2_check_fix_ptrs(struct btree_trans *trans, - enum btree_id btree, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry_c; - struct extent_ptr_decoded p = { 0 }; - bool do_update = false; - struct printbuf buf = PRINTBUF; - int ret = 0; - - /* We don't yet do btree key updates correctly for when we're RW */ - BUG_ON(test_bit(BCH_FS_rw, &c->flags)); - - bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { - ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); - if (ret) - goto err; - } - - if (do_update) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - scoped_guard(rcu) - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); - - if (level) { - /* - * We don't want to drop btree node pointers - if the - * btree node isn't there anymore, the read path will - * sort it out: - */ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - scoped_guard(rcu) - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen; - } - } else { - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; - - rcu_read_lock(); -restart_drop_ptrs: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); - - if ((p.ptr.cached && - (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || - (!p.ptr.cached && - gen_cmp(p.ptr.gen, g->gen) < 0) || - gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || - (g->data_type && - g->data_type != data_type)) { - bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); - goto restart_drop_ptrs; - } - } - rcu_read_unlock(); -again: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_extent_entry_for_each(ptrs, entry) { - if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, - entry->stripe_ptr.idx); - union bch_extent_entry *next_ptr; - - bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) - if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) - goto found; - next_ptr = NULL; -found: - if (!next_ptr) { - bch_err(c, "aieee, found stripe ptr with no data ptr"); - continue; - } - - if (!m || !m->alive || - !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], - &next_ptr->ptr, - m->sectors)) { - bch2_bkey_extent_entry_drop(new, entry); - goto again; - } - } - } - } - - if (0) { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, k); - bch_info(c, "updated %s", buf.buf); - - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); - bch_info(c, "new key %s", buf.buf); - } - - if (!(flags & BTREE_TRIGGER_is_root)) { - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, - BTREE_ITER_intent|BTREE_ITER_all_snapshots); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_internal_snapshot_node| - BTREE_TRIGGER_norun); - bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; - - if (level) - bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); - } else { - struct jset_entry *e = 
bch2_trans_jset_entry_alloc(trans, - jset_u64s(new->k.u64s)); - ret = PTR_ERR_OR_ZERO(e); - if (ret) - goto err; - - journal_entry_set(e, - BCH_JSET_ENTRY_btree_root, - btree, level - 1, - new, new->k.u64s); - - /* - * no locking, we're single threaded and not rw yet, see - * the big assertino above that we repeat here: - */ - BUG_ON(test_bit(BCH_FS_rw, &c->flags)); - - struct btree *b = bch2_btree_id_root(c, btree)->b; - bkey_copy(&b->key, new); - } - } -err: - printbuf_exit(&buf); - return ret; -} - -static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf, - struct bkey_s_c k, bool insert, enum bch_sb_error_id id) -{ - struct bch_fs *c = trans->c; - - prt_printf(buf, "\nwhile marking "); - bch2_bkey_val_to_text(buf, c, k); - prt_newline(buf); - - bool print = __bch2_count_fsck_err(c, id, buf); - - int ret = bch2_run_explicit_recovery_pass(c, buf, - BCH_RECOVERY_PASS_check_allocations, 0); - - if (insert) { - bch2_trans_updates_to_text(buf, trans); - __bch2_inconsistent_error(c, buf); - /* - * If we're in recovery, run_explicit_recovery_pass might give - * us an error code for rewinding recovery - */ - if (!ret) - ret = bch_err_throw(c, bucket_ref_update); - } else { - /* Always ignore overwrite errors, so that deletion works */ - ret = 0; - } - - if (print || insert) - bch2_print_str(c, KERN_ERR, buf->buf); - return ret; -} - -int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - s64 sectors, enum bch_data_type ptr_data_type, - u8 b_gen, u8 bucket_data_type, - u32 *bucket_sectors) -{ - struct bch_fs *c = trans->c; - size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); - struct printbuf buf = PRINTBUF; - bool inserting = sectors > 0; - int ret = 0; - - BUG_ON(!sectors); - - if (unlikely(gen_after(ptr->gen, b_gen))) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen); - goto out; - } - - if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_ptr_too_stale); - goto out; - } - - if (b_gen != ptr->gen && ptr->cached) { - ret = 1; - goto out; - } - - if (unlikely(b_gen != ptr->gen)) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)", - ptr->dev, bucket_nr, b_gen, - bucket_gen_get(ca, bucket_nr), - bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_stale_dirty_ptr); - goto out; - } - - if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type), - bch2_data_type_str(ptr_data_type)); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_ptr_bucket_data_type_mismatch); - goto out; - } - - if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) { - 
bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type ?: ptr_data_type), - *bucket_sectors, sectors); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_bucket_sector_count_overflow); - sectors = -*bucket_sectors; - goto out; - } - - *bucket_sectors += sectors; -out: - printbuf_exit(&buf); - return ret; -} - -void bch2_trans_account_disk_usage_change(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; - static int warned_disk_usage = 0; - bool warn = false; - - percpu_down_read(&c->mark_lock); - struct bch_fs_usage_base *src = &trans->fs_usage_delta; - - s64 added = src->btree + src->data + src->reserved; - - /* - * Not allowed to reduce sectors_available except by getting a - * reservation: - */ - s64 should_not_have_added = added - (s64) disk_res_sectors; - if (unlikely(should_not_have_added > 0)) { - u64 old, new; - - old = atomic64_read(&c->sectors_available); - do { - new = max_t(s64, 0, old - should_not_have_added); - } while (!atomic64_try_cmpxchg(&c->sectors_available, - &old, new)); - - added -= should_not_have_added; - warn = true; - } - - if (added > 0) { - trans->disk_res->sectors -= added; - this_cpu_sub(*c->online_reserved, added); - } - - preempt_disable(); - struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); - acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); - preempt_enable(); - percpu_up_read(&c->mark_lock); - - if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) - bch2_trans_inconsistent(trans, - "disk usage increased %lli more than %llu sectors reserved)", - should_not_have_added, disk_res_sectors); -} - -/* KEY_TYPE_extent: */ - -static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, - struct bkey_s_c k, - const struct extent_ptr_decoded *p, - s64 sectors, enum bch_data_type ptr_data_type, - struct bch_alloc_v4 *a, - bool insert) -{ - u32 *dst_sectors = p->has_ec ? &a->stripe_sectors : - !p->ptr.cached ? &a->dirty_sectors : - &a->cached_sectors; - int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type, - a->gen, a->data_type, dst_sectors); - - if (ret) - return ret; - if (insert) - alloc_data_type_set(a, ptr_data_type); - return 0; -} - -static int bch2_trigger_pointer(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - const union bch_extent_entry *entry, - s64 *sectors, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - bool insert = !(flags & BTREE_TRIGGER_overwrite); - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bkey_i_backpointer bp; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp); - - *sectors = insert ? 
bp.v.bucket_len : -(s64) bp.v.bucket_len; - - struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); - if (unlikely(!ca)) { - if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) - ret = bch_err_throw(c, trigger_pointer); - goto err; - } - - struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); - if (!bucket_valid(ca, bucket.offset)) { - if (insert) { - bch2_dev_bucket_missing(ca, bucket.offset); - ret = bch_err_throw(c, trigger_pointer); - } - goto err; - } - - if (flags & BTREE_TRIGGER_transactional) { - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); - ret = PTR_ERR_OR_ZERO(a) ?: - __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert); - if (ret) - goto err; - - ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); - if (ret) - goto err; - } - - if (flags & BTREE_TRIGGER_gc) { - struct bucket *g = gc_bucket(ca, bucket.offset); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", - p.ptr.dev, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch_err_throw(c, trigger_pointer); - goto err; - } - - bucket_lock(g); - struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); - alloc_to_bucket(g, new); - bucket_unlock(g); - - if (!ret) - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); - } -err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -static int bch2_trigger_stripe_ptr(struct btree_trans *trans, - struct bkey_s_c k, - struct extent_ptr_decoded p, - enum bch_data_type data_type, - s64 sectors, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - - if (flags & BTREE_TRIGGER_transactional) { - struct btree_iter iter; - struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_with_updates, stripe); - int ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, - "pointer to nonexistent stripe %llu", - (u64) p.ec.idx); - goto err; - } - - if (!bch2_ptr_matches_stripe(&s->v, p)) { - bch2_trans_inconsistent(trans, - "stripe pointer doesn't match stripe %llu", - (u64) p.ec.idx); - ret = bch_err_throw(c, trigger_stripe_pointer); - goto err; - } - - stripe_blockcount_set(&s->v, p.ec.block, - stripe_blockcount_get(&s->v, p.ec.block) + - sectors); - - struct disk_accounting_pos acc; - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); - acc.replicas.data_type = data_type; - ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; - } - - if (flags & BTREE_TRIGGER_gc) { - struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); - if (!m) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", - (u64) p.ec.idx); - return bch_err_throw(c, ENOMEM_mark_stripe_ptr); - } - - gc_stripe_lock(m); - - if (!m || !m->alive) { - gc_stripe_unlock(m); - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ", - (u64) p.ec.idx); - bch2_bkey_val_to_text(&buf, c, k); - __bch2_inconsistent_error(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return bch_err_throw(c, trigger_stripe_pointer); - } - - m->block_sectors[p.ec.block] += sectors; - - struct disk_accounting_pos acc; - memset(&acc, 0, 
sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA"); - gc_stripe_unlock(m); - - acc.replicas.data_type = data_type; - int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true); - if (ret) - return ret; - } - - return 0; -} - -static int __trigger_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) -{ - bool gc = flags & BTREE_TRIGGER_gc; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - enum bch_data_type data_type = bkey_is_btree_ptr(k.k) - ? BCH_DATA_btree - : BCH_DATA_user; - int ret = 0; - - s64 replicas_sectors = 0; - - struct disk_accounting_pos acc_replicas_key; - memset(&acc_replicas_key, 0, sizeof(acc_replicas_key)); - acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas; - acc_replicas_key.replicas.data_type = data_type; - acc_replicas_key.replicas.nr_devs = 0; - acc_replicas_key.replicas.nr_required = 1; - - unsigned cur_compression_type = 0; - u64 compression_acct[3] = { 1, 0, 0 }; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = 0; - ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); - if (ret < 0) - return ret; - - bool stale = ret > 0; - - if (p.ptr.cached && stale) - continue; - - if (p.ptr.cached) { - ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc); - if (ret) - return ret; - } else if (!p.has_ec) { - replicas_sectors += disk_sectors; - replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev); - } else { - ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); - if (ret) - return ret; - - /* - * There may be other dirty pointers in this extent, but - * if so they're not required for mounting if we have an - * erasure coded pointer in this extent: - */ - acc_replicas_key.replicas.nr_required = 0; - } - - if (cur_compression_type && - cur_compression_type != p.crc.compression_type) { - if (flags & BTREE_TRIGGER_overwrite) - bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - - ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, - compression, cur_compression_type); - if (ret) - return ret; - - compression_acct[0] = 1; - compression_acct[1] = 0; - compression_acct[2] = 0; - } - - cur_compression_type = p.crc.compression_type; - if (p.crc.compression_type) { - compression_acct[1] += p.crc.uncompressed_size; - compression_acct[2] += p.crc.compressed_size; - } - } - - if (acc_replicas_key.replicas.nr_devs) { - ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); - if (ret) - return ret; - } - - if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot); - if (ret) - return ret; - } - - if (cur_compression_type) { - if (flags & BTREE_TRIGGER_overwrite) - bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - - ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, - compression, cur_compression_type); - if (ret) - return ret; - } - - if (level) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id); - if (ret) - return ret; - } else { - bool insert = !(flags & BTREE_TRIGGER_overwrite); - - s64 v[3] = { - insert ? 1 : -1, - insert ?
k.k->size : -((s64) k.k->size), - replicas_sectors, - }; - ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); - if (ret) - return ret; - } - - return 0; -} - -int bch2_trigger_extent(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); - struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); - unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; - unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start; - - if (unlikely(flags & BTREE_TRIGGER_check_repair)) - return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags); - - /* if pointers aren't changing - nothing to do: */ - if (new_ptrs_bytes == old_ptrs_bytes && - !memcmp(new_ptrs.start, - old_ptrs.start, - new_ptrs_bytes)) - return 0; - - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - if (old.k->type) { - int ret = __trigger_extent(trans, btree, level, old, - flags & ~BTREE_TRIGGER_insert); - if (ret) - return ret; - } - - if (new.k->type) { - int ret = __trigger_extent(trans, btree, level, new.s_c, - flags & ~BTREE_TRIGGER_overwrite); - if (ret) - return ret; - } - - int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta[1] = { 0 }; - - s64 s = bch2_bkey_sectors_need_rebalance(c, old); - need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta[0] -= s; - - s = bch2_bkey_sectors_need_rebalance(c, new.s_c); - need_rebalance_delta += s != 0; - need_rebalance_sectors_delta[0] += s; - - if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - new.k->p, need_rebalance_delta > 0); - if (ret) - return ret; - } - - if (need_rebalance_sectors_delta[0]) { - int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, - need_rebalance_sectors_delta, rebalance_work); - if (ret) - return ret; - } - } - - return 0; -} - -/* KEY_TYPE_reservation */ - -static int __trigger_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) -{ - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 sectors[1] = { k.k->size }; - - if (flags & BTREE_TRIGGER_overwrite) - sectors[0] = -sectors[0]; - - return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors, - persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas); - } - - return 0; -} - -int bch2_trigger_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags); -} - -/* Mark superblocks: */ - -static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, u64 b, - enum bch_data_type type, - unsigned sectors) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - int ret = 0; - - struct bkey_i_alloc_v4 *a = - bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b)); - if (IS_ERR(a)) - return PTR_ERR(a); - - if (a->v.data_type && type && a->v.data_type != type) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s\n", - 
iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - bch2_data_type_str(type), - bch2_data_type_str(type)); - - bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); - - ret = bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_allocations, 0); - - /* Always print, this is always fatal */ - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - if (!ret) - ret = bch_err_throw(c, metadata_bucket_inconsistency); - goto err; - } - - if (a->v.data_type != type || - a->v.dirty_sectors != sectors) { - a->v.data_type = type; - a->v.dirty_sectors = sectors; - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, - u64 b, enum bch_data_type data_type, unsigned sectors, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - struct bucket *g = gc_bucket(ca, b); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", - ca->dev_idx, bch2_data_type_str(data_type))) - goto err; - - bucket_lock(g); - struct bch_alloc_v4 old = bucket_m_to_alloc(*g); - - if (bch2_fs_inconsistent_on(g->data_type && - g->data_type != data_type, c, - "different types of data in same bucket: %s, %s", - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type))) - goto err_unlock; - - if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, - "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", - ca->dev_idx, b, g->gen, - bch2_data_type_str(g->data_type ?: data_type), - g->dirty_sectors, sectors)) - goto err_unlock; - - g->data_type = data_type; - g->dirty_sectors += sectors; - struct bch_alloc_v4 new = bucket_m_to_alloc(*g); - bucket_unlock(g); - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); - return ret; -err_unlock: - bucket_unlock(g); -err: - return bch_err_throw(c, metadata_bucket_inconsistency); -} - -int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, u64 b, - enum bch_data_type type, unsigned sectors, - enum btree_iter_update_trigger_flags flags) -{ - BUG_ON(type != BCH_DATA_free && - type != BCH_DATA_sb && - type != BCH_DATA_journal); - - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) - return 0; - - if (flags & BTREE_TRIGGER_gc) - return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags); - else if (flags & BTREE_TRIGGER_transactional) - return commit_do(trans, NULL, NULL, 0, - __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); - else - BUG(); -} - -static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, - struct bch_dev *ca, u64 start, u64 end, - enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors, - enum btree_iter_update_trigger_flags flags) -{ - do { - u64 b = sector_to_bucket(ca, start); - unsigned sectors = - min_t(u64, bucket_to_sector(ca, b + 1), end) - start; - - if (b != *bucket && *bucket_sectors) { - int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, - type, *bucket_sectors, flags); - if (ret) - return ret; - - *bucket_sectors = 0; - } - - *bucket = b; - *bucket_sectors += sectors; - start += sectors; - } while (start < end); - - return 0; -} - -static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca, - enum btree_iter_update_trigger_flags 
flags) -{ - struct bch_fs *c = trans->c; - - mutex_lock(&c->sb_lock); - struct bch_sb_layout layout = ca->disk_sb.sb->layout; - mutex_unlock(&c->sb_lock); - - u64 bucket = 0; - unsigned i, bucket_sectors = 0; - int ret; - - for (i = 0; i < layout.nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout.sb_offset[i]); - - if (offset == BCH_SB_SECTOR) { - ret = bch2_trans_mark_metadata_sectors(trans, ca, - 0, BCH_SB_SECTOR, - BCH_DATA_sb, &bucket, &bucket_sectors, flags); - if (ret) - return ret; - } - - ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, - offset + (1 << layout.sb_max_size_bits), - BCH_DATA_sb, &bucket, &bucket_sectors, flags); - if (ret) - return ret; - } - - if (bucket_sectors) { - ret = bch2_trans_mark_metadata_bucket(trans, ca, - bucket, BCH_DATA_sb, bucket_sectors, flags); - if (ret) - return ret; - } - - for (i = 0; i < ca->journal.nr; i++) { - ret = bch2_trans_mark_metadata_bucket(trans, ca, - ca->journal.buckets[i], - BCH_DATA_journal, ca->mi.bucket_size, flags); - if (ret) - return ret; - } - - return 0; -} - -int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, - enum btree_iter_update_trigger_flags flags) -{ - int ret = bch2_trans_run(c, - __bch2_trans_mark_dev_sb(trans, ca, flags)); - bch_err_fn(c, ret); - return ret; -} - -int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, - enum btree_iter_update_trigger_flags flags) -{ - for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) { - int ret = bch2_trans_mark_dev_sb(c, ca, flags); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs); - return ret; - } - } - - return 0; -} - -int bch2_trans_mark_dev_sbs(struct bch_fs *c) -{ - return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); -} - -bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - u64 b_offset = bucket_to_sector(ca, b); - u64 b_end = bucket_to_sector(ca, b + 1); - unsigned i; - - if (!b) - return true; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - u64 end = offset + (1 << layout->sb_max_size_bits); - - if (!(offset >= b_end || end <= b_offset)) - return true; - } - - for (i = 0; i < ca->journal.nr; i++) - if (b == ca->journal.buckets[i]) - return true; - - return false; -} - -/* Disk reservations: */ - -#define SECTORS_CACHE 1024 - -int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - u64 sectors, enum bch_reservation_flags flags) -{ - struct bch_fs_pcpu *pcpu; - u64 old, get; - u64 sectors_available; - int ret; - - percpu_down_read(&c->mark_lock); - preempt_disable(); - pcpu = this_cpu_ptr(c->pcpu); - - if (sectors <= pcpu->sectors_available) - goto out; - - old = atomic64_read(&c->sectors_available); - do { - get = min((u64) sectors + SECTORS_CACHE, old); - - if (get < sectors) { - preempt_enable(); - goto recalculate; - } - } while (!atomic64_try_cmpxchg(&c->sectors_available, - &old, old - get)); - - pcpu->sectors_available += get; - -out: - pcpu->sectors_available -= sectors; - this_cpu_add(*c->online_reserved, sectors); - res->sectors += sectors; - - preempt_enable(); - percpu_up_read(&c->mark_lock); - return 0; - -recalculate: - mutex_lock(&c->sectors_available_lock); - - percpu_u64_set(&c->pcpu->sectors_available, 0); - sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); - - if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL)) - sectors = min(sectors, sectors_available); - - if (sectors 
<= sectors_available || - (flags & BCH_DISK_RESERVATION_NOFAIL)) { - atomic64_set(&c->sectors_available, - max_t(s64, 0, sectors_available - sectors)); - this_cpu_add(*c->online_reserved, sectors); - res->sectors += sectors; - ret = 0; - } else { - atomic64_set(&c->sectors_available, sectors_available); - ret = bch_err_throw(c, ENOSPC_disk_reservation); - } - - mutex_unlock(&c->sectors_available_lock); - percpu_up_read(&c->mark_lock); - - return ret; -} - -/* Startup/shutdown: */ - -void bch2_buckets_nouse_free(struct bch_fs *c) -{ - for_each_member_device(c, ca) { - kvfree_rcu_mightsleep(ca->buckets_nouse); - ca->buckets_nouse = NULL; - } -} - -int bch2_buckets_nouse_alloc(struct bch_fs *c) -{ - for_each_member_device(c, ca) { - BUG_ON(ca->buckets_nouse); - - ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO); - if (!ca->buckets_nouse) { - bch2_dev_put(ca); - return bch_err_throw(c, ENOMEM_buckets_nouse); - } - } - - return 0; -} - -static void bucket_gens_free_rcu(struct rcu_head *rcu) -{ - struct bucket_gens *buckets = - container_of(rcu, struct bucket_gens, rcu); - - kvfree(buckets); -} - -int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -{ - struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; - bool resize = ca->bucket_gens != NULL; - int ret; - - if (resize) - lockdep_assert_held(&c->state_lock); - - if (resize && ca->buckets_nouse) - return bch_err_throw(c, no_resize_with_buckets_nouse); - - bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), - GFP_KERNEL|__GFP_ZERO); - if (!bucket_gens) { - ret = bch_err_throw(c, ENOMEM_bucket_gens); - goto err; - } - - bucket_gens->first_bucket = ca->mi.first_bucket; - bucket_gens->nbuckets = nbuckets; - bucket_gens->nbuckets_minus_first = - bucket_gens->nbuckets - bucket_gens->first_bucket; - - old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); - - if (resize) { - u64 copy = min(bucket_gens->nbuckets, - old_bucket_gens->nbuckets); - memcpy(bucket_gens->b, - old_bucket_gens->b, - sizeof(bucket_gens->b[0]) * copy); - } - - ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch, - ca->mi.nbuckets, nbuckets) ?: - bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty, - ca->mi.nbuckets, nbuckets); - - rcu_assign_pointer(ca->bucket_gens, bucket_gens); - bucket_gens = old_bucket_gens; - - nbuckets = ca->mi.nbuckets; - - ret = 0; -err: - if (bucket_gens) - call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); - - return ret; -} - -void bch2_dev_buckets_free(struct bch_dev *ca) -{ - kvfree(ca->buckets_nouse); - kvfree(rcu_dereference_protected(ca->bucket_gens, 1)); - free_percpu(ca->usage); -} - -int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) -{ - ca->usage = alloc_percpu(struct bch_dev_usage_full); - if (!ca->usage) - return bch_err_throw(c, ENOMEM_usage_init); - - return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); -} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h deleted file mode 100644 index 49a3807a5eab..000000000000 --- a/fs/bcachefs/buckets.h +++ /dev/null @@ -1,369 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Code for manipulating bucket marks for garbage collection. - * - * Copyright 2014 Datera, Inc. 
- */ - -#ifndef _BUCKETS_H -#define _BUCKETS_H - -#include "buckets_types.h" -#include "extents.h" -#include "sb-members.h" - -static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s) -{ - return div_u64(s, ca->mi.bucket_size); -} - -static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) -{ - return ((sector_t) b) * ca->mi.bucket_size; -} - -static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -{ - u32 remainder; - - div_u64_rem(s, ca->mi.bucket_size, &remainder); - return remainder; -} - -static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset) -{ - return div_u64_rem(s, ca->mi.bucket_size, offset); -} - -#define for_each_bucket(_b, _buckets) \ - for (_b = (_buckets)->b + (_buckets)->first_bucket; \ - _b < (_buckets)->b + (_buckets)->nbuckets; _b++) - -static inline void bucket_unlock(struct bucket *b) -{ - BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); - - clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); - smp_mb__after_atomic(); - wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR); -} - -static inline void bucket_lock(struct bucket *b) -{ - wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR, - TASK_UNINTERRUPTIBLE); -} - -static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) -{ - return bucket_valid(ca, b) - ? genradix_ptr(&ca->buckets_gc, b) - : NULL; -} - -static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) -{ - return rcu_dereference_check(ca->bucket_gens, - lockdep_is_held(&ca->fs->state_lock)); -} - -static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) -{ - struct bucket_gens *gens = bucket_gens(ca); - - if (b - gens->first_bucket >= gens->nbuckets_minus_first) - return NULL; - return gens->b + b; -} - -static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b) -{ - u8 *gen = bucket_gen(ca, b); - return gen ? *gen : -1; -} - -static inline int bucket_gen_get(struct bch_dev *ca, size_t b) -{ - guard(rcu)(); - return bucket_gen_get_rcu(ca, b); -} - -static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return sector_to_bucket(ca, ptr->offset); -} - -static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); -} - -static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca, - const struct bch_extent_ptr *ptr, - u32 *bucket_offset) -{ - return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); -} - -static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); -} - -static inline enum bch_data_type ptr_data_type(const struct bkey *k, - const struct bch_extent_ptr *ptr) -{ - if (bkey_is_btree_ptr(k)) - return BCH_DATA_btree; - - return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; -} - -static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) -{ - EBUG_ON(sectors < 0); - - return crc_is_compressed(p.crc) - ? 
DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, - p.crc.uncompressed_size) - : sectors; -} - -static inline int gen_cmp(u8 a, u8 b) -{ - return (s8) (a - b); -} - -static inline int gen_after(u8 a, u8 b) -{ - return max(0, gen_cmp(a, b)); -} - -static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) -{ - int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr)); - return gen < 0 ? gen : gen_after(gen, ptr->gen); -} - -/** - * dev_ptr_stale() - check if a pointer points into a bucket that has been - * invalidated. - */ -static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) -{ - guard(rcu)(); - return dev_ptr_stale_rcu(ca, ptr); -} - -/* Device usage: */ - -void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *); -static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) -{ - struct bch_dev_usage ret; - - bch2_dev_usage_read_fast(ca, &ret); - return ret; -} - -void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *); -static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca) -{ - struct bch_dev_usage_full ret; - - bch2_dev_usage_full_read_fast(ca, &ret); - return ret; -} - -void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *); - -static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) -{ - s64 reserved = 0; - - switch (watermark) { - case BCH_WATERMARK_NR: - BUG(); - case BCH_WATERMARK_stripe: - reserved += ca->mi.nbuckets >> 6; - fallthrough; - case BCH_WATERMARK_normal: - reserved += ca->mi.nbuckets >> 6; - fallthrough; - case BCH_WATERMARK_copygc: - reserved += ca->nr_btree_reserve; - fallthrough; - case BCH_WATERMARK_btree: - reserved += ca->nr_btree_reserve; - fallthrough; - case BCH_WATERMARK_btree_copygc: - case BCH_WATERMARK_reclaim: - case BCH_WATERMARK_interior_updates: - break; - } - - return reserved; -} - -static inline u64 dev_buckets_free(struct bch_dev *ca, - struct bch_dev_usage usage, - enum bch_watermark watermark) -{ - return max_t(s64, 0, - usage.buckets[BCH_DATA_free]- - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, watermark)); -} - -static inline u64 __dev_buckets_available(struct bch_dev *ca, - struct bch_dev_usage usage, - enum bch_watermark watermark) -{ - return max_t(s64, 0, - usage.buckets[BCH_DATA_free] - + usage.buckets[BCH_DATA_cached] - + usage.buckets[BCH_DATA_need_gc_gens] - + usage.buckets[BCH_DATA_need_discard] - - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, watermark)); -} - -static inline u64 dev_buckets_available(struct bch_dev *ca, - enum bch_watermark watermark) -{ - return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark); -} - -/* Filesystem usage: */ - -struct bch_fs_usage_short -bch2_fs_usage_read_short(struct bch_fs *); - -int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *, - struct bkey_s_c, const struct bch_extent_ptr *, - s64, enum bch_data_type, u8, u8, u32 *); - -int bch2_check_fix_ptrs(struct btree_trans *, - enum btree_id, unsigned, struct bkey_s_c, - enum btree_iter_update_trigger_flags); - -int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); -int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, 
_old, _new, _flags)\ -({ \ - int ret = 0; \ - \ - if (_old.k->type) \ - ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \ - if (!ret && _new.k->type) \ - ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\ - ret; \ -}) - -void bch2_trans_account_disk_usage_change(struct btree_trans *); - -int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64, - enum bch_data_type, unsigned, - enum btree_iter_update_trigger_flags); -int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *, - enum btree_iter_update_trigger_flags); -int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, - enum btree_iter_update_trigger_flags); -int bch2_trans_mark_dev_sbs(struct bch_fs *); - -bool bch2_is_superblock_bucket(struct bch_dev *, u64); - -static inline const char *bch2_data_type_str(enum bch_data_type type) -{ - return type < BCH_DATA_NR - ? __bch2_data_types[type] - : "(invalid data type)"; -} - -/* disk reservations: */ - -static inline void bch2_disk_reservation_put(struct bch_fs *c, - struct disk_reservation *res) -{ - if (res->sectors) { - this_cpu_sub(*c->online_reserved, res->sectors); - res->sectors = 0; - } -} - -enum bch_reservation_flags { - BCH_DISK_RESERVATION_NOFAIL = 1 << 0, - BCH_DISK_RESERVATION_PARTIAL = 1 << 1, -}; - -int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *, - u64, enum bch_reservation_flags); - -static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - u64 sectors, enum bch_reservation_flags flags) -{ -#ifdef __KERNEL__ - u64 old, new; - - old = this_cpu_read(c->pcpu->sectors_available); - do { - if (sectors > old) - return __bch2_disk_reservation_add(c, res, sectors, flags); - - new = old - sectors; - } while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new)); - - this_cpu_add(*c->online_reserved, sectors); - res->sectors += sectors; - return 0; -#else - return __bch2_disk_reservation_add(c, res, sectors, flags); -#endif -} - -static inline struct disk_reservation -bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) -{ - return (struct disk_reservation) { - .sectors = 0, -#if 0 - /* not used yet: */ - .gen = c->capacity_gen, -#endif - .nr_replicas = nr_replicas, - }; -} - -static inline int bch2_disk_reservation_get(struct bch_fs *c, - struct disk_reservation *res, - u64 sectors, unsigned nr_replicas, - int flags) -{ - *res = bch2_disk_reservation_init(c, nr_replicas); - - return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); -} - -#define RESERVE_FACTOR 6 - -static inline u64 avail_factor(u64 r) -{ - return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); -} - -void bch2_buckets_nouse_free(struct bch_fs *); -int bch2_buckets_nouse_alloc(struct bch_fs *); - -int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); -void bch2_dev_buckets_free(struct bch_dev *); -int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); - -#endif /* _BUCKETS_H */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h deleted file mode 100644 index 0aed2500ade3..000000000000 --- a/fs/bcachefs/buckets_types.h +++ /dev/null @@ -1,100 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BUCKETS_TYPES_H -#define _BUCKETS_TYPES_H - -#include "bcachefs_format.h" -#include "util.h" - -#define BUCKET_JOURNAL_SEQ_BITS 16 - -/* - * Ugly hack alert: - * - * We need to cram a spinlock in a single byte, because that's what we have left - * in struct bucket, and we care about the size 
of these - during fsck, we need - in memory state for every single bucket on every device. - * - * We used to do - * while (xchg(&b->lock, 1)) cpu_relax(); - * but, it turns out not all architectures support xchg on a single byte. - * - * So now we use bit_spin_lock(), with fun games since we can't burn a whole - * ulong for this - we just need to make sure the lock bit always ends up in the - * first byte. - */ - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define BUCKET_LOCK_BITNR 0 -#else -#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) -#endif - -union ulong_byte_assert { - ulong ulong; - u8 byte; -}; - -struct bucket { - u8 lock; - u8 gen_valid:1; - u8 data_type:7; - u8 gen; - u8 stripe_redundancy; - u32 stripe; - u32 dirty_sectors; - u32 cached_sectors; - u32 stripe_sectors; -} __aligned(sizeof(long)); - -struct bucket_gens { - struct rcu_head rcu; - u16 first_bucket; - size_t nbuckets; - size_t nbuckets_minus_first; - u8 b[] __counted_by(nbuckets); -}; - -/* Only info on bucket counts: */ -struct bch_dev_usage { - u64 buckets[BCH_DATA_NR]; -}; - -struct bch_dev_usage_full { - struct bch_dev_usage_type { - u64 buckets; - u64 sectors; /* _compressed_ sectors: */ - /* - * XXX - * Why do we have this? Isn't it just buckets * bucket_size - - * sectors? - */ - u64 fragmented; - } d[BCH_DATA_NR]; -}; - -struct bch_fs_usage_base { - u64 hidden; - u64 btree; - u64 data; - u64 cached; - u64 reserved; - u64 nr_inodes; -}; - -struct bch_fs_usage_short { - u64 capacity; - u64 used; - u64 free; - u64 nr_inodes; -}; - -/* - * A reservation for space on disk: - */ -struct disk_reservation { - u64 sectors; - u32 gen; - unsigned nr_replicas; -}; - -#endif /* _BUCKETS_TYPES_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c deleted file mode 100644 index 832eff93acb6..000000000000 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "buckets_waiting_for_journal.h" -#include <linux/hash.h> -#include <linux/random.h> - -static inline struct bucket_hashed * -bucket_hash(struct buckets_waiting_for_journal_table *t, - unsigned hash_seed_idx, u64 dev_bucket) -{ - return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits); -} - -static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits) -{ - unsigned i; - - t->bits = bits; - for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) - get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); - memset(t->d, 0, sizeof(t->d[0]) << t->bits); -} - -u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b, - unsigned dev, u64 bucket) -{ - struct buckets_waiting_for_journal_table *t; - u64 dev_bucket = (u64) dev << 56 | bucket; - u64 ret = 0; - - mutex_lock(&b->lock); - t = b->t; - - for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { - struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); - - if (h->dev_bucket == dev_bucket) { - ret = h->journal_seq; - break; - } - } - - mutex_unlock(&b->lock); - - return ret; - } - -static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, - struct bucket_hashed *new, - u64 flushed_seq) -{ - struct bucket_hashed *last_evicted = NULL; - unsigned tries, i; - - for (tries = 0; tries < 10; tries++) { - struct bucket_hashed *old, *victim = NULL; - - for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { - old = bucket_hash(t, i, new->dev_bucket); - - if (old->dev_bucket == new->dev_bucket ||
old->journal_seq <= flushed_seq) { - *old = *new; - return true; - } - - if (last_evicted != old) - victim = old; - } - - /* hashed to same slot 3 times: */ - if (!victim) - break; - - /* Failed to find an empty slot: */ - swap(*new, *victim); - last_evicted = victim; - } - - return false; -} - -int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, - u64 flushed_seq, - unsigned dev, u64 bucket, - u64 journal_seq) -{ - struct buckets_waiting_for_journal_table *t, *n; - struct bucket_hashed tmp, new = { - .dev_bucket = (u64) dev << 56 | bucket, - .journal_seq = journal_seq, - }; - size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0; - int ret = 0; - - mutex_lock(&b->lock); - - if (likely(bucket_table_insert(b->t, &new, flushed_seq))) - goto out; - - t = b->t; - size = 1UL << t->bits; - for (i = 0; i < size; i++) - nr_elements += t->d[i].journal_seq > flushed_seq; - - new_bits = ilog2(roundup_pow_of_two(nr_elements * 3)); -realloc: - n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); - if (!n) { - struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal); - ret = bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set); - goto out; - } - -retry_rehash: - if (nr_rehashes_this_size == 3) { - new_bits++; - nr_rehashes_this_size = 0; - kvfree(n); - goto realloc; - } - - nr_rehashes++; - nr_rehashes_this_size++; - - bucket_table_init(n, new_bits); - - tmp = new; - BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); - - for (i = 0; i < 1UL << t->bits; i++) { - if (t->d[i].journal_seq <= flushed_seq) - continue; - - tmp = t->d[i]; - if (!bucket_table_insert(n, &tmp, flushed_seq)) - goto retry_rehash; - } - - b->t = n; - kvfree(t); - - pr_debug("took %zu rehashes, table at %zu/%lu elements", - nr_rehashes, nr_elements, 1UL << b->t->bits); -out: - mutex_unlock(&b->lock); - - return ret; -} - -void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) -{ - struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; - - kvfree(b->t); -} - -#define INITIAL_TABLE_BITS 3 - -int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) -{ - struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; - - mutex_init(&b->lock); - - b->t = kvmalloc(sizeof(*b->t) + - (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL); - if (!b->t) - return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init; - - bucket_table_init(b->t, INITIAL_TABLE_BITS); - return 0; -} diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h deleted file mode 100644 index 365619ca44c8..000000000000 --- a/fs/bcachefs/buckets_waiting_for_journal.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H -#define _BUCKETS_WAITING_FOR_JOURNAL_H - -#include "buckets_waiting_for_journal_types.h" - -u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *, - unsigned, u64); -int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, - u64, unsigned, u64, u64); - -void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); -int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *); - -#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h deleted file mode 100644 index e593db061d81..000000000000 --- a/fs/bcachefs/buckets_waiting_for_journal_types.h +++ /dev/null @@ -1,23 +0,0 @@ -/* 
SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H -#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H - -#include <linux/siphash.h> - -struct bucket_hashed { - u64 dev_bucket; - u64 journal_seq; -}; - -struct buckets_waiting_for_journal_table { - unsigned bits; - u64 hash_seeds[3]; - struct bucket_hashed d[]; -}; - -struct buckets_waiting_for_journal { - struct mutex lock; - struct buckets_waiting_for_journal_table *t; -}; - -#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c deleted file mode 100644 index 5ea89aa2b0c4..000000000000 --- a/fs/bcachefs/chardev.c +++ /dev/null @@ -1,843 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_CHARDEV - -#include "bcachefs.h" -#include "bcachefs_ioctl.h" -#include "buckets.h" -#include "chardev.h" -#include "disk_accounting.h" -#include "fsck.h" -#include "journal.h" -#include "move.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-counters.h" -#include "super-io.h" -#include "thread_with_file.h" - -#include <linux/cdev.h> -#include <linux/device.h> -#include <linux/fs.h> -#include <linux/ioctl.h> -#include <linux/major.h> -#include <linux/sched/task.h> -#include <linux/slab.h> -#include <linux/uaccess.h> - -/* returns with ref on ca->ref */ -static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, - unsigned flags) -{ - struct bch_dev *ca; - - if (flags & BCH_BY_INDEX) { - if (dev >= c->sb.nr_devices) - return ERR_PTR(-EINVAL); - - ca = bch2_dev_tryget_noerror(c, dev); - if (!ca) - return ERR_PTR(-EINVAL); - } else { - char *path; - - path = strndup_user((const char __user *) - (unsigned long) dev, PATH_MAX); - if (IS_ERR(path)) - return ERR_CAST(path); - - ca = bch2_dev_lookup(c, path); - kfree(path); - } - - return ca; -} - -#if 0 -static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) -{ - struct bch_ioctl_assemble arg; - struct bch_fs *c; - u64 *user_devs = NULL; - char **devs = NULL; - unsigned i; - int ret = -EFAULT; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags || arg.pad) - return -EINVAL; - - user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); - if (!user_devs) - return -ENOMEM; - - devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); - - if (copy_from_user(user_devs, user_arg->devs, - sizeof(u64) * arg.nr_devs)) - goto err; - - for (i = 0; i < arg.nr_devs; i++) { - devs[i] = strndup_user((const char __user *)(unsigned long) - user_devs[i], - PATH_MAX); - ret= PTR_ERR_OR_ZERO(devs[i]); - if (ret) - goto err; - } - - c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); - ret = PTR_ERR_OR_ZERO(c); - if (!ret) - closure_put(&c->cl); -err: - if (devs) - for (i = 0; i < arg.nr_devs; i++) - kfree(devs[i]); - kfree(devs); - return ret; -} - -static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) -{ - struct bch_ioctl_incremental arg; - const char *err; - char *path; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags || arg.pad) - return -EINVAL; - - path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - ret = PTR_ERR_OR_ZERO(path); - if (ret) - return ret; - - err = bch2_fs_open_incremental(path); - kfree(path); - - if (err) { - pr_err("Could not register bcachefs devices: %s", err); - return -EINVAL; - } - - return 0; -} -#endif - -static long bch2_global_ioctl(unsigned cmd, void __user *arg) -{ - long ret; - - switch (cmd) { -#if 0 - case 
BCH_IOCTL_ASSEMBLE: - return bch2_ioctl_assemble(arg); - case BCH_IOCTL_INCREMENTAL: - return bch2_ioctl_incremental(arg); -#endif - case BCH_IOCTL_FSCK_OFFLINE: { - ret = bch2_ioctl_fsck_offline(arg); - break; - } - default: - ret = -ENOTTY; - break; - } - - if (ret < 0) - ret = bch2_err_class(ret); - return ret; -} - -static long bch2_ioctl_query_uuid(struct bch_fs *c, - struct bch_ioctl_query_uuid __user *user_arg) -{ - return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid, - sizeof(c->sb.user_uuid)); -} - -#if 0 -static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (arg.flags || arg.pad) - return -EINVAL; - - return bch2_fs_start(c); -} - -static long bch2_ioctl_stop(struct bch_fs *c) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - bch2_fs_stop(c); - return 0; -} -#endif - -static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) -{ - char *path; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (arg.flags || arg.pad) - return -EINVAL; - - path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - ret = PTR_ERR_OR_ZERO(path); - if (ret) - return ret; - - ret = bch2_dev_add(c, path); - if (!IS_ERR(path)) - kfree(path); - - return ret; -} - -static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) -{ - struct bch_dev *ca; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| - BCH_FORCE_IF_METADATA_LOST| - BCH_FORCE_IF_DEGRADED| - BCH_BY_INDEX)) || - arg.pad) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - return bch2_dev_remove(c, ca, arg.flags); -} - -static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) -{ - char *path; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (arg.flags || arg.pad) - return -EINVAL; - - path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - ret = PTR_ERR_OR_ZERO(path); - if (ret) - return ret; - - ret = bch2_dev_online(c, path); - kfree(path); - return ret; -} - -static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) -{ - struct bch_dev *ca; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| - BCH_FORCE_IF_METADATA_LOST| - BCH_FORCE_IF_DEGRADED| - BCH_BY_INDEX)) || - arg.pad) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch2_dev_offline(c, ca, arg.flags); - bch2_dev_put(ca); - return ret; -} - -static long bch2_ioctl_disk_set_state(struct bch_fs *c, - struct bch_ioctl_disk_set_state arg) -{ - struct bch_dev *ca; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| - BCH_FORCE_IF_METADATA_LOST| - BCH_FORCE_IF_DEGRADED| - BCH_BY_INDEX)) || - arg.pad[0] || arg.pad[1] || arg.pad[2] || - arg.new_state >= BCH_MEMBER_STATE_NR) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); - if (ret) - bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); - - bch2_dev_put(ca); - return ret; -} - -struct bch_data_ctx { - struct thread_with_file thr; - - struct bch_fs *c; - struct bch_ioctl_data arg; - struct bch_move_stats stats; -}; - -static int bch2_data_thread(void *arg) -{ - struct 
bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); - - ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); - if (ctx->thr.ret == -BCH_ERR_device_offline) - ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; - else { - ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; - ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; - } - enumerated_ref_put(&ctx->c->writes, BCH_WRITE_REF_ioctl_data); - return 0; -} - -static int bch2_data_job_release(struct inode *inode, struct file *file) -{ - struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - - bch2_thread_with_file_exit(&ctx->thr); - kfree(ctx); - return 0; -} - -static ssize_t bch2_data_job_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - struct bch_fs *c = ctx->c; - struct bch_ioctl_data_event e = { - .type = BCH_DATA_EVENT_PROGRESS, - .ret = ctx->stats.ret, - .p.data_type = ctx->stats.data_type, - .p.btree_id = ctx->stats.pos.btree, - .p.pos = ctx->stats.pos.pos, - .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), - .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected), - .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected), - }; - - if (ctx->arg.op == BCH_DATA_OP_scrub) { - struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); - if (ca) { - struct bch_dev_usage_full u; - bch2_dev_usage_full_read_fast(ca, &u); - for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) - if (ctx->arg.scrub.data_types & BIT(i)) - e.p.sectors_total += u.d[i].sectors; - bch2_dev_put(ca); - } - } else { - e.p.sectors_total = bch2_fs_usage_read_short(c).used; - } - - if (len < sizeof(e)) - return -EINVAL; - - return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e); -} - -static const struct file_operations bcachefs_data_ops = { - .release = bch2_data_job_release, - .read = bch2_data_job_read, -}; - -static long bch2_ioctl_data(struct bch_fs *c, - struct bch_ioctl_data arg) -{ - struct bch_data_ctx *ctx; - int ret; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_ioctl_data)) - return -EROFS; - - if (!capable(CAP_SYS_ADMIN)) { - ret = -EPERM; - goto put_ref; - } - - if (arg.op >= BCH_DATA_OP_NR || arg.flags) { - ret = -EINVAL; - goto put_ref; - } - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) { - ret = -ENOMEM; - goto put_ref; - } - - ctx->c = c; - ctx->arg = arg; - - ret = bch2_run_thread_with_file(&ctx->thr, - &bcachefs_data_ops, - bch2_data_thread); - if (ret < 0) - goto cleanup; - return ret; -cleanup: - kfree(ctx); -put_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_ioctl_data); - return ret; -} - -static noinline_for_stack long bch2_ioctl_fs_usage(struct bch_fs *c, - struct bch_ioctl_fs_usage __user *user_arg) -{ - struct bch_ioctl_fs_usage arg = {}; - darray_char replicas = {}; - u32 replica_entries_bytes; - int ret = 0; - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) - return -EFAULT; - - ret = bch2_fs_replicas_usage_read(c, &replicas) ?: - (replica_entries_bytes < replicas.nr ? 
-ERANGE : 0) ?: - copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr); - if (ret) - goto err; - - struct bch_fs_usage_short u = bch2_fs_usage_read_short(c); - arg.capacity = c->capacity; - arg.used = u.used; - arg.online_reserved = percpu_u64_get(c->online_reserved); - arg.replica_entries_bytes = replicas.nr; - - for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { - struct disk_accounting_pos k; - disk_accounting_key_init(k, persistent_reserved, .nr_replicas = i); - - bch2_accounting_mem_read(c, - disk_accounting_pos_to_bpos(&k), - &arg.persistent_reserved[i], 1); - } - - ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); -err: - darray_exit(&replicas); - return ret; -} - -static long bch2_ioctl_query_accounting(struct bch_fs *c, - struct bch_ioctl_query_accounting __user *user_arg) -{ - struct bch_ioctl_query_accounting arg; - darray_char accounting = {}; - int ret = 0; - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?: - bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?: - (arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?: - copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr); - if (ret) - goto err; - - arg.capacity = c->capacity; - arg.used = bch2_fs_usage_read_short(c).used; - arg.online_reserved = percpu_u64_get(c->online_reserved); - arg.accounting_u64s = accounting.nr / sizeof(u64); - - ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); -err: - darray_exit(&accounting); - return ret; -} - -/* obsolete, didn't allow for new data types: */ -static noinline_for_stack long bch2_ioctl_dev_usage(struct bch_fs *c, - struct bch_ioctl_dev_usage __user *user_arg) -{ - struct bch_ioctl_dev_usage arg; - struct bch_dev_usage_full src; - struct bch_dev *ca; - unsigned i; - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if ((arg.flags & ~BCH_BY_INDEX) || - arg.pad[0] || - arg.pad[1] || - arg.pad[2]) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - src = bch2_dev_usage_full_read(ca); - - arg.state = ca->mi.state; - arg.bucket_size = ca->mi.bucket_size; - arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - - for (i = 0; i < ARRAY_SIZE(arg.d); i++) { - arg.d[i].buckets = src.d[i].buckets; - arg.d[i].sectors = src.d[i].sectors; - arg.d[i].fragmented = src.d[i].fragmented; - } - - bch2_dev_put(ca); - - return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); -} - -static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, - struct bch_ioctl_dev_usage_v2 __user *user_arg) -{ - struct bch_ioctl_dev_usage_v2 arg; - struct bch_dev_usage_full src; - struct bch_dev *ca; - int ret = 0; - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if ((arg.flags & ~BCH_BY_INDEX) || - arg.pad[0] || - arg.pad[1] || - arg.pad[2]) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - src = bch2_dev_usage_full_read(ca); - - arg.state = ca->mi.state; - arg.bucket_size = ca->mi.bucket_size; - arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR); - arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - - ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); - if (ret) - goto err; - - for (unsigned i = 0; i < arg.nr_data_types; i++) { - struct 
bch_ioctl_dev_usage_type t = { - .buckets = src.d[i].buckets, - .sectors = src.d[i].sectors, - .fragmented = src.d[i].fragmented, - }; - - ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t)); - if (ret) - goto err; - } -err: - bch2_dev_put(ca); - return ret; -} - -static long bch2_ioctl_read_super(struct bch_fs *c, - struct bch_ioctl_read_super arg) -{ - struct bch_dev *ca = NULL; - struct bch_sb *sb; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || - arg.pad) - return -EINVAL; - - mutex_lock(&c->sb_lock); - - if (arg.flags & BCH_READ_DEV) { - ca = bch2_device_lookup(c, arg.dev, arg.flags); - ret = PTR_ERR_OR_ZERO(ca); - if (ret) - goto err_unlock; - - sb = ca->disk_sb.sb; - } else { - sb = c->disk_sb.sb; - } - - if (vstruct_bytes(sb) > arg.size) { - ret = -ERANGE; - goto err; - } - - ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb, - vstruct_bytes(sb)); -err: - bch2_dev_put(ca); -err_unlock: - mutex_unlock(&c->sb_lock); - return ret; -} - -static long bch2_ioctl_disk_get_idx(struct bch_fs *c, - struct bch_ioctl_disk_get_idx arg) -{ - dev_t dev = huge_decode_dev(arg.dev); - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!dev) - return -EINVAL; - - guard(rcu)(); - for_each_online_member_rcu(c, ca) - if (ca->dev == dev) - return ca->dev_idx; - - return bch_err_throw(c, ENOENT_dev_idx_not_found); -} - -static long bch2_ioctl_disk_resize(struct bch_fs *c, - struct bch_ioctl_disk_resize arg) -{ - struct bch_dev *ca; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~BCH_BY_INDEX) || - arg.pad) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch2_dev_resize(c, ca, arg.nbuckets); - - bch2_dev_put(ca); - return ret; -} - -static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, - struct bch_ioctl_disk_resize_journal arg) -{ - struct bch_dev *ca; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~BCH_BY_INDEX) || - arg.pad) - return -EINVAL; - - if (arg.nbuckets > U32_MAX) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); - - bch2_dev_put(ca); - return ret; -} - -#define BCH_IOCTL(_name, _argtype) \ -do { \ - _argtype i; \ - \ - if (copy_from_user(&i, arg, sizeof(i))) \ - return -EFAULT; \ - ret = bch2_ioctl_##_name(c, i); \ - goto out; \ -} while (0) - -long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) -{ - long ret; - - switch (cmd) { - case BCH_IOCTL_QUERY_UUID: - return bch2_ioctl_query_uuid(c, arg); - case BCH_IOCTL_FS_USAGE: - return bch2_ioctl_fs_usage(c, arg); - case BCH_IOCTL_DEV_USAGE: - return bch2_ioctl_dev_usage(c, arg); - case BCH_IOCTL_DEV_USAGE_V2: - return bch2_ioctl_dev_usage_v2(c, arg); -#if 0 - case BCH_IOCTL_START: - BCH_IOCTL(start, struct bch_ioctl_start); - case BCH_IOCTL_STOP: - return bch2_ioctl_stop(c); -#endif - case BCH_IOCTL_READ_SUPER: - BCH_IOCTL(read_super, struct bch_ioctl_read_super); - case BCH_IOCTL_DISK_GET_IDX: - BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); - } - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - switch (cmd) { - case BCH_IOCTL_DISK_ADD: - BCH_IOCTL(disk_add, struct bch_ioctl_disk); - case BCH_IOCTL_DISK_REMOVE: - BCH_IOCTL(disk_remove, struct bch_ioctl_disk); - case BCH_IOCTL_DISK_ONLINE: - BCH_IOCTL(disk_online, struct bch_ioctl_disk); - 
case BCH_IOCTL_DISK_OFFLINE: - BCH_IOCTL(disk_offline, struct bch_ioctl_disk); - case BCH_IOCTL_DISK_SET_STATE: - BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); - case BCH_IOCTL_DATA: - BCH_IOCTL(data, struct bch_ioctl_data); - case BCH_IOCTL_DISK_RESIZE: - BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); - case BCH_IOCTL_DISK_RESIZE_JOURNAL: - BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); - case BCH_IOCTL_FSCK_ONLINE: - BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); - case BCH_IOCTL_QUERY_ACCOUNTING: - return bch2_ioctl_query_accounting(c, arg); - case BCH_IOCTL_QUERY_COUNTERS: - return bch2_ioctl_query_counters(c, arg); - default: - return -ENOTTY; - } -out: - if (ret < 0) - ret = bch2_err_class(ret); - return ret; -} - -static DEFINE_IDR(bch_chardev_minor); - -static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) -{ - unsigned minor = iminor(file_inode(filp)); - struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; - void __user *arg = (void __user *) v; - - return c - ? bch2_fs_ioctl(c, cmd, arg) - : bch2_global_ioctl(cmd, arg); -} - -static const struct file_operations bch_chardev_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = bch2_chardev_ioctl, - .open = nonseekable_open, -}; - -static int bch_chardev_major; -static const struct class bch_chardev_class = { - .name = "bcachefs", -}; -static struct device *bch_chardev; - -void bch2_fs_chardev_exit(struct bch_fs *c) -{ - if (!IS_ERR_OR_NULL(c->chardev)) - device_unregister(c->chardev); - if (c->minor >= 0) - idr_remove(&bch_chardev_minor, c->minor); -} - -int bch2_fs_chardev_init(struct bch_fs *c) -{ - c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); - if (c->minor < 0) - return c->minor; - - c->chardev = device_create(&bch_chardev_class, NULL, - MKDEV(bch_chardev_major, c->minor), c, - "bcachefs%u-ctl", c->minor); - if (IS_ERR(c->chardev)) - return PTR_ERR(c->chardev); - - return 0; -} - -void bch2_chardev_exit(void) -{ - device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX)); - class_unregister(&bch_chardev_class); - if (bch_chardev_major > 0) - unregister_chrdev(bch_chardev_major, "bcachefs"); -} - -int __init bch2_chardev_init(void) -{ - int ret; - - bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); - if (bch_chardev_major < 0) - return bch_chardev_major; - - ret = class_register(&bch_chardev_class); - if (ret) - goto major_out; - - bch_chardev = device_create(&bch_chardev_class, NULL, - MKDEV(bch_chardev_major, U8_MAX), - NULL, "bcachefs-ctl"); - if (IS_ERR(bch_chardev)) { - ret = PTR_ERR(bch_chardev); - goto class_out; - } - - return 0; - -class_out: - class_unregister(&bch_chardev_class); -major_out: - unregister_chrdev(bch_chardev_major, "bcachefs-ctl"); - return ret; -} - -#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h deleted file mode 100644 index 0f563ca53c36..000000000000 --- a/fs/bcachefs/chardev.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_CHARDEV_H -#define _BCACHEFS_CHARDEV_H - -#ifndef NO_BCACHEFS_FS - -long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); - -void bch2_fs_chardev_exit(struct bch_fs *); -int bch2_fs_chardev_init(struct bch_fs *); - -void bch2_chardev_exit(void); -int __init bch2_chardev_init(void); - -#else - -static inline long bch2_fs_ioctl(struct bch_fs *c, - unsigned cmd, void __user * arg) -{ - return -ENOTTY; -} - -static inline void 
bch2_fs_chardev_exit(struct bch_fs *c) {} -static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } - -static inline void bch2_chardev_exit(void) {} -static inline int __init bch2_chardev_init(void) { return 0; } - -#endif /* NO_BCACHEFS_FS */ - -#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c deleted file mode 100644 index a6795e73f0b9..000000000000 --- a/fs/bcachefs/checksum.c +++ /dev/null @@ -1,698 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "checksum.h" -#include "errcode.h" -#include "error.h" -#include "super.h" -#include "super-io.h" - -#include <linux/crc32c.h> -#include <linux/xxhash.h> -#include <linux/key.h> -#include <linux/random.h> -#include <linux/ratelimit.h> -#include <crypto/chacha.h> -#include <crypto/poly1305.h> -#include <keys/user-type.h> - -/* - * bch2_checksum state is an abstraction of the checksum state calculated over different pages. - * it features page merging without having the checksum algorithm lose its state. - * for native checksum algorithms (like crc), a default seed value will do. - * for hash-like algorithms, a state needs to be stored - */ - -struct bch2_checksum_state { - union { - u64 seed; - struct xxh64_state h64state; - }; - unsigned int type; -}; - -static void bch2_checksum_init(struct bch2_checksum_state *state) -{ - switch (state->type) { - case BCH_CSUM_none: - case BCH_CSUM_crc32c: - case BCH_CSUM_crc64: - state->seed = 0; - break; - case BCH_CSUM_crc32c_nonzero: - state->seed = U32_MAX; - break; - case BCH_CSUM_crc64_nonzero: - state->seed = U64_MAX; - break; - case BCH_CSUM_xxhash: - xxh64_reset(&state->h64state, 0); - break; - default: - BUG(); - } -} - -static u64 bch2_checksum_final(const struct bch2_checksum_state *state) -{ - switch (state->type) { - case BCH_CSUM_none: - case BCH_CSUM_crc32c: - case BCH_CSUM_crc64: - return state->seed; - case BCH_CSUM_crc32c_nonzero: - return state->seed ^ U32_MAX; - case BCH_CSUM_crc64_nonzero: - return state->seed ^ U64_MAX; - case BCH_CSUM_xxhash: - return xxh64_digest(&state->h64state); - default: - BUG(); - } -} - -static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) -{ - switch (state->type) { - case BCH_CSUM_none: - return; - case BCH_CSUM_crc32c_nonzero: - case BCH_CSUM_crc32c: - state->seed = crc32c(state->seed, data, len); - break; - case BCH_CSUM_crc64_nonzero: - case BCH_CSUM_crc64: - state->seed = crc64_be(state->seed, data, len); - break; - case BCH_CSUM_xxhash: - xxh64_update(&state->h64state, data, len); - break; - default: - BUG(); - } -} - -static void bch2_chacha20_init(struct chacha_state *state, - const struct bch_key *key, struct nonce nonce) -{ - u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)]; - - BUILD_BUG_ON(sizeof(key_words) != sizeof(*key)); - memcpy(key_words, key, sizeof(key_words)); - le32_to_cpu_array(key_words, ARRAY_SIZE(key_words)); - - BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE); - chacha_init(state, key_words, (const u8 *)nonce.d); - - memzero_explicit(key_words, sizeof(key_words)); -} - -void bch2_chacha20(const struct bch_key *key, struct nonce nonce, - void *data, size_t len) -{ - struct chacha_state state; - - bch2_chacha20_init(&state, key, nonce); - chacha20_crypt(&state, data, data, len); - chacha_zeroize_state(&state); -} - -static void bch2_poly1305_init(struct poly1305_desc_ctx *desc, - struct bch_fs *c, struct nonce nonce) -{ - u8 key[POLY1305_KEY_SIZE] = { 0 }; - - nonce.d[3] ^= BCH_NONCE_POLY; - 
bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key)); - poly1305_init(desc, key); -} - -struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, - struct nonce nonce, const void *data, size_t len) -{ - switch (type) { - case BCH_CSUM_none: - case BCH_CSUM_crc32c_nonzero: - case BCH_CSUM_crc64_nonzero: - case BCH_CSUM_crc32c: - case BCH_CSUM_xxhash: - case BCH_CSUM_crc64: { - struct bch2_checksum_state state; - - state.type = type; - - bch2_checksum_init(&state); - bch2_checksum_update(&state, data, len); - - return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; - } - - case BCH_CSUM_chacha20_poly1305_80: - case BCH_CSUM_chacha20_poly1305_128: { - struct poly1305_desc_ctx dctx; - u8 digest[POLY1305_DIGEST_SIZE]; - struct bch_csum ret = { 0 }; - - bch2_poly1305_init(&dctx, c, nonce); - poly1305_update(&dctx, data, len); - poly1305_final(&dctx, digest); - - memcpy(&ret, digest, bch_crc_bytes[type]); - return ret; - } - default: - return (struct bch_csum) {}; - } -} - -int bch2_encrypt(struct bch_fs *c, unsigned type, - struct nonce nonce, void *data, size_t len) -{ - if (!bch2_csum_type_is_encryption(type)) - return 0; - - if (bch2_fs_inconsistent_on(!c->chacha20_key_set, - c, "attempting to encrypt without encryption key")) - return bch_err_throw(c, no_encryption_key); - - bch2_chacha20(&c->chacha20_key, nonce, data, len); - return 0; -} - -static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio, - struct bvec_iter *iter) -{ - struct bio_vec bv; - - switch (type) { - case BCH_CSUM_none: - return (struct bch_csum) { 0 }; - case BCH_CSUM_crc32c_nonzero: - case BCH_CSUM_crc64_nonzero: - case BCH_CSUM_crc32c: - case BCH_CSUM_xxhash: - case BCH_CSUM_crc64: { - struct bch2_checksum_state state; - - state.type = type; - bch2_checksum_init(&state); - -#ifdef CONFIG_HIGHMEM - __bio_for_each_segment(bv, bio, *iter, *iter) { - void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; - - bch2_checksum_update(&state, p, bv.bv_len); - kunmap_local(p); - } -#else - __bio_for_each_bvec(bv, bio, *iter, *iter) - bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset, - bv.bv_len); -#endif - return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; - } - - case BCH_CSUM_chacha20_poly1305_80: - case BCH_CSUM_chacha20_poly1305_128: { - struct poly1305_desc_ctx dctx; - u8 digest[POLY1305_DIGEST_SIZE]; - struct bch_csum ret = { 0 }; - - bch2_poly1305_init(&dctx, c, nonce); - -#ifdef CONFIG_HIGHMEM - __bio_for_each_segment(bv, bio, *iter, *iter) { - void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; - - poly1305_update(&dctx, p, bv.bv_len); - kunmap_local(p); - } -#else - __bio_for_each_bvec(bv, bio, *iter, *iter) - poly1305_update(&dctx, - page_address(bv.bv_page) + bv.bv_offset, - bv.bv_len); -#endif - poly1305_final(&dctx, digest); - - memcpy(&ret, digest, bch_crc_bytes[type]); - return ret; - } - default: - return (struct bch_csum) {}; - } -} - -struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) -{ - struct bvec_iter iter = bio->bi_iter; - - return __bch2_checksum_bio(c, type, nonce, bio, &iter); -} - -int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) -{ - struct bio_vec bv; - struct bvec_iter iter; - struct chacha_state chacha_state; - int ret = 0; - - if (bch2_fs_inconsistent_on(!c->chacha20_key_set, - c, "attempting to encrypt without encryption key")) - return bch_err_throw(c, 
no_encryption_key); - - bch2_chacha20_init(&chacha_state, &c->chacha20_key, nonce); - - bio_for_each_segment(bv, bio, iter) { - void *p; - - /* - * chacha_crypt() assumes that the length is a multiple of - * CHACHA_BLOCK_SIZE on any non-final call. - */ - if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) { - bch_err_ratelimited(c, "bio not aligned for encryption"); - ret = -EIO; - break; - } - - p = bvec_kmap_local(&bv); - chacha20_crypt(&chacha_state, p, p, bv.bv_len); - kunmap_local(p); - } - chacha_zeroize_state(&chacha_state); - return ret; -} - -struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, - struct bch_csum b, size_t b_len) -{ - struct bch2_checksum_state state; - - state.type = type; - bch2_checksum_init(&state); - state.seed = le64_to_cpu(a.lo); - - BUG_ON(!bch2_checksum_mergeable(type)); - - while (b_len) { - unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE); - - bch2_checksum_update(&state, - page_address(ZERO_PAGE(0)), page_len); - b_len -= page_len; - } - a.lo = cpu_to_le64(bch2_checksum_final(&state)); - a.lo ^= b.lo; - a.hi ^= b.hi; - return a; -} - -int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, - struct bversion version, - struct bch_extent_crc_unpacked crc_old, - struct bch_extent_crc_unpacked *crc_a, - struct bch_extent_crc_unpacked *crc_b, - unsigned len_a, unsigned len_b, - unsigned new_csum_type) -{ - struct bvec_iter iter = bio->bi_iter; - struct nonce nonce = extent_nonce(version, crc_old); - struct bch_csum merged = { 0 }; - struct crc_split { - struct bch_extent_crc_unpacked *crc; - unsigned len; - unsigned csum_type; - struct bch_csum csum; - } splits[3] = { - { crc_a, len_a, new_csum_type, { 0 }}, - { crc_b, len_b, new_csum_type, { 0 } }, - { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } }, - }, *i; - bool mergeable = crc_old.csum_type == new_csum_type && - bch2_checksum_mergeable(new_csum_type); - unsigned crc_nonce = crc_old.nonce; - - BUG_ON(len_a + len_b > bio_sectors(bio)); - BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); - BUG_ON(crc_is_compressed(crc_old)); - BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != - bch2_csum_type_is_encryption(new_csum_type)); - - for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { - iter.bi_size = i->len << 9; - if (mergeable || i->crc) - i->csum = __bch2_checksum_bio(c, i->csum_type, - nonce, bio, &iter); - else - bio_advance_iter(bio, &iter, i->len << 9); - nonce = nonce_add(nonce, i->len << 9); - } - - if (mergeable) - for (i = splits; i < splits + ARRAY_SIZE(splits); i++) - merged = bch2_checksum_merge(new_csum_type, merged, - i->csum, i->len << 9); - else - merged = bch2_checksum_bio(c, crc_old.csum_type, - extent_nonce(version, crc_old), bio); - - if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { - struct printbuf buf = PRINTBUF; - prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n" - " expected %0llx:%0llx got %0llx:%0llx (old type ", - __func__, - crc_old.csum.hi, - crc_old.csum.lo, - merged.hi, - merged.lo); - bch2_prt_csum_type(&buf, crc_old.csum_type); - prt_str(&buf, " new type "); - bch2_prt_csum_type(&buf, new_csum_type); - prt_str(&buf, ")"); - WARN_RATELIMIT(1, "%s", buf.buf); - printbuf_exit(&buf); - return bch_err_throw(c, recompute_checksum); - } - - for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { - if (i->crc) - *i->crc = (struct bch_extent_crc_unpacked) { - .csum_type = i->csum_type, - .compression_type = crc_old.compression_type, - .compressed_size = i->len, - .uncompressed_size = i->len, - 
.offset = 0, - .live_size = i->len, - .nonce = crc_nonce, - .csum = i->csum, - }; - - if (bch2_csum_type_is_encryption(new_csum_type)) - crc_nonce += i->len; - } - - return 0; -} - -/* BCH_SB_FIELD_crypt: */ - -static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - - if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&crypt->field), sizeof(*crypt)); - return -BCH_ERR_invalid_sb_crypt; - } - - if (BCH_CRYPT_KDF_TYPE(crypt)) { - prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); - return -BCH_ERR_invalid_sb_crypt; - } - - return 0; -} - -static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - - prt_printf(out, "KDF: %llu\n", BCH_CRYPT_KDF_TYPE(crypt)); - prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt)); - prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt)); - prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt)); -} - -const struct bch_sb_field_ops bch_sb_field_ops_crypt = { - .validate = bch2_sb_crypt_validate, - .to_text = bch2_sb_crypt_to_text, -}; - -#ifdef __KERNEL__ -static int __bch2_request_key(char *key_description, struct bch_key *key) -{ - struct key *keyring_key; - const struct user_key_payload *ukp; - int ret; - - keyring_key = request_key(&key_type_user, key_description, NULL); - if (IS_ERR(keyring_key)) - return PTR_ERR(keyring_key); - - down_read(&keyring_key->sem); - ukp = dereference_key_locked(keyring_key); - if (ukp->datalen == sizeof(*key)) { - memcpy(key, ukp->data, ukp->datalen); - ret = 0; - } else { - ret = -EINVAL; - } - up_read(&keyring_key->sem); - key_put(keyring_key); - - return ret; -} -#else -#include <keyutils.h> - -static int __bch2_request_key(char *key_description, struct bch_key *key) -{ - key_serial_t key_id; - - key_id = request_key("user", key_description, NULL, - KEY_SPEC_SESSION_KEYRING); - if (key_id >= 0) - goto got_key; - - key_id = request_key("user", key_description, NULL, - KEY_SPEC_USER_KEYRING); - if (key_id >= 0) - goto got_key; - - key_id = request_key("user", key_description, NULL, - KEY_SPEC_USER_SESSION_KEYRING); - if (key_id >= 0) - goto got_key; - - return -errno; -got_key: - - if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) - return -1; - - return 0; -} - -#include "crypto.h" -#endif - -int bch2_request_key(struct bch_sb *sb, struct bch_key *key) -{ - struct printbuf key_description = PRINTBUF; - int ret; - - prt_printf(&key_description, "bcachefs:"); - pr_uuid(&key_description, sb->user_uuid.b); - - ret = __bch2_request_key(key_description.buf, key); - printbuf_exit(&key_description); - -#ifndef __KERNEL__ - if (ret) { - char *passphrase = read_passphrase("Enter passphrase: "); - struct bch_encrypted_key sb_key; - - bch2_passphrase_check(sb, passphrase, - key, &sb_key); - ret = 0; - } -#endif - - /* stash with memfd, pass memfd fd to mount */ - - return ret; -} - -#ifndef __KERNEL__ -int bch2_revoke_key(struct bch_sb *sb) -{ - key_serial_t key_id; - struct printbuf key_description = PRINTBUF; - - prt_printf(&key_description, "bcachefs:"); - pr_uuid(&key_description, sb->user_uuid.b); - - key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING); - printbuf_exit(&key_description); - if (key_id < 0) - return errno; - - 
keyctl_revoke(key_id); - - return 0; -} -#endif - -int bch2_decrypt_sb_key(struct bch_fs *c, - struct bch_sb_field_crypt *crypt, - struct bch_key *key) -{ - struct bch_encrypted_key sb_key = crypt->key; - struct bch_key user_key; - int ret = 0; - - /* is key encrypted? */ - if (!bch2_key_is_encrypted(&sb_key)) - goto out; - - ret = bch2_request_key(c->disk_sb.sb, &user_key); - if (ret) { - bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); - goto err; - } - - /* decrypt real key: */ - bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key)); - - if (bch2_key_is_encrypted(&sb_key)) { - bch_err(c, "incorrect encryption key"); - ret = -EINVAL; - goto err; - } -out: - *key = sb_key.key; -err: - memzero_explicit(&sb_key, sizeof(sb_key)); - memzero_explicit(&user_key, sizeof(user_key)); - return ret; -} - -#if 0 - -/* - * This seems to be duplicating code in cmd_remove_passphrase() in - * bcachefs-tools, but we might want to switch userspace to use this - and - * perhaps add an ioctl for calling this at runtime, so we can take the - * passphrase off of a mounted filesystem (which has come up). - */ -int bch2_disable_encryption(struct bch_fs *c) -{ - struct bch_sb_field_crypt *crypt; - struct bch_key key; - int ret = -EINVAL; - - mutex_lock(&c->sb_lock); - - crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); - if (!crypt) - goto out; - - /* is key encrypted? */ - ret = 0; - if (bch2_key_is_encrypted(&crypt->key)) - goto out; - - ret = bch2_decrypt_sb_key(c, crypt, &key); - if (ret) - goto out; - - crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC); - crypt->key.key = key; - - SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); - bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); - - return ret; -} - -/* - * For enabling encryption on an existing filesystem: not hooked up yet, but it - * should be - */ -int bch2_enable_encryption(struct bch_fs *c, bool keyed) -{ - struct bch_encrypted_key key; - struct bch_key user_key; - struct bch_sb_field_crypt *crypt; - int ret = -EINVAL; - - mutex_lock(&c->sb_lock); - - /* Do we already have an encryption key? 
*/ - if (bch2_sb_field_get(c->disk_sb.sb, crypt)) - goto err; - - ret = bch2_alloc_ciphers(c); - if (ret) - goto err; - - key.magic = cpu_to_le64(BCH_KEY_MAGIC); - get_random_bytes(&key.key, sizeof(key.key)); - - if (keyed) { - ret = bch2_request_key(c->disk_sb.sb, &user_key); - if (ret) { - bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); - goto err; - } - - ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), - &key, sizeof(key)); - if (ret) - goto err; - } - - ret = crypto_skcipher_setkey(&c->chacha20->base, - (void *) &key.key, sizeof(key.key)); - if (ret) - goto err; - - crypt = bch2_sb_field_resize(&c->disk_sb, crypt, - sizeof(*crypt) / sizeof(u64)); - if (!crypt) { - ret = bch_err_throw(c, ENOSPC_sb_crypt); - goto err; - } - - crypt->key = key; - - /* write superblock */ - SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); - bch2_write_super(c); -err: - mutex_unlock(&c->sb_lock); - memzero_explicit(&user_key, sizeof(user_key)); - memzero_explicit(&key, sizeof(key)); - return ret; -} -#endif - -void bch2_fs_encryption_exit(struct bch_fs *c) -{ - memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key)); -} - -int bch2_fs_encryption_init(struct bch_fs *c) -{ - struct bch_sb_field_crypt *crypt; - int ret; - - crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); - if (!crypt) - return 0; - - ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key); - if (ret) - return ret; - c->chacha20_key_set = true; - return 0; -} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h deleted file mode 100644 index 7bd9cf6104ca..000000000000 --- a/fs/bcachefs/checksum.h +++ /dev/null @@ -1,240 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_CHECKSUM_H -#define _BCACHEFS_CHECKSUM_H - -#include "bcachefs.h" -#include "extents_types.h" -#include "super-io.h" - -#include <linux/crc64.h> -#include <crypto/chacha.h> - -static inline bool bch2_checksum_mergeable(unsigned type) -{ - - switch (type) { - case BCH_CSUM_none: - case BCH_CSUM_crc32c: - case BCH_CSUM_crc64: - return true; - default: - return false; - } -} - -struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, - struct bch_csum, size_t); - -#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) -#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) -#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) -#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) -#define BCH_NONCE_POLY cpu_to_le32(1 << 31) - -struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, - const void *, size_t); - -/* - * This is used for various on disk data structures - bch_sb, prio_set, bset, - * jset: The checksum is _always_ the first field of these structs - */ -#define csum_vstruct(_c, _type, _nonce, _i) \ -({ \ - const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\ - \ - bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\ -}) - -static inline void bch2_csum_to_text(struct printbuf *out, - enum bch_csum_type type, - struct bch_csum csum) -{ - const u8 *p = (u8 *) &csum; - unsigned bytes = type < BCH_CSUM_NR ? 
bch_crc_bytes[type] : 16; - - for (unsigned i = 0; i < bytes; i++) - prt_hex_byte(out, p[i]); -} - -static inline void bch2_csum_err_msg(struct printbuf *out, - enum bch_csum_type type, - struct bch_csum expected, - struct bch_csum got) -{ - prt_str(out, "checksum error, type "); - bch2_prt_csum_type(out, type); - prt_str(out, ": got "); - bch2_csum_to_text(out, type, got); - prt_str(out, " should be "); - bch2_csum_to_text(out, type, expected); -} - -void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t); - -int bch2_request_key(struct bch_sb *, struct bch_key *); -#ifndef __KERNEL__ -int bch2_revoke_key(struct bch_sb *); -#endif - -int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, - void *data, size_t); - -struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); - -int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, - struct bch_extent_crc_unpacked, - struct bch_extent_crc_unpacked *, - struct bch_extent_crc_unpacked *, - unsigned, unsigned, unsigned); - -int __bch2_encrypt_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); - -static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) -{ - return bch2_csum_type_is_encryption(type) - ? __bch2_encrypt_bio(c, type, nonce, bio) - : 0; -} - -extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; - -int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, - struct bch_key *); - -#if 0 -int bch2_disable_encryption(struct bch_fs *); -int bch2_enable_encryption(struct bch_fs *, bool); -#endif - -void bch2_fs_encryption_exit(struct bch_fs *); -int bch2_fs_encryption_init(struct bch_fs *); - -static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type, - bool data) -{ - switch (type) { - case BCH_CSUM_OPT_none: - return BCH_CSUM_none; - case BCH_CSUM_OPT_crc32c: - return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; - case BCH_CSUM_OPT_crc64: - return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; - case BCH_CSUM_OPT_xxhash: - return BCH_CSUM_xxhash; - default: - BUG(); - } -} - -static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, - struct bch_io_opts opts) -{ - if (opts.nocow) - return 0; - - if (c->sb.encryption_type) - return c->opts.wide_macs - ? BCH_CSUM_chacha20_poly1305_128 - : BCH_CSUM_chacha20_poly1305_80; - - return bch2_csum_opt_to_type(opts.data_checksum, true); -} - -static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) -{ - if (c->sb.encryption_type) - return BCH_CSUM_chacha20_poly1305_128; - - return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); -} - -static inline bool bch2_checksum_type_valid(const struct bch_fs *c, - unsigned type) -{ - if (type >= BCH_CSUM_NR) - return false; - - if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set) - return false; - - return true; -} - -/* returns true if not equal */ -static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) -{ - /* - * XXX: need some way of preventing the compiler from optimizing this - * into a form that isn't constant time.. 
- */ - return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; -} - -/* for skipping ahead and encrypting/decrypting at an offset: */ -static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) -{ - EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - - le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); - return nonce; -} - -static inline struct nonce null_nonce(void) -{ - struct nonce ret; - - memset(&ret, 0, sizeof(ret)); - return ret; -} - -static inline struct nonce extent_nonce(struct bversion version, - struct bch_extent_crc_unpacked crc) -{ - unsigned compression_type = crc_is_compressed(crc) - ? crc.compression_type - : 0; - unsigned size = compression_type ? crc.uncompressed_size : 0; - struct nonce nonce = (struct nonce) {{ - [0] = cpu_to_le32(size << 22), - [1] = cpu_to_le32(version.lo), - [2] = cpu_to_le32(version.lo >> 32), - [3] = cpu_to_le32(version.hi| - (compression_type << 24))^BCH_NONCE_EXTENT, - }}; - - return nonce_add(nonce, crc.nonce << 9); -} - -static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) -{ - return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; -} - -static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) -{ - __le64 magic = __bch2_sb_magic(sb); - - return (struct nonce) {{ - [0] = 0, - [1] = 0, - [2] = ((__le32 *) &magic)[0], - [3] = ((__le32 *) &magic)[1], - }}; -} - -static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) -{ - __le64 magic = bch2_sb_magic(c); - - return (struct nonce) {{ - [0] = 0, - [1] = 0, - [2] = ((__le32 *) &magic)[0], - [3] = ((__le32 *) &magic)[1], - }}; -} - -#endif /* _BCACHEFS_CHECKSUM_H */ diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c deleted file mode 100644 index 8e9264b5a84e..000000000000 --- a/fs/bcachefs/clock.c +++ /dev/null @@ -1,181 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "clock.h" - -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/preempt.h> - -static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args) -{ - struct io_timer **_l = (struct io_timer **)l; - struct io_timer **_r = (struct io_timer **)r; - - return (*_l)->expire < (*_r)->expire; -} - -static const struct min_heap_callbacks callbacks = { - .less = io_timer_cmp, - .swp = NULL, -}; - -void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) -{ - spin_lock(&clock->timer_lock); - - if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { - spin_unlock(&clock->timer_lock); - timer->fn(timer); - return; - } - - for (size_t i = 0; i < clock->timers.nr; i++) - if (clock->timers.data[i] == timer) - goto out; - - BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL)); -out: - spin_unlock(&clock->timer_lock); -} - -void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) -{ - spin_lock(&clock->timer_lock); - - for (size_t i = 0; i < clock->timers.nr; i++) - if (clock->timers.data[i] == timer) { - min_heap_del(&clock->timers, i, &callbacks, NULL); - break; - } - - spin_unlock(&clock->timer_lock); -} - -struct io_clock_wait { - struct io_timer io_timer; - struct task_struct *task; - int expired; -}; - -static void io_clock_wait_fn(struct io_timer *timer) -{ - struct io_clock_wait *wait = container_of(timer, - struct io_clock_wait, io_timer); - - wait->expired = 1; - wake_up_process(wait->task); -} - -void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) -{ - struct io_clock_wait wait = { - .io_timer.expire = until, - .io_timer.fn = io_clock_wait_fn, - .io_timer.fn2 = 
(void *) _RET_IP_, - .task = current, - }; - - bch2_io_timer_add(clock, &wait.io_timer); - schedule(); - bch2_io_timer_del(clock, &wait.io_timer); -} - -unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock, - u64 io_until, unsigned long cpu_timeout) -{ - bool kthread = (current->flags & PF_KTHREAD) != 0; - struct io_clock_wait wait = { - .io_timer.expire = io_until, - .io_timer.fn = io_clock_wait_fn, - .io_timer.fn2 = (void *) _RET_IP_, - .task = current, - }; - - bch2_io_timer_add(clock, &wait.io_timer); - - set_current_state(TASK_INTERRUPTIBLE); - if (!(kthread && kthread_should_stop())) { - cpu_timeout = schedule_timeout(cpu_timeout); - try_to_freeze(); - } - - __set_current_state(TASK_RUNNING); - bch2_io_timer_del(clock, &wait.io_timer); - return cpu_timeout; -} - -void bch2_kthread_io_clock_wait(struct io_clock *clock, - u64 io_until, unsigned long cpu_timeout) -{ - bool kthread = (current->flags & PF_KTHREAD) != 0; - - while (!(kthread && kthread_should_stop()) && - cpu_timeout && - atomic64_read(&clock->now) < io_until) - cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout); -} - -static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) -{ - struct io_timer *ret = NULL; - - if (clock->timers.nr && - time_after_eq64(now, clock->timers.data[0]->expire)) { - ret = *min_heap_peek(&clock->timers); - min_heap_pop(&clock->timers, &callbacks, NULL); - } - - return ret; -} - -void __bch2_increment_clock(struct io_clock *clock, u64 sectors) -{ - struct io_timer *timer; - u64 now = atomic64_add_return(sectors, &clock->now); - - spin_lock(&clock->timer_lock); - while ((timer = get_expired_timer(clock, now))) - timer->fn(timer); - spin_unlock(&clock->timer_lock); -} - -void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) -{ - out->atomic++; - spin_lock(&clock->timer_lock); - u64 now = atomic64_read(&clock->now); - - printbuf_tabstop_push(out, 40); - prt_printf(out, "current time:\t%llu\n", now); - - for (unsigned i = 0; i < clock->timers.nr; i++) - prt_printf(out, "%ps %ps:\t%llu\n", - clock->timers.data[i]->fn, - clock->timers.data[i]->fn2, - clock->timers.data[i]->expire); - spin_unlock(&clock->timer_lock); - --out->atomic; -} - -void bch2_io_clock_exit(struct io_clock *clock) -{ - free_heap(&clock->timers); - free_percpu(clock->pcpu_buf); -} - -int bch2_io_clock_init(struct io_clock *clock) -{ - atomic64_set(&clock->now, 0); - spin_lock_init(&clock->timer_lock); - - clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); - - clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); - if (!clock->pcpu_buf) - return -BCH_ERR_ENOMEM_io_clock_init; - - if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) - return -BCH_ERR_ENOMEM_io_clock_init; - - return 0; -} diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h deleted file mode 100644 index 8769be2aa21e..000000000000 --- a/fs/bcachefs/clock.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_CLOCK_H -#define _BCACHEFS_CLOCK_H - -void bch2_io_timer_add(struct io_clock *, struct io_timer *); -void bch2_io_timer_del(struct io_clock *, struct io_timer *); -unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long); -void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long); - -void __bch2_increment_clock(struct io_clock *, u64); - -static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors, - int rw) -{ - struct io_clock *clock = &c->io_clock[rw]; - - if 
(unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= - IO_CLOCK_PCPU_SECTORS)) - __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); -} - -void bch2_io_clock_schedule_timeout(struct io_clock *, u64); - -void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); - -void bch2_io_clock_exit(struct io_clock *); -int bch2_io_clock_init(struct io_clock *); - -#endif /* _BCACHEFS_CLOCK_H */ diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h deleted file mode 100644 index 37554e4514fe..000000000000 --- a/fs/bcachefs/clock_types.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_CLOCK_TYPES_H -#define _BCACHEFS_CLOCK_TYPES_H - -#include "util.h" - -#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) - -/* - * Clocks/timers in units of sectors of IO: - * - * Note - they use percpu batching, so they're only approximate. - */ - -struct io_timer; -typedef void (*io_timer_fn)(struct io_timer *); - -struct io_timer { - io_timer_fn fn; - void *fn2; - u64 expire; -}; - -/* Amount to buffer up on a percpu counter */ -#define IO_CLOCK_PCPU_SECTORS 128 - -typedef DEFINE_MIN_HEAP(struct io_timer *, io_timer_heap) io_timer_heap; - -struct io_clock { - atomic64_t now; - u16 __percpu *pcpu_buf; - unsigned max_slop; - - spinlock_t timer_lock; - io_timer_heap timers; -}; - -#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c deleted file mode 100644 index b37b1f325f0a..000000000000 --- a/fs/bcachefs/compress.c +++ /dev/null @@ -1,773 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "checksum.h" -#include "compress.h" -#include "error.h" -#include "extents.h" -#include "io_write.h" -#include "opts.h" -#include "super-io.h" - -#include <linux/lz4.h> -#include <linux/zlib.h> -#include <linux/zstd.h> - -static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type) -{ - switch (type) { - case BCH_COMPRESSION_TYPE_none: - case BCH_COMPRESSION_TYPE_incompressible: - return BCH_COMPRESSION_OPT_none; - case BCH_COMPRESSION_TYPE_lz4_old: - case BCH_COMPRESSION_TYPE_lz4: - return BCH_COMPRESSION_OPT_lz4; - case BCH_COMPRESSION_TYPE_gzip: - return BCH_COMPRESSION_OPT_gzip; - case BCH_COMPRESSION_TYPE_zstd: - return BCH_COMPRESSION_OPT_zstd; - default: - BUG(); - } -} - -/* Bounce buffer: */ -struct bbuf { - void *b; - enum { - BB_NONE, - BB_VMAP, - BB_KMALLOC, - BB_MEMPOOL, - } type; - int rw; -}; - -static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) -{ - void *b; - - BUG_ON(size > c->opts.encoded_extent_max); - - b = kmalloc(size, GFP_NOFS|__GFP_NOWARN); - if (b) - return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; - - b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS); - if (b) - return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; - - BUG(); -} - -static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) -{ - struct bio_vec bv; - struct bvec_iter iter; - void *expected_start = NULL; - - __bio_for_each_bvec(bv, bio, iter, start) { - if (expected_start && - expected_start != page_address(bv.bv_page) + bv.bv_offset) - return false; - - expected_start = page_address(bv.bv_page) + - bv.bv_offset + bv.bv_len; - } - - return true; -} - -static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, - struct bvec_iter start, int rw) -{ - struct bbuf ret; - struct bio_vec bv; - struct bvec_iter iter; - unsigned nr_pages = 0; - struct page *stack_pages[16]; - struct page 
**pages = NULL; - void *data; - - BUG_ON(start.bi_size > c->opts.encoded_extent_max); - - if (!PageHighMem(bio_iter_page(bio, start)) && - bio_phys_contig(bio, start)) - return (struct bbuf) { - .b = page_address(bio_iter_page(bio, start)) + - bio_iter_offset(bio, start), - .type = BB_NONE, .rw = rw - }; - - /* check if we can map the pages contiguously: */ - __bio_for_each_segment(bv, bio, iter, start) { - if (iter.bi_size != start.bi_size && - bv.bv_offset) - goto bounce; - - if (bv.bv_len < iter.bi_size && - bv.bv_offset + bv.bv_len < PAGE_SIZE) - goto bounce; - - nr_pages++; - } - - BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); - - pages = nr_pages > ARRAY_SIZE(stack_pages) - ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS) - : stack_pages; - if (!pages) - goto bounce; - - nr_pages = 0; - __bio_for_each_segment(bv, bio, iter, start) - pages[nr_pages++] = bv.bv_page; - - data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (pages != stack_pages) - kfree(pages); - - if (data) - return (struct bbuf) { - .b = data + bio_iter_offset(bio, start), - .type = BB_VMAP, .rw = rw - }; -bounce: - ret = __bounce_alloc(c, start.bi_size, rw); - - if (rw == READ) - memcpy_from_bio(ret.b, bio, start); - - return ret; -} - -static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) -{ - return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); -} - -static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) -{ - switch (buf.type) { - case BB_NONE: - break; - case BB_VMAP: - vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); - break; - case BB_KMALLOC: - kfree(buf.b); - break; - case BB_MEMPOOL: - mempool_free(buf.b, &c->compression_bounce[buf.rw]); - break; - } -} - -static inline void zlib_set_workspace(z_stream *strm, void *workspace) -{ -#ifdef __KERNEL__ - strm->workspace = workspace; -#endif -} - -static int __bio_uncompress(struct bch_fs *c, struct bio *src, - void *dst_data, struct bch_extent_crc_unpacked crc) -{ - struct bbuf src_data = { NULL }; - size_t src_len = src->bi_iter.bi_size; - size_t dst_len = crc.uncompressed_size << 9; - void *workspace; - int ret = 0, ret2; - - enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); - mempool_t *workspace_pool = &c->compress_workspace[opt]; - if (unlikely(!mempool_initialized(workspace_pool))) { - if (fsck_err(c, compression_type_not_marked_in_sb, - "compression type %s set but not marked in superblock", - __bch2_compression_types[crc.compression_type])) - ret = bch2_check_set_has_compressed_data(c, opt); - else - ret = bch_err_throw(c, compression_workspace_not_initialized); - if (ret) - goto err; - } - - src_data = bio_map_or_bounce(c, src, READ); - - switch (crc.compression_type) { - case BCH_COMPRESSION_TYPE_lz4_old: - case BCH_COMPRESSION_TYPE_lz4: - ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, - src_len, dst_len, dst_len); - if (ret2 != dst_len) - ret = bch_err_throw(c, decompress_lz4); - break; - case BCH_COMPRESSION_TYPE_gzip: { - z_stream strm = { - .next_in = src_data.b, - .avail_in = src_len, - .next_out = dst_data, - .avail_out = dst_len, - }; - - workspace = mempool_alloc(workspace_pool, GFP_NOFS); - - zlib_set_workspace(&strm, workspace); - zlib_inflateInit2(&strm, -MAX_WBITS); - ret2 = zlib_inflate(&strm, Z_FINISH); - - mempool_free(workspace, workspace_pool); - - if (ret2 != Z_STREAM_END) - ret = bch_err_throw(c, decompress_gzip); - break; - } - case BCH_COMPRESSION_TYPE_zstd: { - ZSTD_DCtx *ctx; - size_t real_src_len = 
le32_to_cpup(src_data.b); - - if (real_src_len > src_len - 4) { - ret = bch_err_throw(c, decompress_zstd_src_len_bad); - goto err; - } - - workspace = mempool_alloc(workspace_pool, GFP_NOFS); - ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - - ret2 = zstd_decompress_dctx(ctx, - dst_data, dst_len, - src_data.b + 4, real_src_len); - - mempool_free(workspace, workspace_pool); - - if (ret2 != dst_len) - ret = bch_err_throw(c, decompress_zstd); - break; - } - default: - BUG(); - } -err: -fsck_err: - bio_unmap_or_unbounce(c, src_data); - return ret; -} - -int bch2_bio_uncompress_inplace(struct bch_write_op *op, - struct bio *bio) -{ - struct bch_fs *c = op->c; - struct bch_extent_crc_unpacked *crc = &op->crc; - struct bbuf data = { NULL }; - size_t dst_len = crc->uncompressed_size << 9; - int ret = 0; - - /* bio must own its pages: */ - BUG_ON(!bio->bi_vcnt); - BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - - if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) { - bch2_write_op_error(op, op->pos.offset, - "extent too big to decompress (%u > %u)", - crc->uncompressed_size << 9, c->opts.encoded_extent_max); - return bch_err_throw(c, decompress_exceeded_max_encoded_extent); - } - - data = __bounce_alloc(c, dst_len, WRITE); - - ret = __bio_uncompress(c, bio, data.b, *crc); - - if (c->opts.no_data_io) - ret = 0; - - if (ret) { - bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret)); - goto err; - } - - /* - * XXX: don't have a good way to assert that the bio was allocated with - * enough space, we depend on bch2_move_extent doing the right thing - */ - bio->bi_iter.bi_size = crc->live_size << 9; - - memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); - - crc->csum_type = 0; - crc->compression_type = 0; - crc->compressed_size = crc->live_size; - crc->uncompressed_size = crc->live_size; - crc->offset = 0; - crc->csum = (struct bch_csum) { 0, 0 }; -err: - bio_unmap_or_unbounce(c, data); - return ret; -} - -int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, - struct bio *dst, struct bvec_iter dst_iter, - struct bch_extent_crc_unpacked crc) -{ - struct bbuf dst_data = { NULL }; - size_t dst_len = crc.uncompressed_size << 9; - int ret; - - if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || - crc.compressed_size << 9 > c->opts.encoded_extent_max) - return bch_err_throw(c, decompress_exceeded_max_encoded_extent); - - dst_data = dst_len == dst_iter.bi_size - ? 
__bio_map_or_bounce(c, dst, dst_iter, WRITE) - : __bounce_alloc(c, dst_len, WRITE); - - ret = __bio_uncompress(c, src, dst_data.b, crc); - if (ret) - goto err; - - if (dst_data.type != BB_NONE && - dst_data.type != BB_VMAP) - memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); -err: - bio_unmap_or_unbounce(c, dst_data); - return ret; -} - -static int attempt_compress(struct bch_fs *c, - void *workspace, - void *dst, size_t dst_len, - void *src, size_t src_len, - struct bch_compression_opt compression) -{ - enum bch_compression_type compression_type = - __bch2_compression_opt_to_type[compression.type]; - - switch (compression_type) { - case BCH_COMPRESSION_TYPE_lz4: - if (compression.level < LZ4HC_MIN_CLEVEL) { - int len = src_len; - int ret = LZ4_compress_destSize( - src, dst, - &len, dst_len, - workspace); - if (len < src_len) - return -len; - - return ret; - } else { - int ret = LZ4_compress_HC( - src, dst, - src_len, dst_len, - compression.level, - workspace); - - return ret ?: -1; - } - case BCH_COMPRESSION_TYPE_gzip: { - z_stream strm = { - .next_in = src, - .avail_in = src_len, - .next_out = dst, - .avail_out = dst_len, - }; - - zlib_set_workspace(&strm, workspace); - if (zlib_deflateInit2(&strm, - compression.level - ? clamp_t(unsigned, compression.level, - Z_BEST_SPEED, Z_BEST_COMPRESSION) - : Z_DEFAULT_COMPRESSION, - Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, - Z_DEFAULT_STRATEGY) != Z_OK) - return 0; - - if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) - return 0; - - if (zlib_deflateEnd(&strm) != Z_OK) - return 0; - - return strm.total_out; - } - case BCH_COMPRESSION_TYPE_zstd: { - /* - * rescale: - * zstd max compression level is 22, our max level is 15 - */ - unsigned level = min((compression.level * 3) / 2, zstd_max_clevel()); - ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max); - ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size); - - /* - * ZSTD requires that when we decompress we pass in the exact - * compressed size - rounding it up to the nearest sector - * doesn't work, so we use the first 4 bytes of the buffer for - * that. - * - * Additionally, the ZSTD code seems to have a bug where it will - * write just past the end of the buffer - so subtract a fudge - * factor (7 bytes) from the dst buffer size to account for - * that. 
- */ - size_t len = zstd_compress_cctx(ctx, - dst + 4, dst_len - 4 - 7, - src, src_len, - ¶ms); - if (zstd_is_error(len)) - return 0; - - *((__le32 *) dst) = cpu_to_le32(len); - return len + 4; - } - default: - BUG(); - } -} - -static unsigned __bio_compress(struct bch_fs *c, - struct bio *dst, size_t *dst_len, - struct bio *src, size_t *src_len, - struct bch_compression_opt compression) -{ - struct bbuf src_data = { NULL }, dst_data = { NULL }; - void *workspace; - enum bch_compression_type compression_type = - __bch2_compression_opt_to_type[compression.type]; - unsigned pad; - int ret = 0; - - /* bch2_compression_decode catches unknown compression types: */ - BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR); - - mempool_t *workspace_pool = &c->compress_workspace[compression.type]; - if (unlikely(!mempool_initialized(workspace_pool))) { - if (fsck_err(c, compression_opt_not_marked_in_sb, - "compression opt %s set but not marked in superblock", - bch2_compression_opts[compression.type])) { - ret = bch2_check_set_has_compressed_data(c, compression.type); - if (ret) /* memory allocation failure, don't compress */ - return 0; - } else { - return 0; - } - } - - /* If it's only one block, don't bother trying to compress: */ - if (src->bi_iter.bi_size <= c->opts.block_size) - return BCH_COMPRESSION_TYPE_incompressible; - - dst_data = bio_map_or_bounce(c, dst, WRITE); - src_data = bio_map_or_bounce(c, src, READ); - - workspace = mempool_alloc(workspace_pool, GFP_NOFS); - - *src_len = src->bi_iter.bi_size; - *dst_len = dst->bi_iter.bi_size; - - /* - * XXX: this algorithm sucks when the compression code doesn't tell us - * how much would fit, like LZ4 does: - */ - while (1) { - if (*src_len <= block_bytes(c)) { - ret = -1; - break; - } - - ret = attempt_compress(c, workspace, - dst_data.b, *dst_len, - src_data.b, *src_len, - compression); - if (ret > 0) { - *dst_len = ret; - ret = 0; - break; - } - - /* Didn't fit: should we retry with a smaller amount? 
*/ - if (*src_len <= *dst_len) { - ret = -1; - break; - } - - /* - * If ret is negative, it's a hint as to how much data would fit - */ - BUG_ON(-ret >= *src_len); - - if (ret < 0) - *src_len = -ret; - else - *src_len -= (*src_len - *dst_len) / 2; - *src_len = round_down(*src_len, block_bytes(c)); - } - - mempool_free(workspace, workspace_pool); - - if (ret) - goto err; - - /* Didn't get smaller: */ - if (round_up(*dst_len, block_bytes(c)) >= *src_len) - goto err; - - pad = round_up(*dst_len, block_bytes(c)) - *dst_len; - - memset(dst_data.b + *dst_len, 0, pad); - *dst_len += pad; - - if (dst_data.type != BB_NONE && - dst_data.type != BB_VMAP) - memcpy_to_bio(dst, dst->bi_iter, dst_data.b); - - BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); - BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); - BUG_ON(*dst_len & (block_bytes(c) - 1)); - BUG_ON(*src_len & (block_bytes(c) - 1)); - ret = compression_type; -out: - bio_unmap_or_unbounce(c, src_data); - bio_unmap_or_unbounce(c, dst_data); - return ret; -err: - ret = BCH_COMPRESSION_TYPE_incompressible; - goto out; -fsck_err: - ret = 0; - goto out; -} - -unsigned bch2_bio_compress(struct bch_fs *c, - struct bio *dst, size_t *dst_len, - struct bio *src, size_t *src_len, - unsigned compression_opt) -{ - unsigned orig_dst = dst->bi_iter.bi_size; - unsigned orig_src = src->bi_iter.bi_size; - unsigned compression_type; - - /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ - src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, - c->opts.encoded_extent_max); - /* Don't generate a bigger output than input: */ - dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - - compression_type = - __bio_compress(c, dst, dst_len, src, src_len, - bch2_compression_decode(compression_opt)); - - dst->bi_iter.bi_size = orig_dst; - src->bi_iter.bi_size = orig_src; - return compression_type; -} - -static int __bch2_fs_compress_init(struct bch_fs *, u64); - -#define BCH_FEATURE_none 0 - -static const unsigned bch2_compression_opt_to_feature[] = { -#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, - BCH_COMPRESSION_OPTS() -#undef x -}; - -#undef BCH_FEATURE_none - -static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) -{ - int ret = 0; - - if ((c->sb.features & f) == f) - return 0; - - mutex_lock(&c->sb_lock); - - if ((c->sb.features & f) == f) { - mutex_unlock(&c->sb_lock); - return 0; - } - - ret = __bch2_fs_compress_init(c, c->sb.features|f); - if (ret) { - mutex_unlock(&c->sb_lock); - return ret; - } - - c->disk_sb.sb->features[0] |= cpu_to_le64(f); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return 0; -} - -int bch2_check_set_has_compressed_data(struct bch_fs *c, - unsigned compression_opt) -{ - unsigned compression_type = bch2_compression_decode(compression_opt).type; - - BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); - - return compression_type - ? 
__bch2_check_set_has_compressed_data(c, - 1ULL << bch2_compression_opt_to_feature[compression_type]) - : 0; -} - -void bch2_fs_compress_exit(struct bch_fs *c) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) - mempool_exit(&c->compress_workspace[i]); - mempool_exit(&c->compression_bounce[WRITE]); - mempool_exit(&c->compression_bounce[READ]); -} - -static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) -{ - ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), - c->opts.encoded_extent_max); - - c->zstd_workspace_size = zstd_cctx_workspace_bound(¶ms.cParams); - - struct { - unsigned feature; - enum bch_compression_opts type; - size_t compress_workspace; - } compression_types[] = { - { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4, - max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, - { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip, - max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), - zlib_inflate_workspacesize()) }, - { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd, - max(c->zstd_workspace_size, - zstd_dctx_workspace_bound()) }, - }, *i; - bool have_compressed = false; - - for (i = compression_types; - i < compression_types + ARRAY_SIZE(compression_types); - i++) - have_compressed |= (features & (1 << i->feature)) != 0; - - if (!have_compressed) - return 0; - - if (!mempool_initialized(&c->compression_bounce[READ]) && - mempool_init_kvmalloc_pool(&c->compression_bounce[READ], - 1, c->opts.encoded_extent_max)) - return bch_err_throw(c, ENOMEM_compression_bounce_read_init); - - if (!mempool_initialized(&c->compression_bounce[WRITE]) && - mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], - 1, c->opts.encoded_extent_max)) - return bch_err_throw(c, ENOMEM_compression_bounce_write_init); - - for (i = compression_types; - i < compression_types + ARRAY_SIZE(compression_types); - i++) { - if (!(features & (1 << i->feature))) - continue; - - if (mempool_initialized(&c->compress_workspace[i->type])) - continue; - - if (mempool_init_kvmalloc_pool( - &c->compress_workspace[i->type], - 1, i->compress_workspace)) - return bch_err_throw(c, ENOMEM_compression_workspace_init); - } - - return 0; -} - -static u64 compression_opt_to_feature(unsigned v) -{ - unsigned type = bch2_compression_decode(v).type; - - return BIT_ULL(bch2_compression_opt_to_feature[type]); -} - -int bch2_fs_compress_init(struct bch_fs *c) -{ - u64 f = c->sb.features; - - f |= compression_opt_to_feature(c->opts.compression); - f |= compression_opt_to_feature(c->opts.background_compression); - - return __bch2_fs_compress_init(c, f); -} - -int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, - struct printbuf *err) -{ - char *val = kstrdup(_val, GFP_KERNEL); - char *p = val, *type_str, *level_str; - struct bch_compression_opt opt = { 0 }; - int ret; - - if (!val) - return -ENOMEM; - - type_str = strsep(&p, ":"); - level_str = p; - - ret = match_string(bch2_compression_opts, -1, type_str); - if (ret < 0 && err) - prt_printf(err, "invalid compression type\n"); - if (ret < 0) - goto err; - - opt.type = ret; - - if (level_str) { - unsigned level; - - ret = kstrtouint(level_str, 10, &level); - if (!ret && !opt.type && level) - ret = -EINVAL; - if (!ret && level > 15) - ret = -EINVAL; - if (ret < 0 && err) - prt_printf(err, "invalid compression level\n"); - if (ret < 0) - goto err; - - opt.level = level; - } - - *res = bch2_compression_encode(opt); -err: - kfree(val); - return ret; -} - -void bch2_compression_opt_to_text(struct printbuf *out, u64 v) -{ - struct 
bch_compression_opt opt = bch2_compression_decode(v); - - if (opt.type < BCH_COMPRESSION_OPT_NR) - prt_str(out, bch2_compression_opts[opt.type]); - else - prt_printf(out, "(unknown compression opt %u)", opt.type); - if (opt.level) - prt_printf(out, ":%u", opt.level); -} - -void bch2_opt_compression_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_sb *sb, - u64 v) -{ - return bch2_compression_opt_to_text(out, v); -} - -int bch2_opt_compression_validate(u64 v, struct printbuf *err) -{ - if (!bch2_compression_opt_valid(v)) { - prt_printf(err, "invalid compression opt %llu", v); - return -BCH_ERR_invalid_sb_opt_compression; - } - - return 0; -} diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h deleted file mode 100644 index bec2f05bfd52..000000000000 --- a/fs/bcachefs/compress.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_COMPRESS_H -#define _BCACHEFS_COMPRESS_H - -#include "extents_types.h" - -static const unsigned __bch2_compression_opt_to_type[] = { -#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, - BCH_COMPRESSION_OPTS() -#undef x -}; - -struct bch_compression_opt { - u8 type:4, - level:4; -}; - -static inline struct bch_compression_opt __bch2_compression_decode(unsigned v) -{ - return (struct bch_compression_opt) { - .type = v & 15, - .level = v >> 4, - }; -} - -static inline bool bch2_compression_opt_valid(unsigned v) -{ - struct bch_compression_opt opt = __bch2_compression_decode(v); - - return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level); -} - -static inline struct bch_compression_opt bch2_compression_decode(unsigned v) -{ - return bch2_compression_opt_valid(v) - ? __bch2_compression_decode(v) - : (struct bch_compression_opt) { 0 }; -} - -static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) -{ - return opt.type|(opt.level << 4); -} - -static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) -{ - return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; -} - -struct bch_write_op; -int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *); -int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, - struct bvec_iter, struct bch_extent_crc_unpacked); -unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, - struct bio *, size_t *, unsigned); - -int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); -void bch2_fs_compress_exit(struct bch_fs *); -int bch2_fs_compress_init(struct bch_fs *); - -void bch2_compression_opt_to_text(struct printbuf *, u64); - -int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); -void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); -int bch2_opt_compression_validate(u64, struct printbuf *); - -#define bch2_opt_compression (struct bch_opt_fn) { \ - .parse = bch2_opt_compression_parse, \ - .to_text = bch2_opt_compression_to_text, \ - .validate = bch2_opt_compression_validate, \ -} - -#endif /* _BCACHEFS_COMPRESS_H */ diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c deleted file mode 100644 index e86d36d23e9e..000000000000 --- a/fs/bcachefs/darray.c +++ /dev/null @@ -1,38 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/log2.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include "darray.h" - -int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) -{ - if (new_size > d->size) { - 
new_size = roundup_pow_of_two(new_size); - - /* - * This is a workaround: kvmalloc() doesn't support > INT_MAX - * allocations, but vmalloc() does. - * The limit needs to be lifted from kvmalloc, and when it does - * we'll go back to just using that. - */ - size_t bytes; - if (unlikely(check_mul_overflow(new_size, element_size, &bytes))) - return -ENOMEM; - - void *data = likely(bytes < INT_MAX) - ? kvmalloc_noprof(bytes, gfp) - : vmalloc_noprof(bytes); - if (!data) - return -ENOMEM; - - if (d->size) - memcpy(data, d->data, d->size * element_size); - if (d->data != d->preallocated) - kvfree(d->data); - d->data = data; - d->size = new_size; - } - - return 0; -} diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h deleted file mode 100644 index 4080ee99aadd..000000000000 --- a/fs/bcachefs/darray.h +++ /dev/null @@ -1,158 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DARRAY_H -#define _BCACHEFS_DARRAY_H - -/* - * Dynamic arrays: - * - * Inspired by CCAN's darray - */ - -#include <linux/cleanup.h> -#include <linux/slab.h> - -#define DARRAY_PREALLOCATED(_type, _nr) \ -struct { \ - size_t nr, size; \ - _type *data; \ - _type preallocated[_nr]; \ -} - -#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) - -typedef DARRAY(char) darray_char; -typedef DARRAY(char *) darray_str; -typedef DARRAY(const char *) darray_const_str; - -typedef DARRAY(u8) darray_u8; -typedef DARRAY(u16) darray_u16; -typedef DARRAY(u32) darray_u32; -typedef DARRAY(u64) darray_u64; - -typedef DARRAY(s8) darray_s8; -typedef DARRAY(s16) darray_s16; -typedef DARRAY(s32) darray_s32; -typedef DARRAY(s64) darray_s64; - -int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); - -#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__)) - -#define __darray_resize(_d, _element_size, _new_size, _gfp) \ - (unlikely((_new_size) > (_d)->size) \ - ? 
__bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\ - : 0) - -#define darray_resize_gfp(_d, _new_size, _gfp) \ - __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp) - -#define darray_resize(_d, _new_size) \ - darray_resize_gfp(_d, _new_size, GFP_KERNEL) - -#define darray_make_room_gfp(_d, _more, _gfp) \ - darray_resize_gfp((_d), (_d)->nr + (_more), _gfp) - -#define darray_make_room(_d, _more) \ - darray_make_room_gfp(_d, _more, GFP_KERNEL) - -#define darray_room(_d) ((_d).size - (_d).nr) - -#define darray_top(_d) ((_d).data[(_d).nr]) - -#define darray_push_gfp(_d, _item, _gfp) \ -({ \ - int _ret = darray_make_room_gfp((_d), 1, _gfp); \ - \ - if (!_ret) \ - (_d)->data[(_d)->nr++] = (_item); \ - _ret; \ -}) - -#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL) - -#define darray_pop(_d) ((_d)->data[--(_d)->nr]) - -#define darray_first(_d) ((_d).data[0]) -#define darray_last(_d) ((_d).data[(_d).nr - 1]) - -#define darray_insert_item(_d, pos, _item) \ -({ \ - size_t _pos = (pos); \ - int _ret = darray_make_room((_d), 1); \ - \ - if (!_ret) \ - array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \ - _ret; \ -}) - -#define darray_remove_item(_d, _pos) \ - array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) - -#define darray_find_p(_d, _i, cond) \ -({ \ - typeof((_d).data) _ret = NULL; \ - \ - darray_for_each(_d, _i) \ - if (cond) { \ - _ret = _i; \ - break; \ - } \ - _ret; \ -}) - -#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item) - -/* Iteration: */ - -#define __darray_for_each(_d, _i) \ - for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) - -#define darray_for_each(_d, _i) \ - for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) - -#define darray_for_each_reverse(_d, _i) \ - for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) - -/* Init/exit */ - -#define darray_init(_d) \ -do { \ - (_d)->nr = 0; \ - (_d)->size = ARRAY_SIZE((_d)->preallocated); \ - (_d)->data = (_d)->size ? 
(_d)->preallocated : NULL; \ -} while (0) - -#define darray_exit(_d) \ -do { \ - if (!ARRAY_SIZE((_d)->preallocated) || \ - (_d)->data != (_d)->preallocated) \ - kvfree((_d)->data); \ - darray_init(_d); \ -} while (0) - -#define DEFINE_DARRAY_CLASS(_type) \ -DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void) - -#define DEFINE_DARRAY(_type) \ -typedef DARRAY(_type) darray_##_type; \ -DEFINE_DARRAY_CLASS(darray_##_type) - -#define DEFINE_DARRAY_NAMED(_name, _type) \ -typedef DARRAY(_type) _name; \ -DEFINE_DARRAY_CLASS(_name) - -DEFINE_DARRAY_CLASS(darray_char); -DEFINE_DARRAY_CLASS(darray_str) -DEFINE_DARRAY_CLASS(darray_const_str) - -DEFINE_DARRAY_CLASS(darray_u8) -DEFINE_DARRAY_CLASS(darray_u16) -DEFINE_DARRAY_CLASS(darray_u32) -DEFINE_DARRAY_CLASS(darray_u64) - -DEFINE_DARRAY_CLASS(darray_s8) -DEFINE_DARRAY_CLASS(darray_s16) -DEFINE_DARRAY_CLASS(darray_s32) -DEFINE_DARRAY_CLASS(darray_s64) - -#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c deleted file mode 100644 index e848e210a9bf..000000000000 --- a/fs/bcachefs/data_update.c +++ /dev/null @@ -1,1021 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "compress.h" -#include "data_update.h" -#include "disk_groups.h" -#include "ec.h" -#include "error.h" -#include "extents.h" -#include "io_write.h" -#include "keylist.h" -#include "move.h" -#include "nocow_locking.h" -#include "rebalance.h" -#include "snapshot.h" -#include "subvolume.h" -#include "trace.h" - -#include <linux/ioprio.h> - -static const char * const bch2_data_update_type_strs[] = { -#define x(t, n, ...) [n] = #t, - BCH_DATA_UPDATE_TYPES() -#undef x - NULL -}; - -static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) - bch2_dev_put(bch2_dev_have_ref(c, ptr->dev)); -} - -static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) { - if (unlikely(!bch2_dev_tryget(c, ptr->dev))) { - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; - bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); - } - return false; - } - } - return true; -} - -static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - } -} - -static noinline_for_stack -bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs, - const struct bch_extent_ptr *start) -{ - if (!ctxt) { - bkey_for_each_ptr(ptrs, ptr) { - if (ptr == start) - break; - - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - } - return false; - } - - __bkey_for_each_ptr(start, ptrs.end, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - bool locked; - move_ctxt_wait_event(ctxt, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || - list_empty(&ctxt->ios)); - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); - } - return true; -} - -static bool bkey_nocow_lock(struct bch_fs *c, 
struct moving_context *ctxt, struct bkey_ptrs_c ptrs) -{ - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) - return __bkey_nocow_lock(c, ctxt, ptrs, ptr); - } - - return true; -} - -noinline_for_stack -static void trace_io_move_finish2(struct data_update *u, - struct bkey_i *new, - struct bkey_i *insert) -{ - struct bch_fs *c = u->op.c; - struct printbuf buf = PRINTBUF; - - prt_newline(&buf); - - bch2_data_update_to_text(&buf, u); - prt_newline(&buf); - - prt_str_indented(&buf, "new replicas:\t"); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); - prt_newline(&buf); - - prt_str_indented(&buf, "insert:\t"); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - prt_newline(&buf); - - trace_io_move_finish(c, buf.buf); - printbuf_exit(&buf); -} - -noinline_for_stack -static void trace_io_move_fail2(struct data_update *m, - struct bkey_s_c new, - struct bkey_s_c wrote, - struct bkey_i *insert, - const char *msg) -{ - struct bch_fs *c = m->op.c; - struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - struct printbuf buf = PRINTBUF; - unsigned rewrites_found = 0; - - if (!trace_io_move_fail_enabled()) - return; - - prt_str(&buf, msg); - - if (insert) { - const union bch_extent_entry *entry; - struct bch_extent_ptr *ptr; - struct extent_ptr_decoded p; - - unsigned ptr_bit = 1; - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { - if ((ptr_bit & m->data_opts.rewrite_ptrs) && - (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && - !ptr->cached) - rewrites_found |= ptr_bit; - ptr_bit <<= 1; - } - } - - prt_str(&buf, "rewrites found:\t"); - bch2_prt_u64_base2(&buf, rewrites_found); - prt_newline(&buf); - - bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, new); - - prt_str(&buf, "\nwrote: "); - bch2_bkey_val_to_text(&buf, c, wrote); - - if (insert) { - prt_str(&buf, "\ninsert: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - } - - trace_io_move_fail(c, buf.buf); - printbuf_exit(&buf); -} - -noinline_for_stack -static void trace_data_update2(struct data_update *m, - struct bkey_s_c old, struct bkey_s_c k, - struct bkey_i *insert) -{ - struct bch_fs *c = m->op.c; - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_data_update(c, buf.buf); - printbuf_exit(&buf); -} - -noinline_for_stack -static void trace_io_move_created_rebalance2(struct data_update *m, - struct bkey_s_c old, struct bkey_s_c k, - struct bkey_i *insert) -{ - struct bch_fs *c = m->op.c; - struct printbuf buf = PRINTBUF; - - bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_io_move_created_rebalance(c, buf.buf); - printbuf_exit(&buf); - - this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]); -} - -noinline_for_stack -static int data_update_invalid_bkey(struct data_update *m, - struct bkey_s_c old, struct bkey_s_c k, - struct bkey_i *insert) -{ - 
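trace_io_move_fail2() above pays for its string formatting only when the tracepoint has a consumer — note the early `if (!trace_io_move_fail_enabled()) return;` before any printbuf work. The general guard-before-format pattern, sketched with hypothetical names (trace_enabled and emit_trace() are invented stand-ins for the tracepoint machinery):

#include <stdbool.h>
#include <stdio.h>

static bool trace_enabled;		/* stands in for trace_*_enabled() */

static void emit_trace(const char *msg)	/* stands in for the tracepoint itself */
{
	fputs(msg, stderr);
}

static void trace_slow_path(int err, const char *what)
{
	char buf[128];

	if (!trace_enabled)
		return;			/* skip the expensive formatting entirely */

	snprintf(buf, sizeof(buf), "op failed: %s err %d\n", what, err);
	emit_trace(buf);
}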
struct bch_fs *c = m->op.c; - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_str(&buf, "about to insert invalid key in data update path"); - prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - prt_newline(&buf); - - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - return bch_err_throw(c, invalid_bkey); -} - -static int __bch2_data_update_index_update(struct btree_trans *trans, - struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_iter iter; - struct data_update *m = container_of(op, struct data_update, op); - int ret = 0; - - bch2_trans_iter_init(trans, &iter, m->btree_id, - bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k), - BTREE_ITER_slots|BTREE_ITER_intent); - - while (1) { - struct bkey_s_c k; - struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - struct bkey_i *insert = NULL; - struct bkey_i_extent *new; - const union bch_extent_entry *entry_c; - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bch_extent_ptr *ptr; - const struct bch_extent_ptr *ptr_c; - struct bpos next_pos; - bool should_check_enospc; - s64 i_sectors_delta = 0, disk_sectors_delta = 0; - unsigned rewrites_found = 0, durability, ptr_bit; - - bch2_trans_begin(trans); - - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys)); - - if (!bch2_extents_match(k, old)) { - trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), - NULL, "no match:"); - goto nowork; - } - - insert = bch2_trans_kmalloc(trans, - bkey_bytes(k.k) + - bkey_val_bytes(&new->k) + - sizeof(struct bch_extent_rebalance)); - ret = PTR_ERR_OR_ZERO(insert); - if (ret) - goto err; - - bkey_reassemble(insert, k); - - new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k)); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys)); - bch2_cut_front(iter.pos, &new->k_i); - - bch2_cut_front(iter.pos, insert); - bch2_cut_back(new->k.p, insert); - bch2_cut_back(insert->k.p, &new->k_i); - - /* - * @old: extent that we read from - * @insert: key that we're going to update, initialized from - * extent currently in btree - same as @old unless we raced with - * other updates - * @new: extent with new pointers that we'll be adding to @insert - * - * Fist, drop rewrite_ptrs from @new: - */ - ptr_bit = 1; - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { - if ((ptr_bit & m->data_opts.rewrite_ptrs) && - (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && - !ptr->cached) { - bch2_extent_ptr_set_cached(c, &m->op.opts, - bkey_i_to_s(insert), ptr); - rewrites_found |= ptr_bit; - } - ptr_bit <<= 1; - } - - if (m->data_opts.rewrite_ptrs && - !rewrites_found && - bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { - trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); - goto nowork; - } - - /* - * A replica that we just wrote might conflict with a replica - * that we want to keep, due to racing with another move: - */ -restart_drop_conflicting_replicas: - extent_for_each_ptr(extent_i_to_s(new), ptr) - if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) && - !ptr_c->cached) { - 
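The index-update path above carries a per-pointer bitmask (data_opts.rewrite_ptrs) and walks the extent with a `ptr_bit <<= 1` cursor, so pointer i pairs with bit i. A reduced sketch of that walk, marking the selected entries cached the way bch2_extent_ptr_set_cached() does — struct and function names invented:

#include <stdbool.h>

struct ptr { bool cached; };

/* mark every pointer selected by @rewrite_mask as cached; return the
 * bits actually acted on (cf. rewrites_found above) */
static unsigned mark_rewrites(struct ptr *ptrs, unsigned nr, unsigned rewrite_mask)
{
	unsigned found = 0, bit = 1;

	for (unsigned i = 0; i < nr; i++, bit <<= 1)
		if ((bit & rewrite_mask) && !ptrs[i].cached) {
			ptrs[i].cached = true;
			found |= bit;
		}
	return found;
}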
bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr); - goto restart_drop_conflicting_replicas; - } - - if (!bkey_val_u64s(&new->k)) { - trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); - goto nowork; - } - - /* Now, drop pointers that conflict with what we just wrote: */ - extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) - if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev))) - bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); - - durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) + - bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); - - /* Now, drop excess replicas: */ - scoped_guard(rcu) { -restart_drop_extra_replicas: - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { - unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); - - if (!p.ptr.cached && - durability - ptr_durability >= m->op.opts.data_replicas) { - durability -= ptr_durability; - - bch2_extent_ptr_set_cached(c, &m->op.opts, - bkey_i_to_s(insert), &entry->ptr); - goto restart_drop_extra_replicas; - } - } - } - - /* Finally, add the pointers we just wrote: */ - extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) - bch2_extent_ptr_decoded_append(insert, &p); - - bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); - bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert)); - - ret = bch2_sum_sector_overwrites(trans, &iter, insert, - &should_check_enospc, - &i_sectors_delta, - &disk_sectors_delta); - if (ret) - goto err; - - if (disk_sectors_delta > (s64) op->res.sectors) { - ret = bch2_disk_reservation_add(c, &op->res, - disk_sectors_delta - op->res.sectors, - !should_check_enospc - ? BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - goto out; - } - - next_pos = insert->k.p; - - /* - * Check for nonce offset inconsistency: - * This is debug code - we've been seeing this bug rarely, and - * it's been hard to reproduce, so this should give us some more - * information when it does occur: - */ - int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), - (struct bkey_validate_context) { - .btree = m->btree_id, - .flags = BCH_VALIDATE_commit, - }); - if (unlikely(invalid)) { - ret = data_update_invalid_bkey(m, old, k, insert); - goto out; - } - - ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?: - bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: - bch2_insert_snapshot_whiteouts(trans, m->btree_id, - k.k->p, bkey_start_pos(&insert->k)) ?: - bch2_insert_snapshot_whiteouts(trans, m->btree_id, - k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: - bch2_trans_update(trans, &iter, insert, - BTREE_UPDATE_internal_snapshot_node); - if (ret) - goto err; - - if (trace_data_update_enabled()) - trace_data_update2(m, old, k, insert); - - if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size > - bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) - trace_io_move_created_rebalance2(m, old, k, insert); - - ret = bch2_trans_commit(trans, &op->res, - NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc| - m->data_opts.btree_insert_flags); - if (ret) - goto err; - - bch2_btree_iter_set_pos(trans, &iter, next_pos); - - this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); - if (trace_io_move_finish_enabled()) - trace_io_move_finish2(m, &new->k_i, insert); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - if (ret) - break; -next: - while (bkey_ge(iter.pos, 
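The restart_drop_extra_replicas loop above trims replication back down to opts.data_replicas by repeatedly demoting one pointer whose durability the extent can spare, restarting the scan after each change. Linearized into a self-contained sketch (a single pass is equivalent here because the spare-durability condition only shrinks):

#include <stdbool.h>

struct replica { unsigned durability; bool cached; };

static void drop_excess(struct replica *r, unsigned nr, unsigned target)
{
	unsigned total = 0;

	for (unsigned i = 0; i < nr; i++)
		if (!r[i].cached)
			total += r[i].durability;

	/* demote replicas only while the remainder still meets @target */
	for (unsigned i = 0; i < nr; i++)
		if (!r[i].cached && total - r[i].durability >= target) {
			total -= r[i].durability;
			r[i].cached = true;
		}
}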
bch2_keylist_front(&op->insert_keys)->k.p)) { - bch2_keylist_pop_front(&op->insert_keys); - if (bch2_keylist_empty(&op->insert_keys)) - goto out; - } - continue; -nowork: - if (m->stats) { - BUG_ON(k.k->p.offset <= iter.pos.offset); - atomic64_inc(&m->stats->keys_raced); - atomic64_add(k.k->p.offset - iter.pos.offset, - &m->stats->sectors_raced); - } - - count_event(c, io_move_fail); - - bch2_btree_iter_advance(trans, &iter); - goto next; - } -out: - bch2_trans_iter_exit(trans, &iter); - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); - return ret; -} - -int bch2_data_update_index_update(struct bch_write_op *op) -{ - return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); -} - -void bch2_data_update_read_done(struct data_update *m) -{ - m->read_done = true; - - /* write bio must own pages: */ - BUG_ON(!m->op.wbio.bio.bi_vcnt); - - m->op.crc = m->rbio.pick.crc; - m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; - - this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size); - - closure_call(&m->op.cl, bch2_write, NULL, NULL); -} - -void bch2_data_update_exit(struct data_update *update) -{ - struct bch_fs *c = update->op.c; - struct bkey_s_c k = bkey_i_to_s_c(update->k.k); - - bch2_bio_free_pages_pool(c, &update->op.wbio.bio); - kfree(update->bvecs); - update->bvecs = NULL; - - if (c->opts.nocow_enabled) - bkey_nocow_unlock(c, k); - bkey_put_dev_refs(c, k); - bch2_disk_reservation_put(c, &update->op.res); - bch2_bkey_buf_exit(&update->k, c); -} - -static noinline_for_stack -int bch2_update_unwritten_extent(struct btree_trans *trans, - struct data_update *update) -{ - struct bch_fs *c = update->op.c; - struct bkey_i_extent *e; - struct write_point *wp; - struct closure cl; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - closure_init_stack(&cl); - bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); - - while (bpos_lt(update->op.pos, update->k.k->k.p)) { - unsigned sectors = update->k.k->k.p.offset - - update->op.pos.offset; - - bch2_trans_begin(trans); - - bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, - BTREE_ITER_slots); - ret = lockrestart_do(trans, ({ - k = bch2_btree_iter_peek_slot(trans, &iter); - bkey_err(k); - })); - bch2_trans_iter_exit(trans, &iter); - - if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k))) - break; - - e = bkey_extent_init(update->op.insert_keys.top); - e->k.p = update->op.pos; - - ret = bch2_alloc_sectors_start_trans(trans, - update->op.target, - false, - update->op.write_point, - &update->op.devs_have, - update->op.nr_replicas, - update->op.nr_replicas, - update->op.watermark, - 0, &cl, &wp); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { - bch2_trans_unlock(trans); - closure_sync(&cl); - continue; - } - - bch_err_fn_ratelimited(c, ret); - - if (ret) - break; - - sectors = min(sectors, wp->sectors_free); - - bch2_key_resize(&e->k, sectors); - - bch2_open_bucket_get(c, wp, &update->op.open_buckets); - bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); - bch2_alloc_sectors_done(c, wp); - - update->op.pos.offset += sectors; - - extent_for_each_ptr(extent_i_to_s(e), ptr) - ptr->unwritten = true; - bch2_keylist_push(&update->op.insert_keys); - - ret = __bch2_data_update_index_update(trans, &update->op); - - bch2_open_buckets_put(c, &update->op.open_buckets); - - if (ret) - break; - } - - if (closure_nr_remaining(&cl) != 1) { - bch2_trans_unlock(trans); - closure_sync(&cl); - } - - return ret; -} - -void 
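bch2_update_unwritten_extent() above allocates in a loop: when bch2_alloc_sectors_start_trans() reports BCH_ERR_operation_blocked it unlocks, waits on the closure, and retries rather than failing. The control flow reduced to a sketch — alloc_try(), wait_for_space(), and the toy free_sectors pool are all invented, with -EAGAIN standing in for operation_blocked:

#include <errno.h>

static unsigned free_sectors = 8;

static int alloc_try(unsigned sectors)
{
	if (sectors > free_sectors)
		return -EAGAIN;		/* "blocked": space may appear later */
	free_sectors -= sectors;
	return 0;
}

static void wait_for_space(void)	/* stands in for closure_sync() */
{
	free_sectors += 8;		/* pretend an IO completed and freed space */
}

static int alloc_blocking(unsigned sectors)
{
	for (;;) {
		int ret = alloc_try(sectors);
		if (ret != -EAGAIN)
			return ret;	/* success or hard error */
		wait_for_space();	/* drop locks, wait, retry */
	}
}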
bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 20); - - prt_str_indented(out, "rewrite ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); - prt_newline(out); - - prt_str_indented(out, "kill ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->kill_ptrs); - prt_newline(out); - - prt_str_indented(out, "target:\t"); - bch2_target_to_text(out, c, data_opts->target); - prt_newline(out); - - prt_str_indented(out, "compression:\t"); - bch2_compression_opt_to_text(out, io_opts->background_compression); - prt_newline(out); - - prt_str_indented(out, "opts.replicas:\t"); - prt_u64(out, io_opts->data_replicas); - prt_newline(out); - - prt_str_indented(out, "extra replicas:\t"); - prt_u64(out, data_opts->extra_replicas); - prt_newline(out); - - prt_str_indented(out, "scrub:\t"); - prt_u64(out, data_opts->scrub); -} - -void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) -{ - prt_str(out, bch2_data_update_type_strs[m->type]); - prt_newline(out); - - bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); - prt_newline(out); - - prt_str_indented(out, "old key:\t"); - bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); -} - -void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) -{ - bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); - prt_newline(out); - printbuf_indent_add(out, 2); - bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); - - if (!m->read_done) { - prt_printf(out, "read:\n"); - printbuf_indent_add(out, 2); - bch2_read_bio_to_text(out, &m->rbio); - } else { - prt_printf(out, "write:\n"); - printbuf_indent_add(out, 2); - bch2_write_op_to_text(out, &m->op); - } - printbuf_indent_sub(out, 4); -} - -int bch2_extent_drop_ptrs(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct bch_fs *c = trans->c; - struct bkey_i *n; - int ret; - - n = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - while (data_opts->kill_ptrs) { - unsigned i = 0, drop = __fls(data_opts->kill_ptrs); - - bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop); - data_opts->kill_ptrs ^= 1U << drop; - } - - /* - * If the new extent no longer has any pointers, bch2_extent_normalize() - * will do the appropriate thing with it (turning it into a - * KEY_TYPE_error key, or just a discard if it was a cached extent) - */ - bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n)); - - /* - * Since we're not inserting through an extent iterator - * (BTREE_ITER_all_snapshots iterators aren't extent iterators), - * we aren't using the extent overwrite path to delete, we're - * just using the normal key deletion path: - */ - if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents)) - n->k.size = 0; - - return bch2_trans_relock(trans) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -} - -static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts, - unsigned buf_bytes) -{ - unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); - - m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); - if (!m->bvecs) - return -ENOMEM; - - bio_init(&m->rbio.bio, NULL, m->bvecs, 
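bch2_extent_drop_ptrs() above consumes its kill_ptrs mask one bit at a time: __fls() picks the highest set bit, the pointer at that index is dropped, and the bit is cleared with xor until the mask is empty. The same idiom in portable C, with __builtin_clz standing in for the kernel's __fls():

/* visit each set bit of @mask from highest to lowest */
static void for_each_kill_bit(unsigned mask, void (*drop)(unsigned idx))
{
	while (mask) {
		unsigned i = 31 - __builtin_clz(mask);	/* highest set bit */
		drop(i);
		mask ^= 1U << i;			/* clear it, as above */
	}
}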
nr_vecs, REQ_OP_READ); - bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); - - if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) { - kfree(m->bvecs); - m->bvecs = NULL; - return -ENOMEM; - } - - rbio_init(&m->rbio.bio, c, *io_opts, NULL); - m->rbio.data_update = true; - m->rbio.bio.bi_iter.bi_size = buf_bytes; - m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k); - m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - return 0; -} - -int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - /* write path might have to decompress data: */ - unsigned buf_bytes = 0; - bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) - buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); - - return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); -} - -static int can_write_extent(struct bch_fs *c, struct data_update *m) -{ - if ((m->op.flags & BCH_WRITE_alloc_nowait) && - unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) - return bch_err_throw(c, data_update_done_would_block); - - unsigned target = m->op.flags & BCH_WRITE_only_specified_devs - ? m->op.target - : 0; - struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); - - darray_for_each(m->op.devs_have, i) - __clear_bit(*i, devs.d); - - guard(rcu)(); - - unsigned nr_replicas = 0, i; - for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { - struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); - if (!ca) - continue; - - struct bch_dev_usage usage; - bch2_dev_usage_read_fast(ca, &usage); - - if (!dev_buckets_free(ca, usage, m->op.watermark)) - continue; - - nr_replicas += ca->mi.durability; - if (nr_replicas >= m->op.nr_replicas) - break; - } - - if (!nr_replicas) - return bch_err_throw(c, data_update_done_no_rw_devs); - if (nr_replicas < m->op.nr_replicas) - return bch_err_throw(c, insufficient_devices); - return 0; -} - -int bch2_data_update_init(struct btree_trans *trans, - struct btree_iter *iter, - struct moving_context *ctxt, - struct data_update *m, - struct write_point_specifier wp, - struct bch_io_opts *io_opts, - struct data_update_opts data_opts, - enum btree_id btree_id, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - if (k.k->p.snapshot) { - ret = bch2_check_key_has_snapshot(trans, iter, k); - if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) { - /* Can't repair yet, waiting on other recovery passes */ - return bch_err_throw(c, data_update_done_no_snapshot); - } - if (ret < 0) - return ret; - if (ret) /* key was deleted */ - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - bch_err_throw(c, data_update_done_no_snapshot); - ret = 0; - } - - bch2_bkey_buf_init(&m->k); - bch2_bkey_buf_reassemble(&m->k, c, k); - m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc - ? BCH_DATA_UPDATE_copygc - : BCH_DATA_UPDATE_rebalance; - m->btree_id = btree_id; - m->data_opts = data_opts; - m->ctxt = ctxt; - m->stats = ctxt ? 
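can_write_extent() above is a cheap admission check done before the read: sum the durability of rw devices in the target that the extent doesn't already live on, and fail early if that can't reach nr_replicas — avoiding a useless read whose write leg would only fail later in bch2_alloc_sectors_start(). A sketch of the accounting over a toy device table; the struct, fields, and error mapping are illustrative only:

#include <errno.h>
#include <stdbool.h>

struct dev { bool rw, has_free_buckets, already_used; unsigned durability; };

static int can_write(const struct dev *devs, unsigned nr, unsigned nr_replicas)
{
	unsigned d = 0;

	for (unsigned i = 0; i < nr; i++) {
		if (!devs[i].rw || devs[i].already_used || !devs[i].has_free_buckets)
			continue;
		d += devs[i].durability;
		if (d >= nr_replicas)
			return 0;
	}
	/* roughly: no_rw_devs vs insufficient_devices above */
	return !d ? -EROFS : -ENOSPC;
}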
ctxt->stats : NULL; - - bch2_write_op_init(&m->op, c, *io_opts); - m->op.pos = bkey_start_pos(k.k); - m->op.version = k.k->bversion; - m->op.target = data_opts.target; - m->op.write_point = wp; - m->op.nr_replicas = 0; - m->op.flags |= BCH_WRITE_pages_stable| - BCH_WRITE_pages_owned| - BCH_WRITE_data_encoded| - BCH_WRITE_move| - m->data_opts.write_flags; - m->op.compression_opt = io_opts->background_compression; - m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - - unsigned durability_have = 0, durability_removing = 0; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; - unsigned buf_bytes = 0; - bool unwritten = false; - - unsigned ptr_bit = 1; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (!p.ptr.cached) { - guard(rcu)(); - if (ptr_bit & m->data_opts.rewrite_ptrs) { - if (crc_is_compressed(p.crc)) - reserve_sectors += k.k->size; - - m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); - durability_removing += bch2_extent_ptr_desired_durability(c, &p); - } else if (!(ptr_bit & m->data_opts.kill_ptrs)) { - bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); - durability_have += bch2_extent_ptr_durability(c, &p); - } - } - - /* - * op->csum_type is normally initialized from the fs/file's - * current options - but if an extent is encrypted, we require - * that it stays encrypted: - */ - if (bch2_csum_type_is_encryption(p.crc.csum_type)) { - m->op.nonce = p.crc.nonce + p.crc.offset; - m->op.csum_type = p.crc.csum_type; - } - - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) - m->op.incompressible = true; - - buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); - unwritten |= p.ptr.unwritten; - - ptr_bit <<= 1; - } - - unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); - - /* - * If current extent durability is less than io_opts.data_replicas, - * we're not trying to rereplicate the extent up to data_replicas here - - * unless extra_replicas was specified - * - * Increasing replication is an explicit operation triggered by - * rereplicate, currently, so that users don't get an unexpected -ENOSPC - */ - m->op.nr_replicas = min(durability_removing, durability_required) + - m->data_opts.extra_replicas; - - /* - * If device(s) were set to durability=0 after data was written to them - * we can end up with a duribilty=0 extent, and the normal algorithm - * that tries not to increase durability doesn't work: - */ - if (!(durability_have + durability_removing)) - m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); - - m->op.nr_replicas_required = m->op.nr_replicas; - - /* - * It might turn out that we don't need any new replicas, if the - * replicas or durability settings have been changed since the extent - * was written: - */ - if (!m->op.nr_replicas) { - m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; - m->data_opts.rewrite_ptrs = 0; - /* if iter == NULL, it's just a promote */ - if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); - if (!ret) - ret = bch_err_throw(c, data_update_done_no_writes_needed); - goto out_bkey_buf_exit; - } - - /* - * Check if the allocation will succeed, to avoid getting an error later - * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless - * read: - * - * This guards against - * - BCH_WRITE_alloc_nowait allocations failing (promotes) - * - 
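The replica arithmetic above deliberately avoids silently re-replicating: the new write gets min(durability_removing, durability_required) + extra_replicas copies. Worked through as a sketch — e.g. rewriting one durability=1 pointer of an extent that already keeps one other durability=1 copy (have=1, removing=1, data_replicas=2) yields exactly one new replica:

static unsigned new_write_replicas(unsigned durability_have,
				   unsigned durability_removing,
				   unsigned data_replicas,
				   unsigned extra_replicas)
{
	unsigned required = data_replicas > durability_have
		? data_replicas - durability_have : 0;
	unsigned n = (durability_removing < required
		      ? durability_removing : required) + extra_replicas;

	/* degenerate durability=0 extents still get one copy, as above */
	if (!durability_have && !durability_removing && n < 1)
		n = 1;
	return n;
}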
Destination target full - * - Device(s) in destination target offline - * - Insufficient durability available in destination target - * (i.e. trying to move a durability=2 replica to a target with a - * single durability=2 device) - */ - ret = can_write_extent(c, m); - if (ret) - goto out_bkey_buf_exit; - - if (reserve_sectors) { - ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, - m->data_opts.extra_replicas - ? 0 - : BCH_DISK_RESERVATION_NOFAIL); - if (ret) - goto out_bkey_buf_exit; - } - - if (!bkey_get_dev_refs(c, k)) { - ret = bch_err_throw(c, data_update_done_no_dev_refs); - goto out_put_disk_res; - } - - if (c->opts.nocow_enabled && - !bkey_nocow_lock(c, ctxt, ptrs)) { - ret = bch_err_throw(c, nocow_lock_blocked); - goto out_put_dev_refs; - } - - if (unwritten) { - ret = bch2_update_unwritten_extent(trans, m) ?: - bch_err_throw(c, data_update_done_unwritten); - goto out_nocow_unlock; - } - - bch2_trans_unlock(trans); - - ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); - if (ret) - goto out_nocow_unlock; - - return 0; -out_nocow_unlock: - if (c->opts.nocow_enabled) - bkey_nocow_unlock(c, k); -out_put_dev_refs: - bkey_put_dev_refs(c, k); -out_put_disk_res: - bch2_disk_reservation_put(c, &m->op.res); -out_bkey_buf_exit: - bch2_bkey_buf_exit(&m->k, c); - return ret; -} - -void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned ptr_bit = 1; - - bkey_for_each_ptr(ptrs, ptr) { - if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) { - opts->kill_ptrs |= ptr_bit; - opts->rewrite_ptrs ^= ptr_bit; - } - - ptr_bit <<= 1; - } -} diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h deleted file mode 100644 index 5e14d13568de..000000000000 --- a/fs/bcachefs/data_update.h +++ /dev/null @@ -1,93 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _BCACHEFS_DATA_UPDATE_H -#define _BCACHEFS_DATA_UPDATE_H - -#include "bkey_buf.h" -#include "io_read.h" -#include "io_write_types.h" - -struct moving_context; - -struct data_update_opts { - unsigned rewrite_ptrs; - unsigned kill_ptrs; - u16 target; - u8 extra_replicas; - unsigned btree_insert_flags; - unsigned write_flags; - - int read_dev; - bool scrub; -}; - -void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, - struct bch_io_opts *, struct data_update_opts *); - -#define BCH_DATA_UPDATE_TYPES() \ - x(copygc, 0) \ - x(rebalance, 1) \ - x(promote, 2) - -enum bch_data_update_types { -#define x(n, id) BCH_DATA_UPDATE_##n = id, - BCH_DATA_UPDATE_TYPES() -#undef x -}; - -struct data_update { - enum bch_data_update_types type; - /* extent being updated: */ - bool read_done; - enum btree_id btree_id; - struct bkey_buf k; - struct data_update_opts data_opts; - struct moving_context *ctxt; - struct bch_move_stats *stats; - - struct bch_read_bio rbio; - struct bch_write_op op; - struct bio_vec *bvecs; -}; - -struct promote_op { - struct rcu_head rcu; - u64 start_time; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - - struct rhash_head hash; - struct bpos pos; - - struct work_struct work; - struct data_update write; - struct bio_vec bi_inline_vecs[]; /* must be last */ -}; - -void bch2_data_update_to_text(struct printbuf *, struct data_update *); -void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); - -int bch2_data_update_index_update(struct bch_write_op *); - -void bch2_data_update_read_done(struct data_update *); - -int bch2_extent_drop_ptrs(struct 
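bch2_data_update_init() above unwinds with the standard kernel goto ladder — each label releases exactly the resources acquired before the failure point, in reverse order (nocow locks, then dev refs, then the disk reservation, then the bkey buf). The skeleton of that idiom, with trivial invented stubs so the unwind path is exercised:

static int take_a(void) { return 0; }
static int take_b(void) { return 0; }
static int take_c(void) { return -1; }	/* force the unwind for illustration */
static void put_a(void) {}
static void put_b(void) {}

static int setup(void)
{
	int ret;

	if ((ret = take_a()))
		return ret;
	if ((ret = take_b()))
		goto out_put_a;
	if ((ret = take_c()))
		goto out_put_b;
	return 0;

out_put_b:
	put_b();		/* release in reverse acquisition order */
out_put_a:
	put_a();
	return ret;
}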
btree_trans *, - struct btree_iter *, - struct bkey_s_c, - struct bch_io_opts *, - struct data_update_opts *); - -int bch2_data_update_bios_init(struct data_update *, struct bch_fs *, - struct bch_io_opts *); - -void bch2_data_update_exit(struct data_update *); -int bch2_data_update_init(struct btree_trans *, struct btree_iter *, - struct moving_context *, - struct data_update *, - struct write_point_specifier, - struct bch_io_opts *, struct data_update_opts, - enum btree_id, struct bkey_s_c); -void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); - -#endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c deleted file mode 100644 index 07c2a0f73cc2..000000000000 --- a/fs/bcachefs/debug.c +++ /dev/null @@ -1,996 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Assorted bcachefs debug code - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. - */ - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "async_objs.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "buckets.h" -#include "data_update.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "fsck.h" -#include "inode.h" -#include "journal_reclaim.h" -#include "super.h" - -#include <linux/console.h> -#include <linux/debugfs.h> -#include <linux/module.h> -#include <linux/random.h> -#include <linux/seq_file.h> - -static struct dentry *bch_debug; - -static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, - struct extent_ptr_decoded pick) -{ - struct btree *v = c->verify_data; - struct btree_node *n_ondisk = c->verify_ondisk; - struct btree_node *n_sorted = c->verify_data->data; - struct bset *sorted, *inmemory = &b->data->keys; - struct bio *bio; - bool failed = false; - - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, - BCH_DEV_READ_REF_btree_verify_replicas); - if (!ca) - return false; - - bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_sorted, btree_buf_bytes(b)), - REQ_OP_READ|REQ_META, - GFP_NOFS, - &c->btree_bio); - bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_sorted, btree_buf_bytes(b)); - - submit_bio_wait(bio); - - bio_put(bio); - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_btree_verify_replicas); - - memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); - - v->written = 0; - if (bch2_btree_node_read_done(c, ca, v, NULL, NULL)) - return false; - - n_sorted = c->verify_data->data; - sorted = &n_sorted->keys; - - if (inmemory->u64s != sorted->u64s || - memcmp(inmemory->start, - sorted->start, - vstruct_end(inmemory) - (void *) inmemory->start)) { - unsigned offset = 0, sectors; - struct bset *i; - unsigned j; - - console_lock(); - - printk(KERN_ERR "*** in memory:\n"); - bch2_dump_bset(c, b, inmemory, 0); - - printk(KERN_ERR "*** read back in:\n"); - bch2_dump_bset(c, v, sorted, 0); - - while (offset < v->written) { - if (!offset) { - i = &n_ondisk->keys; - sectors = vstruct_blocks(n_ondisk, c->block_bits) << - c->block_bits; - } else { - struct btree_node_entry *bne = - (void *) n_ondisk + (offset << 9); - i = &bne->keys; - - sectors = vstruct_blocks(bne, c->block_bits) << - c->block_bits; - } - - printk(KERN_ERR "*** on disk block %u:\n", offset); - bch2_dump_bset(c, b, i, offset); - - offset += sectors; - } - - for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) - 
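bch2_btree_verify_replica() above reads a replica back, re-sorts it, memcmp()s it against the in-memory bset, and on mismatch dumps both copies and reports the first differing u64. The comparison step as a self-contained sketch:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* index of the first differing u64, or -1 if the buffers match */
static ptrdiff_t first_diff_u64(const uint64_t *a, const uint64_t *b, size_t n)
{
	if (!memcmp(a, b, n * sizeof(*a)))
		return -1;
	for (size_t j = 0; j < n; j++)
		if (a[j] != b[j])
			return (ptrdiff_t)j;
	return -1;	/* unreachable given the memcmp above */
}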
if (inmemory->_data[j] != sorted->_data[j]) - break; - - console_unlock(); - bch_err(c, "verify failed at key %u", j); - - failed = true; - } - - if (v->written != b->written) { - bch_err(c, "written wrong: expected %u, got %u", - b->written, v->written); - failed = true; - } - - return failed; -} - -void __bch2_btree_verify(struct bch_fs *c, struct btree *b) -{ - struct bkey_ptrs_c ptrs; - struct extent_ptr_decoded p; - const union bch_extent_entry *entry; - struct btree *v; - struct bset *inmemory = &b->data->keys; - struct bkey_packed *k; - bool failed = false; - - if (c->opts.nochanges) - return; - - bch2_btree_node_io_lock(b); - mutex_lock(&c->verify_lock); - - if (!c->verify_ondisk) { - c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); - if (!c->verify_ondisk) - goto out; - } - - if (!c->verify_data) { - c->verify_data = __bch2_btree_node_mem_alloc(c); - if (!c->verify_data) - goto out; - } - - BUG_ON(b->nsets != 1); - - for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k)) - if (k->type == KEY_TYPE_btree_ptr_v2) - ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0; - - v = c->verify_data; - bkey_copy(&v->key, &b->key); - v->c.level = b->c.level; - v->c.btree_id = b->c.btree_id; - bch2_btree_keys_init(v); - - ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); - bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry) - failed |= bch2_btree_verify_replica(c, b, p); - - if (failed) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf); - printbuf_exit(&buf); - } -out: - mutex_unlock(&c->verify_lock); - bch2_btree_node_io_unlock(b); -} - -void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, - const struct btree *b) -{ - struct btree_node *n_ondisk = NULL; - struct extent_ptr_decoded pick; - struct bch_dev *ca; - struct bio *bio = NULL; - unsigned offset = 0; - int ret; - - if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) { - prt_printf(out, "error getting device to read from: invalid device\n"); - return; - } - - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, - BCH_DEV_READ_REF_btree_node_ondisk_to_text); - if (!ca) { - prt_printf(out, "error getting device to read from: not online\n"); - return; - } - - n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); - if (!n_ondisk) { - prt_printf(out, "memory allocation failure\n"); - goto out; - } - - bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_ondisk, btree_buf_bytes(b)), - REQ_OP_READ|REQ_META, - GFP_NOFS, - &c->btree_bio); - bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b)); - - ret = submit_bio_wait(bio); - if (ret) { - prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret)); - goto out; - } - - while (offset < btree_sectors(c)) { - struct bset *i; - struct nonce nonce; - struct bch_csum csum; - struct bkey_packed *k; - unsigned sectors; - - if (!offset) { - i = &n_ondisk->keys; - - if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { - prt_printf(out, "unknown checksum type at offset %u: %llu\n", - offset, BSET_CSUM_TYPE(i)); - goto out; - } - - nonce = btree_nonce(i, offset << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk); - - if (bch2_crc_cmp(csum, n_ondisk->csum)) { - prt_printf(out, "invalid checksum\n"); - goto out; - } - - bset_encrypt(c, i, offset << 9); - - sectors = vstruct_sectors(n_ondisk, c->block_bits); - } else { - struct 
btree_node_entry *bne = (void *) n_ondisk + (offset << 9); - - i = &bne->keys; - - if (i->seq != n_ondisk->keys.seq) - break; - - if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { - prt_printf(out, "unknown checksum type at offset %u: %llu\n", - offset, BSET_CSUM_TYPE(i)); - goto out; - } - - nonce = btree_nonce(i, offset << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - - if (bch2_crc_cmp(csum, bne->csum)) { - prt_printf(out, "invalid checksum"); - goto out; - } - - bset_encrypt(c, i, offset << 9); - - sectors = vstruct_sectors(bne, c->block_bits); - } - - prt_printf(out, " offset %u version %u, journal seq %llu\n", - offset, - le16_to_cpu(i->version), - le64_to_cpu(i->journal_seq)); - offset += sectors; - - printbuf_indent_add(out, 4); - - for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) { - struct bkey u; - - bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u)); - prt_newline(out); - } - - printbuf_indent_sub(out, 4); - } -out: - if (bio) - bio_put(bio); - kvfree(n_ondisk); - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_btree_node_ondisk_to_text); -} - -#ifdef CONFIG_DEBUG_FS - -ssize_t bch2_debugfs_flush_buf(struct dump_iter *i) -{ - if (i->buf.pos) { - size_t bytes = min_t(size_t, i->buf.pos, i->size); - int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes); - - i->ret += copied; - i->ubuf += copied; - i->size -= copied; - i->buf.pos -= copied; - memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); - - if (i->buf.last_newline >= copied) - i->buf.last_newline -= copied; - if (i->buf.last_field >= copied) - i->buf.last_field -= copied; - - if (copied != bytes) - return -EFAULT; - } - - return i->size ? 0 : i->ret; -} - -static int bch2_dump_open(struct inode *inode, struct file *file) -{ - struct btree_debug *bd = inode->i_private; - struct dump_iter *i; - - i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) - return -ENOMEM; - - file->private_data = i; - i->from = POS_MIN; - i->iter = 0; - i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); - i->id = bd->id; - i->buf = PRINTBUF; - - return 0; -} - -int bch2_dump_release(struct inode *inode, struct file *file) -{ - struct dump_iter *i = file->private_data; - - printbuf_exit(&i->buf); - kfree(i); - return 0; -} - -static ssize_t bch2_read_btree(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - return bch2_debugfs_flush_buf(i) ?: - bch2_trans_run(i->c, - for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - bch2_bkey_val_to_text(&i->buf, i->c, k); - prt_newline(&i->buf); - bch2_trans_unlock(trans); - i->from = bpos_successor(iter.pos); - bch2_debugfs_flush_buf(i); - }))) ?: - i->ret; -} - -static const struct file_operations btree_debug_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_read_btree, -}; - -static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - ssize_t ret = bch2_debugfs_flush_buf(i); - if (ret) - return ret; - - if (bpos_eq(SPOS_MAX, i->from)) - return i->ret; - - return bch2_trans_run(i->c, - for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({ - bch2_btree_node_to_text(&i->buf, i->c, b); - i->from = !bpos_eq(SPOS_MAX, b->key.k.p) - ? 
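bch2_btree_node_ondisk_to_text() above walks a node's on-disk bsets as variable-length records: each entry declares its own size in sectors, is validated (checksum, sequence) before its contents are trusted, and `offset += sectors` advances to the next. The record walk reduced to a sketch — the header struct is a toy, and the csum_vstruct() verification is elided to a size check:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct rec_hdr { uint32_t sectors; };

/* the real code also verifies a checksum before trusting the entry */
static int record_valid(const struct rec_hdr *h, uint32_t remaining)
{
	return h->sectors && h->sectors <= remaining;
}

static void walk_records(const uint8_t *buf, uint32_t total_sectors)
{
	uint32_t offset = 0;

	while (offset < total_sectors) {
		const struct rec_hdr *h =
			(const struct rec_hdr *)(buf + ((size_t)offset << 9));

		if (!record_valid(h, total_sectors - offset)) {
			printf("bad record at sector %u\n", offset);
			break;
		}
		printf("record at %u: %u sectors\n", offset, h->sectors);
		offset += h->sectors;	/* each record declares its own length */
	}
}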
bpos_successor(b->key.k.p) - : b->key.k.p; - - drop_locks_do(trans, bch2_debugfs_flush_buf(i)); - }))) ?: i->ret; -} - -static const struct file_operations btree_format_debug_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_read_btree_formats, -}; - -static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - return bch2_debugfs_flush_buf(i) ?: - bch2_trans_run(i->c, - for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - struct btree_path_level *l = - &btree_iter_path(trans, &iter)->l[0]; - struct bkey_packed *_k = - bch2_btree_node_iter_peek(&l->iter, l->b); - - if (bpos_gt(l->b->key.k.p, i->prev_node)) { - bch2_btree_node_to_text(&i->buf, i->c, l->b); - i->prev_node = l->b->key.k.p; - } - - bch2_bfloat_to_text(&i->buf, l->b, _k); - bch2_trans_unlock(trans); - i->from = bpos_successor(iter.pos); - bch2_debugfs_flush_buf(i); - }))) ?: - i->ret; -} - -static const struct file_operations bfloat_failed_debug_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_read_bfloat_failed, -}; - -static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, - struct btree *b) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - prt_printf(out, "%px ", b); - bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); - prt_printf(out, "\n"); - - printbuf_indent_add(out, 2); - - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); - prt_newline(out); - - prt_printf(out, "flags:\t"); - prt_bitflags(out, bch2_btree_node_flags, b->flags); - prt_newline(out); - - prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL); - prt_printf(out, "written:\t%u\n", b->written); - prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked)); - prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable); - - prt_printf(out, "journal pin %px:\t%llu\n", - &b->writes[0].journal, b->writes[0].journal.seq); - prt_printf(out, "journal pin %px:\t%llu\n", - &b->writes[1].journal, b->writes[1].journal.seq); - - prt_printf(out, "ob:\t%u\n", b->ob.nr); - - printbuf_indent_sub(out, 2); -} - -static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - bool done = false; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - do { - ret = bch2_debugfs_flush_buf(i); - if (ret) - return ret; - - i->buf.atomic++; - scoped_guard(rcu) { - struct bucket_table *tbl = - rht_dereference_rcu(c->btree_cache.table.tbl, - &c->btree_cache.table); - if (i->iter < tbl->size) { - struct rhash_head *pos; - struct btree *b; - - rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) - bch2_cached_btree_node_to_text(&i->buf, c, b); - i->iter++; - } else { - done = true; - } - } - --i->buf.atomic; - } while (!done); - - if (i->buf.allocation_failure) - ret = -ENOMEM; - - if (!ret) - ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; -} - -static const struct file_operations cached_btree_nodes_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_cached_btree_nodes_read, -}; - -typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r); - -static void 
list_sort(struct list_head *head, list_cmp_fn cmp) -{ - struct list_head *pos; - - list_for_each(pos, head) - while (!list_is_last(pos, head) && - cmp(pos, pos->next) > 0) { - struct list_head *pos2, *next = pos->next; - - list_del(next); - list_for_each(pos2, head) - if (cmp(next, pos2) < 0) - goto pos_found; - BUG(); -pos_found: - list_add_tail(next, pos2); - } -} - -static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r) -{ - return cmp_int(l, r); -} - -static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - struct btree_trans *trans; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); -restart: - seqmutex_lock(&c->btree_trans_lock); - list_sort(&c->btree_trans_list, list_ptr_order_cmp); - - list_for_each_entry(trans, &c->btree_trans_list, list) { - if ((ulong) trans <= i->iter) - continue; - - i->iter = (ulong) trans; - - if (!closure_get_not_zero(&trans->ref)) - continue; - - if (!trans->srcu_held) { - closure_put(&trans->ref); - continue; - } - - u32 seq = seqmutex_unlock(&c->btree_trans_lock); - - bch2_btree_trans_to_text(&i->buf, trans); - - prt_printf(&i->buf, "backtrace:\n"); - printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL); - printbuf_indent_sub(&i->buf, 2); - prt_newline(&i->buf); - - closure_put(&trans->ref); - - ret = bch2_debugfs_flush_buf(i); - if (ret) - goto unlocked; - - if (!seqmutex_relock(&c->btree_trans_lock, seq)) - goto restart; - } - seqmutex_unlock(&c->btree_trans_lock); -unlocked: - srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - - if (i->buf.allocation_failure) - ret = -ENOMEM; - - if (!ret) - ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; -} - -static const struct file_operations btree_transactions_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_btree_transactions_read, -}; - -static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - bool done = false; - int err; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - while (1) { - err = bch2_debugfs_flush_buf(i); - if (err) - return err; - - if (!i->size) - break; - - if (done) - break; - - done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); - i->iter++; - } - - if (i->buf.allocation_failure) - return -ENOMEM; - - return i->ret; -} - -static const struct file_operations journal_pins_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_journal_pins_read, -}; - -static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - int err; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - if (!i->iter) { - bch2_btree_updates_to_text(&i->buf, c); - i->iter++; - } - - err = bch2_debugfs_flush_buf(i); - if (err) - return err; - - if (i->buf.allocation_failure) - return -ENOMEM; - - return i->ret; -} - -static const struct file_operations btree_updates_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_btree_updates_read, -}; - -static int btree_transaction_stats_open(struct inode *inode, 
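The private list_sort() above is an in-place insertion sort over a list_head ring: any element that compares below its predecessor is unlinked and re-inserted at the first position where it fits. The same algorithm on a minimal singly linked list:

#include <stddef.h>

struct node { int val; struct node *next; };

static struct node *insertion_sort(struct node *head)
{
	struct node *sorted = NULL;

	while (head) {
		struct node *n = head, **p = &sorted;
		head = head->next;

		while (*p && (*p)->val <= n->val)	/* find insertion point */
			p = &(*p)->next;
		n->next = *p;
		*p = n;
	}
	return sorted;
}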
struct file *file) -{ - struct bch_fs *c = inode->i_private; - struct dump_iter *i; - - i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) - return -ENOMEM; - - i->iter = 1; - i->c = c; - i->buf = PRINTBUF; - file->private_data = i; - - return 0; -} - -static int btree_transaction_stats_release(struct inode *inode, struct file *file) -{ - struct dump_iter *i = file->private_data; - - printbuf_exit(&i->buf); - kfree(i); - - return 0; -} - -static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - int err; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - while (1) { - struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; - - err = bch2_debugfs_flush_buf(i); - if (err) - return err; - - if (!i->size) - break; - - if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) || - !bch2_btree_transaction_fns[i->iter]) - break; - - prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]); - printbuf_indent_add(&i->buf, 2); - - mutex_lock(&s->lock); - - prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - printbuf_indent_add(&i->buf, 2); - bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); - printbuf_indent_sub(&i->buf, 2); -#endif - - prt_printf(&i->buf, "Transaction duration:\n"); - - printbuf_indent_add(&i->buf, 2); - bch2_time_stats_to_text(&i->buf, &s->duration); - printbuf_indent_sub(&i->buf, 2); - - if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { - prt_printf(&i->buf, "Lock hold times:\n"); - - printbuf_indent_add(&i->buf, 2); - bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); - printbuf_indent_sub(&i->buf, 2); - } - - if (s->max_paths_text) { - prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths); - - printbuf_indent_add(&i->buf, 2); - prt_str_indented(&i->buf, s->max_paths_text); - printbuf_indent_sub(&i->buf, 2); - } - - mutex_unlock(&s->lock); - - printbuf_indent_sub(&i->buf, 2); - prt_newline(&i->buf); - i->iter++; - } - - if (i->buf.allocation_failure) - return -ENOMEM; - - return i->ret; -} - -static const struct file_operations btree_transaction_stats_op = { - .owner = THIS_MODULE, - .open = btree_transaction_stats_open, - .release = btree_transaction_stats_release, - .read = btree_transaction_stats_read, -}; - -/* walk btree transactions until we find a deadlock and print it */ -static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct btree_trans *trans; - ulong iter = 0; -restart: - seqmutex_lock(&c->btree_trans_lock); - list_sort(&c->btree_trans_list, list_ptr_order_cmp); - - list_for_each_entry(trans, &c->btree_trans_list, list) { - if ((ulong) trans <= iter) - continue; - - iter = (ulong) trans; - - if (!closure_get_not_zero(&trans->ref)) - continue; - - u32 seq = seqmutex_unlock(&c->btree_trans_lock); - - bool found = bch2_check_for_deadlock(trans, out) != 0; - - closure_put(&trans->ref); - - if (found) - return; - - if (!seqmutex_relock(&c->btree_trans_lock, seq)) - goto restart; - } - seqmutex_unlock(&c->btree_trans_lock); -} - -typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *); - -static ssize_t bch2_simple_print(struct file *file, char __user *buf, - size_t size, loff_t *ppos, - fs_to_text_fn fn) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - if (!i->iter) { - 
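The transaction-list readers above cannot hold btree_trans_lock while formatting, so they keep a cursor — the pointer value of the last element visited (which is why the list is first sorted by address) — drop the lock for the slow work, and on return either revalidate the sequence (seqmutex_relock) or restart from the cursor. A sketch of that discipline with a plain mutex and a generation counter; all names are invented, and note the real code additionally pins each element with closure_get before unlocking:

#include <pthread.h>
#include <stdint.h>

struct elem { struct elem *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long gen;		/* bumped on every list modification */
static struct elem *list_head;		/* assumed kept sorted by address */

static void walk_all(void (*slow)(struct elem *))
{
	uintptr_t cursor = 0;
restart:
	pthread_mutex_lock(&lock);
	for (struct elem *e = list_head; e; e = e->next) {
		if ((uintptr_t)e <= cursor)
			continue;	/* visited before an earlier restart */
		cursor = (uintptr_t)e;

		unsigned long seq = gen;
		pthread_mutex_unlock(&lock);
		slow(e);		/* expensive work without the lock */
		pthread_mutex_lock(&lock);
		if (gen != seq) {	/* list changed; iterator is stale */
			pthread_mutex_unlock(&lock);
			goto restart;
		}
	}
	pthread_mutex_unlock(&lock);
}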
fn(&i->buf, c); - i->iter++; - } - - if (i->buf.allocation_failure) - ret = -ENOMEM; - - if (!ret) - ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; -} - -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text); -} - -static const struct file_operations btree_deadlock_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_btree_deadlock_read, -}; - -static ssize_t bch2_write_points_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text); -} - -static const struct file_operations write_points_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_write_points_read, -}; - -void bch2_fs_debug_exit(struct bch_fs *c) -{ - if (!IS_ERR_OR_NULL(c->fs_debug_dir)) - debugfs_remove_recursive(c->fs_debug_dir); -} - -static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd) -{ - struct dentry *d; - - d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir); - - debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops); - - debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops); - - debugfs_create_file("bfloat-failed", 0400, d, bd, - &bfloat_failed_debug_ops); -} - -void bch2_fs_debug_init(struct bch_fs *c) -{ - struct btree_debug *bd; - char name[100]; - - if (IS_ERR_OR_NULL(bch_debug)) - return; - - if (c->sb.multi_device) - snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); - else - strscpy(name, c->name, sizeof(name)); - - c->fs_debug_dir = debugfs_create_dir(name, bch_debug); - if (IS_ERR_OR_NULL(c->fs_debug_dir)) - return; - - debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, - c->btree_debug, &cached_btree_nodes_ops); - - debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, - c->btree_debug, &btree_transactions_ops); - - debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, - c->btree_debug, &journal_pins_ops); - - debugfs_create_file("btree_updates", 0400, c->fs_debug_dir, - c->btree_debug, &btree_updates_ops); - - debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, - c, &btree_transaction_stats_op); - - debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, - c->btree_debug, &btree_deadlock_ops); - - debugfs_create_file("write_points", 0400, c->fs_debug_dir, - c->btree_debug, &write_points_ops); - - bch2_fs_async_obj_debugfs_init(c); - - c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); - if (IS_ERR_OR_NULL(c->btree_debug_dir)) - return; - - for (bd = c->btree_debug; - bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); - bd++) { - bd->id = bd - c->btree_debug; - bch2_fs_debug_btree_init(c, bd); - } -} - -#endif - -void bch2_debug_exit(void) -{ - if (!IS_ERR_OR_NULL(bch_debug)) - debugfs_remove_recursive(bch_debug); -} - -int __init bch2_debug_init(void) -{ - bch_debug = debugfs_create_dir("bcachefs", NULL); - return 0; -} diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h deleted file mode 100644 index d88b1194b8ac..000000000000 --- a/fs/bcachefs/debug.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DEBUG_H -#define _BCACHEFS_DEBUG_H - -#include "bcachefs.h" - -struct bio; -struct btree; -struct bch_fs; - -void __bch2_btree_verify(struct bch_fs *, struct btree *); -void 
bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, - const struct btree *); - -static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) -{ - if (static_branch_unlikely(&bch2_verify_btree_ondisk)) - __bch2_btree_verify(c, b); -} - -#ifdef CONFIG_DEBUG_FS -struct dump_iter { - struct bch_fs *c; - struct async_obj_list *list; - enum btree_id id; - struct bpos from; - struct bpos prev_node; - u64 iter; - - struct printbuf buf; - - char __user *ubuf; /* destination user buffer */ - size_t size; /* size of requested read */ - ssize_t ret; /* bytes read so far */ -}; - -ssize_t bch2_debugfs_flush_buf(struct dump_iter *); -int bch2_dump_release(struct inode *, struct file *); - -void bch2_fs_debug_exit(struct bch_fs *); -void bch2_fs_debug_init(struct bch_fs *); -#else -static inline void bch2_fs_debug_exit(struct bch_fs *c) {} -static inline void bch2_fs_debug_init(struct bch_fs *c) {} -#endif - -void bch2_debug_exit(void); -int bch2_debug_init(void); - -#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c deleted file mode 100644 index 28875c5c86ad..000000000000 --- a/fs/bcachefs/dirent.c +++ /dev/null @@ -1,766 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_buf.h" -#include "bkey_methods.h" -#include "btree_update.h" -#include "extents.h" -#include "dirent.h" -#include "fs.h" -#include "keylist.h" -#include "str_hash.h" -#include "subvolume.h" - -#include <linux/dcache.h> - -#ifdef CONFIG_UNICODE -int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, - const struct qstr *str, struct qstr *out_cf) -{ - *out_cf = (struct qstr) QSTR_INIT(NULL, 0); - - if (!bch2_fs_casefold_enabled(trans->c)) - return -EOPNOTSUPP; - - unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1); - int ret = PTR_ERR_OR_ZERO(buf); - if (ret) - return ret; - - ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1); - if (ret <= 0) - return ret; - - *out_cf = (struct qstr) QSTR_INIT(buf, ret); - return 0; -} -#endif - -static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) -{ - if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name)) - return 0; - - unsigned bkey_u64s = bkey_val_u64s(d.k); - unsigned bkey_bytes = bkey_u64s * sizeof(u64); - u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1]; -#if CPU_BIG_ENDIAN - unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8; -#else - unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8; -#endif - - return bkey_bytes - - (d.v->d_casefold - ? offsetof(struct bch_dirent, d_cf_name_block.d_names) - : offsetof(struct bch_dirent, d_name)) - - trailing_nuls; -} - -struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) -{ - if (d.v->d_casefold) { - unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); - return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len); - } else { - return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); - } -} - -static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d) -{ - if (d.v->d_casefold) { - unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); - unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len); - return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len); - } else { - return (struct qstr) QSTR_INIT(NULL, 0); - } -} - -static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d) -{ - return d.v->d_casefold - ? 
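bch2_dirent_name_bytes() above recovers the exact name length from a u64-padded value: load the last u64 and count its trailing NUL bytes, which on a little-endian host means counting *leading* zero bits of the loaded value (the padding occupies the high bytes) and on big-endian trailing ones — hence the CPU_BIG_ENDIAN switch between ctzll and clzll. A host-little-endian sketch:

#include <stdint.h>

/* NUL padding bytes at the end of a little-endian-loaded last word */
static unsigned trailing_nuls_le(uint64_t last)
{
	return last ? (unsigned)__builtin_clzll(last) / 8 : 8;
}

/* usable name bytes in a value of @nr_u64s words ending in @last */
static unsigned name_bytes(unsigned nr_u64s, uint64_t last)
{
	return nr_u64s * 8 - trailing_nuls_le(last);
}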
bch2_dirent_get_casefold_name(d) - : bch2_dirent_get_name(d); -} - -static u64 bch2_dirent_hash(const struct bch_hash_info *info, - const struct qstr *name) -{ - struct bch_str_hash_ctx ctx; - - bch2_str_hash_init(&ctx, info); - bch2_str_hash_update(&ctx, info, name->name, name->len); - - /* [0,2) reserved for dots */ - return max_t(u64, bch2_str_hash_end(&ctx, info), 2); -} - -static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) -{ - return bch2_dirent_hash(info, key); -} - -static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -{ - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr name = bch2_dirent_get_lookup_name(d); - - return bch2_dirent_hash(info, &name); -} - -static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) -{ - struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - const struct qstr l_name = bch2_dirent_get_lookup_name(l); - const struct qstr *r_name = _r; - - return !qstr_eq(l_name, *r_name); -} - -static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -{ - struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); - const struct qstr l_name = bch2_dirent_get_lookup_name(l); - const struct qstr r_name = bch2_dirent_get_lookup_name(r); - - return !qstr_eq(l_name, r_name); -} - -static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) -{ - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - - if (d.v->d_type == DT_SUBVOL) - return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol; - return true; -} - -const struct bch_hash_desc bch2_dirent_hash_desc = { - .btree_id = BTREE_ID_dirents, - .key_type = KEY_TYPE_dirent, - .hash_key = dirent_hash_key, - .hash_bkey = dirent_hash_bkey, - .cmp_key = dirent_cmp_key, - .cmp_bkey = dirent_cmp_bkey, - .is_visible = dirent_is_visible, -}; - -int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - unsigned name_block_len = bch2_dirent_name_bytes(d); - struct qstr d_name = bch2_dirent_get_name(d); - struct qstr d_cf_name = bch2_dirent_get_casefold_name(d); - int ret = 0; - - bkey_fsck_err_on(!d_name.len, - c, dirent_empty_name, - "empty name"); - - bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len, - c, dirent_val_too_big, - "dirent names exceed bkey size (%d + %d > %d)", - d_name.len, d_cf_name.len, name_block_len); - - /* - * Check new keys don't exceed the max length - * (older keys may be larger.) 
- */ - bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, - c, dirent_name_too_long, - "dirent name too big (%u > %u)", - d_name.len, BCH_NAME_MAX); - - bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), - c, dirent_name_embedded_nul, - "dirent has stray data after name's NUL"); - - bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) || - (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), - c, dirent_name_dot_or_dotdot, - "invalid name"); - - bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), - c, dirent_name_has_slash, - "name with /"); - - bkey_fsck_err_on(d.v->d_type != DT_SUBVOL && - le64_to_cpu(d.v->d_inum) == d.k->p.inode, - c, dirent_to_itself, - "dirent points to own directory"); - - if (d.v->d_casefold) { - bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit && - d_cf_name.len > BCH_NAME_MAX, - c, dirent_cf_name_too_big, - "dirent w/ cf name too big (%u > %u)", - d_cf_name.len, BCH_NAME_MAX); - - bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len), - c, dirent_stray_data_after_cf_name, - "dirent has stray data after cf name's NUL"); - } -fsck_err: - return ret; -} - -void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr d_name = bch2_dirent_get_name(d); - - prt_printf(out, "%.*s", d_name.len, d_name.name); - - if (d.v->d_casefold) { - struct qstr d_name = bch2_dirent_get_lookup_name(d); - prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name); - } - - prt_str(out, " ->"); - - if (d.v->d_type != DT_SUBVOL) - prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum)); - else - prt_printf(out, " %u -> %u", - le32_to_cpu(d.v->d_parent_subvol), - le32_to_cpu(d.v->d_child_subvol)); - - prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); -} - -int bch2_dirent_init_name(struct bch_fs *c, - struct bkey_i_dirent *dirent, - const struct bch_hash_info *hash_info, - const struct qstr *name, - const struct qstr *cf_name) -{ - EBUG_ON(hash_info->cf_encoding == NULL && cf_name); - int cf_len = 0; - - if (name->len > BCH_NAME_MAX) - return -ENAMETOOLONG; - - dirent->v.d_casefold = hash_info->cf_encoding != NULL; - - if (!dirent->v.d_casefold) { - memcpy(&dirent->v.d_name[0], name->name, name->len); - memset(&dirent->v.d_name[name->len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_name) - - name->len); - } else { - if (!bch2_fs_casefold_enabled(c)) - return -EOPNOTSUPP; - -#ifdef CONFIG_UNICODE - memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); - - char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len]; - - if (cf_name) { - cf_len = cf_name->len; - - memcpy(cf_out, cf_name->name, cf_name->len); - } else { - cf_len = utf8_casefold(hash_info->cf_encoding, name, - cf_out, - bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out); - if (cf_len <= 0) - return cf_len; - } - - memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_cf_name_block.d_names) - - name->len + cf_len); - - dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); - dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len); - - EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len); -#endif - } - - unsigned u64s = dirent_val_u64s(name->len, cf_len); - BUG_ON(u64s > bkey_val_u64s(&dirent->k)); - set_bkey_val_u64s(&dirent->k, u64s); - return 0; -} - -struct bkey_i_dirent 
*bch2_dirent_create_key(struct btree_trans *trans, - const struct bch_hash_info *hash_info, - subvol_inum dir, - u8 type, - const struct qstr *name, - const struct qstr *cf_name, - u64 dst) -{ - struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); - if (IS_ERR(dirent)) - return dirent; - - bkey_dirent_init(&dirent->k_i); - dirent->k.u64s = BKEY_U64s_MAX; - - if (type != DT_SUBVOL) { - dirent->v.d_inum = cpu_to_le64(dst); - } else { - dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); - dirent->v.d_child_subvol = cpu_to_le32(dst); - } - - dirent->v.d_type = type; - dirent->v.d_unused = 0; - - int ret = bch2_dirent_init_name(trans->c, dirent, hash_info, name, cf_name); - if (ret) - return ERR_PTR(ret); - - EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); - return dirent; -} - -int bch2_dirent_create_snapshot(struct btree_trans *trans, - u32 dir_subvol, u64 dir, u32 snapshot, - const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - u64 *dir_offset, - enum btree_iter_update_trigger_flags flags) -{ - subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir }; - struct bkey_i_dirent *dirent; - int ret; - - dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); - ret = PTR_ERR_OR_ZERO(dirent); - if (ret) - return ret; - - dirent->k.p.inode = dir; - dirent->k.p.snapshot = snapshot; - - ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, snapshot, &dirent->k_i, flags); - *dir_offset = dirent->k.p.offset; - - return ret; -} - -int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, - const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - u64 *dir_offset, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_i_dirent *dirent; - int ret; - - dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); - ret = PTR_ERR_OR_ZERO(dirent); - if (ret) - return ret; - - ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir, &dirent->k_i, flags); - *dir_offset = dirent->k.p.offset; - - return ret; -} - -int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, - struct bkey_s_c_dirent d, subvol_inum *target) -{ - struct bch_subvolume s; - int ret = 0; - - if (d.v->d_type == DT_SUBVOL && - le32_to_cpu(d.v->d_parent_subvol) != dir.subvol) - return 1; - - if (likely(d.v->d_type != DT_SUBVOL)) { - target->subvol = dir.subvol; - target->inum = le64_to_cpu(d.v->d_inum); - } else { - target->subvol = le32_to_cpu(d.v->d_child_subvol); - - ret = bch2_subvolume_get(trans, target->subvol, true, &s); - - target->inum = le64_to_cpu(s.inode); - } - - return ret; -} - -int bch2_dirent_rename(struct btree_trans *trans, - subvol_inum src_dir, struct bch_hash_info *src_hash, - subvol_inum dst_dir, struct bch_hash_info *dst_hash, - const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, - const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, - enum bch_rename_mode mode) -{ - struct qstr src_name_lookup, dst_name_lookup; - struct btree_iter src_iter = {}; - struct btree_iter dst_iter = {}; - struct bkey_s_c old_src, old_dst = bkey_s_c_null; - struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; - struct bpos dst_pos = - POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); - unsigned src_update_flags = 0; - bool delete_src, delete_dst; - int ret = 0; - - memset(src_inum, 0, sizeof(*src_inum)); - memset(dst_inum, 0, 
sizeof(*dst_inum)); - - /* Lookup src: */ - ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup); - if (ret) - goto out; - old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, - src_hash, src_dir, &src_name_lookup, - BTREE_ITER_intent); - ret = bkey_err(old_src); - if (ret) - goto out; - - ret = bch2_dirent_read_target(trans, src_dir, - bkey_s_c_to_dirent(old_src), src_inum); - if (ret) - goto out; - - /* Lookup dst: */ - ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup); - if (ret) - goto out; - if (mode == BCH_RENAME) { - /* - * Note that we're _not_ checking if the target already exists - - * we're relying on the VFS to do that check for us for - * correctness: - */ - ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, &dst_name_lookup); - if (ret) - goto out; - } else { - old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, &dst_name_lookup, - BTREE_ITER_intent); - ret = bkey_err(old_dst); - if (ret) - goto out; - - ret = bch2_dirent_read_target(trans, dst_dir, - bkey_s_c_to_dirent(old_dst), dst_inum); - if (ret) - goto out; - } - - if (mode != BCH_RENAME_EXCHANGE) - *src_offset = dst_iter.pos.offset; - - /* Create new dst key: */ - new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, - dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); - ret = PTR_ERR_OR_ZERO(new_dst); - if (ret) - goto out; - - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - new_dst->k.p = dst_iter.pos; - - /* Create new src key: */ - if (mode == BCH_RENAME_EXCHANGE) { - new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name, - src_hash->cf_encoding ? &src_name_lookup : NULL, 0); - ret = PTR_ERR_OR_ZERO(new_src); - if (ret) - goto out; - - dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); - new_src->k.p = src_iter.pos; - } else { - new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); - ret = PTR_ERR_OR_ZERO(new_src); - if (ret) - goto out; - - bkey_init(&new_src->k); - new_src->k.p = src_iter.pos; - - if (bkey_le(dst_pos, src_iter.pos) && - bkey_lt(src_iter.pos, dst_iter.pos)) { - /* - * We have a hash collision for the new dst key, - * and new_src - the key we're deleting - is between - * new_dst's hashed slot and the slot we're going to be - * inserting it into - oops. 
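The hazard handled in the branch below is a general property of linearly probed tables, and is easier to see in miniature. A toy open-addressed table — an illustration, not kernel code — shows why a slot cannot simply be emptied on delete, which is exactly what KEY_TYPE_hash_whiteout exists to avoid:

	#include <stdint.h>
	#include <stdbool.h>

	enum slot_state { SLOT_EMPTY, SLOT_WHITEOUT, SLOT_USED };

	struct slot {
		enum slot_state	state;
		uint64_t	key;
	};

	#define TABLE_SIZE 8

	static bool toy_lookup(const struct slot *t, uint64_t key)
	{
		for (unsigned i = 0; i < TABLE_SIZE; i++) {
			const struct slot *s = &t[(key + i) % TABLE_SIZE];

			if (s->state == SLOT_EMPTY)
				return false;	/* an empty slot ends the probe chain */
			if (s->state == SLOT_USED && s->key == key)
				return true;
			/* SLOT_WHITEOUT: keep probing; the chain stays intact
			 * so entries that collided past it remain reachable */
		}
		return false;
	}

Marking a deleted slot SLOT_EMPTY instead of SLOT_WHITEOUT would cut the chain and hide any entry that had probed past it — the breakage the rename path is working to avoid here.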
This will break the hash - * table if we don't deal with it: - */ - if (mode == BCH_RENAME) { - /* - * If we're not overwriting, we can just insert - * new_dst at the src position: - */ - new_src = new_dst; - new_src->k.p = src_iter.pos; - goto out_set_src; - } else { - /* If we're overwriting, we can't insert new_dst - * at a different slot because it has to - * overwrite old_dst - just make sure to use a - * whiteout when deleting src: - */ - new_src->k.type = KEY_TYPE_hash_whiteout; - } - } else { - /* Check if we need a whiteout to delete src: */ - ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, - src_hash, &src_iter); - if (ret < 0) - goto out; - - if (ret) - new_src->k.type = KEY_TYPE_hash_whiteout; - } - } - - if (new_dst->v.d_type == DT_SUBVOL) - new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol); - - if ((mode == BCH_RENAME_EXCHANGE) && - new_src->v.d_type == DT_SUBVOL) - new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); - - ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); - if (ret) - goto out; -out_set_src: - /* - * If we're deleting a subvolume we need to really delete the dirent, - * not just emit a whiteout in the current snapshot - there can only be a - * single dirent that points to a given subvolume. - * - * IOW, we don't maintain multiple versions in different snapshots of - * dirents that point to subvolumes - dirents that point to subvolumes - * are only visible in one particular subvolume so it's not necessary, - * and it would be particularly confusing for fsck to have to deal with. - */ - delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL && - new_src->k.p.snapshot != old_src.k->p.snapshot; - - delete_dst = old_dst.k && - bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL && - new_dst->k.p.snapshot != old_dst.k->p.snapshot; - - if (!delete_src || !bkey_deleted(&new_src->k)) { - ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); - if (ret) - goto out; - } - - if (delete_src) { - bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot); - ret = bch2_btree_iter_traverse(trans, &src_iter) ?: - bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node); - if (ret) - goto out; - } - - if (delete_dst) { - bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot); - ret = bch2_btree_iter_traverse(trans, &dst_iter) ?: - bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node); - if (ret) - goto out; - } - - if (mode == BCH_RENAME_EXCHANGE) - *src_offset = new_src->k.p.offset; - *dst_offset = new_dst->k.p.offset; -out: - bch2_trans_iter_exit(trans, &src_iter); - bch2_trans_iter_exit(trans, &dst_iter); - return ret; -} - -int bch2_dirent_lookup_trans(struct btree_trans *trans, - struct btree_iter *iter, - subvol_inum dir, - const struct bch_hash_info *hash_info, - const struct qstr *name, subvol_inum *inum, - unsigned flags) -{ - struct qstr lookup_name; - int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name); - if (ret) - return ret; - - struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, &lookup_name, flags); - ret = bkey_err(k); - if (ret) - goto err; - - ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum); - if (ret > 0) - ret = -ENOENT; -err: - if (ret) - bch2_trans_iter_exit(trans, iter); - return ret; -} - -u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, - const struct bch_hash_info *hash_info, - const struct qstr *name, subvol_inum *inum) -{ - struct
btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = {}; - - int ret = lockrestart_do(trans, - bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, - SPOS(dir, 0, snapshot), - POS(dir, U64_MAX), 0, k, ret) - if (k.k->type == KEY_TYPE_dirent) { - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) - continue; - ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty); - break; - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) -{ - u32 snapshot; - - return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?: - bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot); -} - -static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target) -{ - struct qstr name = bch2_dirent_get_name(d); - /* - * Although not required by the kernel code, updating ctx->pos is needed - * for the bcachefs FUSE driver. Without this update, the FUSE - * implementation will be stuck in an infinite loop when reading - * directories (via the bcachefs_fuse_readdir callback). - * In kernel space, ctx->pos is updated by the VFS code. - */ - ctx->pos = d.k->p.offset; - bool ret = dir_emit(ctx, name.name, - name.len, - target.inum, - vfs_d_type(d.v->d_type)); - if (ret) - ctx->pos = d.k->p.offset + 1; - return !ret; -} - -int bch2_readdir(struct bch_fs *c, subvol_inum inum, - struct bch_hash_info *hash_info, - struct dir_context *ctx) -{ - struct bkey_buf sk; - bch2_bkey_buf_init(&sk); - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents, - POS(inum.inum, ctx->pos), - POS(inum.inum, U64_MAX), - inum.subvol, 0, k, ({ - if (k.k->type != KEY_TYPE_dirent) - continue; - - /* dir_emit() can fault and block: */ - bch2_bkey_buf_reassemble(&sk, c, k); - struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); - - subvol_inum target; - - bool need_second_pass = false; - int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc, - hash_info, &iter, k, &need_second_pass) ?: - bch2_dirent_read_target(trans, inum, dirent, &target); - if (ret2 > 0) - continue; - - ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target)); - }))); - - bch2_bkey_buf_exit(&sk, c); - - return ret < 0 ? 
ret : 0; -} - -/* fsck */ - -static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inode_nr) - break; - if (!bkey_is_inode(k.k)) - continue; - ret = bch2_inode_unpack(k, inode); - goto found; - } - ret = bch_err_throw(trans->c, ENOENT_inode); -found: - bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bch_inode_unpacked dir_inode; - struct bch_hash_info dir_hash_info; - int ret; - - ret = lookup_first_inode(trans, pos.inode, &dir_inode); - if (ret) - goto err; - - dir_hash_info = bch2_hash_info_init(c, &dir_inode); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); - - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); -err: - bch_err_fn(c, ret); - return ret; -} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h deleted file mode 100644 index 0417608c18d5..000000000000 --- a/fs/bcachefs/dirent.h +++ /dev/null @@ -1,119 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DIRENT_H -#define _BCACHEFS_DIRENT_H - -#include "str_hash.h" - -extern const struct bch_hash_desc bch2_dirent_hash_desc; - -int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_dirent ((struct bkey_ops) { \ - .key_validate = bch2_dirent_validate, \ - .val_to_text = bch2_dirent_to_text, \ - .min_val_size = 16, \ -}) - -struct qstr; -struct file; -struct dir_context; -struct bch_fs; -struct bch_hash_info; -struct bch_inode_info; - -#ifdef CONFIG_UNICODE -int bch2_casefold(struct btree_trans *, const struct bch_hash_info *, - const struct qstr *, struct qstr *); -#else -static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, - const struct qstr *str, struct qstr *out_cf) -{ - return -EOPNOTSUPP; -} -#endif - -static inline int bch2_maybe_casefold(struct btree_trans *trans, - const struct bch_hash_info *info, - const struct qstr *str, struct qstr *out_cf) -{ - if (likely(!info->cf_encoding)) { - *out_cf = *str; - return 0; - } else { - return bch2_casefold(trans, info, str, out_cf); - } -} - -struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent); - -static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) -{ - unsigned bytes = cf_len - ? 
offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len - : offsetof(struct bch_dirent, d_name) + len; - - return DIV_ROUND_UP(bytes, sizeof(u64)); -} - -int bch2_dirent_read_target(struct btree_trans *, subvol_inum, - struct bkey_s_c_dirent, subvol_inum *); - -static inline void dirent_copy_target(struct bkey_i_dirent *dst, - struct bkey_s_c_dirent src) -{ - dst->v.d_inum = src.v->d_inum; - dst->v.d_type = src.v->d_type; -} - -int bch2_dirent_init_name(struct bch_fs *, - struct bkey_i_dirent *, - const struct bch_hash_info *, - const struct qstr *, - const struct qstr *); -struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *, - const struct bch_hash_info *, subvol_inum, u8, - const struct qstr *, const struct qstr *, u64); - -int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, - const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, - enum btree_iter_update_trigger_flags); -int bch2_dirent_create(struct btree_trans *, subvol_inum, - const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, - enum btree_iter_update_trigger_flags); - -static inline unsigned vfs_d_type(unsigned type) -{ - return type == DT_SUBVOL ? DT_DIR : type; -} - -enum bch_rename_mode { - BCH_RENAME, - BCH_RENAME_OVERWRITE, - BCH_RENAME_EXCHANGE, -}; - -int bch2_dirent_rename(struct btree_trans *, - subvol_inum, struct bch_hash_info *, - subvol_inum, struct bch_hash_info *, - const struct qstr *, subvol_inum *, u64 *, - const struct qstr *, subvol_inum *, u64 *, - enum bch_rename_mode); - -int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, - subvol_inum, const struct bch_hash_info *, - const struct qstr *, subvol_inum *, unsigned); -u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, - const struct bch_hash_info *, - const struct qstr *, subvol_inum *); - -int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); -int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); -int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *); - -int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); - -#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h deleted file mode 100644 index a46dbddd21aa..000000000000 --- a/fs/bcachefs/dirent_format.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DIRENT_FORMAT_H -#define _BCACHEFS_DIRENT_FORMAT_H - -/* - * Dirents (and xattrs) have to implement string lookups; since our b-tree - * doesn't support arbitrary length strings for the key, we instead index by a - * 64 bit hash (currently truncated sha1) of the string, stored in the offset - * field of the key - using linear probing to resolve hash collisions. This also - * provides us with the readdir cookie posix requires. 
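A worked instance of the sizing arithmetic in dirent_val_u64s() may help; the d_name offset is taken as 8 here purely for round numbers (an assumption — the real offset follows from the packed layout of struct bch_dirent shown below):

	#include <stdio.h>

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	int main(void)
	{
		unsigned d_name_offset = 8;	/* assumed, for illustration */
		unsigned name_len = 5;		/* e.g. "hello" */

		/*
		 * 8 + 5 = 13 value bytes, rounded up to 2 u64s (16 bytes);
		 * the 3 trailing pad bytes are NULs, which
		 * bch2_dirent_name_bytes() strips back off with the
		 * clz/ctz trick to recover the name length.
		 */
		printf("u64s = %u\n", DIV_ROUND_UP(d_name_offset + name_len, 8));
		return 0;
	}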
- * - * Linear probing requires us to use whiteouts for deletions, in the event of a - * collision: - */ - -struct bch_dirent { - struct bch_val v; - - /* Target inode number: */ - union { - __le64 d_inum; - struct { /* DT_SUBVOL */ - __le32 d_child_subvol; - __le32 d_parent_subvol; - }; - }; - - /* - * Copy of mode bits 12-15 from the target inode - so userspace can get - * the filetype without having to do a stat() - */ -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 d_type:5, - d_unused:2, - d_casefold:1; -#elif defined(__BIG_ENDIAN_BITFIELD) - __u8 d_casefold:1, - d_unused:2, - d_type:5; -#endif - - union { - struct { - __u8 d_pad; - __le16 d_name_len; - __le16 d_cf_name_len; - __u8 d_names[]; - } d_cf_name_block __packed; - __DECLARE_FLEX_ARRAY(__u8, d_name); - } __packed; -} __packed __aligned(8); - -#define DT_SUBVOL 16 -#define BCH_DT_MAX 17 - -#define BCH_NAME_MAX 512 - -#endif /* _BCACHEFS_DIRENT_FORMAT_H */ diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c deleted file mode 100644 index f7528cd69c73..000000000000 --- a/fs/bcachefs/disk_accounting.c +++ /dev/null @@ -1,1074 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bcachefs_ioctl.h" -#include "btree_cache.h" -#include "btree_journal_iter.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "compress.h" -#include "disk_accounting.h" -#include "error.h" -#include "journal_io.h" -#include "replicas.h" - -/* - * Notes on disk accounting: - * - * We have two parallel sets of counters to be concerned with, and both must be - * kept in sync. - * - * - Persistent/on disk accounting, stored in the accounting btree and updated - * via btree write buffer updates that treat new accounting keys as deltas to - * apply to existing values. But reading from a write buffer btree is - * expensive, so we also have - * - * - In memory accounting, where accounting is stored as an array of percpu - * counters, indexed by an eytzinger array of disk accounting keys/bpos (which - * are the same thing, excepting byte swabbing on big endian). - * - * Cheap to read, but non persistent. - * - * Disk accounting updates are generated by transactional triggers; these run as - * keys enter and leave the btree, and can compare old and new versions of keys; - * the output of these triggers are deltas to the various counters. - * - * Disk accounting updates are done as btree write buffer updates, where the - * counters in the disk accounting key are deltas that will be applied to the - * counter in the btree when the key is flushed by the write buffer (or journal - * replay). - * - * To do a disk accounting update: - * - initialize a disk_accounting_pos, to specify which counter is being updated - * - initialize counter deltas, as an array of 1-3 s64s - * - call bch2_disk_accounting_mod() - * - * This queues up the accounting update to be done at transaction commit time. - * Underneath, it's a normal btree write buffer update. - * - * The transaction commit path is responsible for propagating updates to the in - * memory counters, with bch2_accounting_mem_mod(). - * - * The commit path also assigns every disk accounting update a unique version - * number, based on the journal sequence number and offset within that journal - * buffer; this is used by journal replay to determine which updates have been - * done.
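Reduced to its essentials, the two-copy scheme described in these notes looks like the following sketch (an illustration under heavy simplification — the real in-memory side is an eytzinger-indexed array of percpu counters, and the persistent side a write-buffered btree). The invariant is only that every delta is applied exactly once to each copy:

	#include <stdint.h>

	struct toy_usage {
		int64_t	btree_sectors;
		int64_t	user_sectors;
	};

	/* One accounting update, expressed as deltas (cf. bch2_disk_accounting_mod()) */
	static void toy_acct_apply(struct toy_usage *mem, struct toy_usage *persistent,
				   int64_t d_btree, int64_t d_user)
	{
		/* in-memory mirror: advanced at transaction commit */
		mem->btree_sectors	+= d_btree;
		mem->user_sectors	+= d_user;

		/* persistent copy: advanced when the write buffer flushes the key */
		persistent->btree_sectors += d_btree;
		persistent->user_sectors  += d_user;
	}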
- * - * The transaction commit path also ensures that replicas entry accounting - * updates are properly marked in the superblock (so that we know whether we can - * mount without data being unavailable); it will update the superblock if - * bch2_accounting_mem_mod() tells it to. - */ - -static const char * const disk_accounting_type_strs[] = { -#define x(t, n, ...) [n] = #t, - BCH_DISK_ACCOUNTING_TYPES() -#undef x - NULL -}; - -static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos, - s64 *d, unsigned nr) -{ - struct bkey_i_accounting *acc = bkey_accounting_init(k); - - acc->k.p = pos; - set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); - - memcpy_u64s_small(acc->v.d, d, nr); -} - -static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, - s64 *d, unsigned nr) -{ - return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr); -} - -static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); - -int bch2_disk_accounting_mod(struct btree_trans *trans, - struct disk_accounting_pos *k, - s64 *d, unsigned nr, bool gc) -{ - BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); - - /* Normalize: */ - switch (k->type) { - case BCH_DISK_ACCOUNTING_replicas: - bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp); - break; - } - - struct bpos pos = disk_accounting_pos_to_bpos(k); - - if (likely(!gc)) { - struct bkey_i_accounting *a; -#if 0 - for (a = btree_trans_subbuf_base(trans, &trans->accounting); - a != btree_trans_subbuf_top(trans, &trans->accounting); - a = (void *) bkey_next(&a->k_i)) - if (bpos_eq(a->k.p, pos)) { - BUG_ON(nr != bch2_accounting_counters(&a->k)); - acc_u64s(a->v.d, d, nr); - - if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) { - unsigned offset = (u64 *) a - - (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); - - trans->accounting.u64s -= a->k.u64s; - memmove_u64s_down(a, - bkey_next(&a->k_i), - trans->accounting.u64s - offset); - } - return 0; - } -#endif - unsigned u64s = sizeof(*a) / sizeof(u64) + nr; - a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s); - int ret = PTR_ERR_OR_ZERO(a); - if (ret) - return ret; - - __accounting_key_init(&a->k_i, pos, d, nr); - return 0; - } else { - struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; - - __accounting_key_init(&k_i.k, pos, d, nr); - - int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); - if (ret == -BCH_ERR_btree_insert_need_mark_replicas) - ret = drop_locks_do(trans, - bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: - bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); - return ret; - } -} - -int bch2_mod_dev_cached_sectors(struct btree_trans *trans, - unsigned dev, s64 sectors, - bool gc) -{ - struct disk_accounting_pos acc; - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_replicas_entry_cached(&acc.replicas, dev); - - return bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); -} - -static inline bool is_zero(char *start, char *end) -{ - BUG_ON(start > end); - - for (; start < end; start++) - if (*start) - return false; - return true; -} - -#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) - -static const unsigned bch2_accounting_type_nr_counters[] = { -#define x(f, id, nr) [BCH_DISK_ACCOUNTING_##f] = nr, - BCH_DISK_ACCOUNTING_TYPES() -#undef x -}; - -int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - 
struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - void *end = &acc_k + 1; - int ret = 0; - - bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && - bversion_zero(k.k->bversion), - c, accounting_key_version_0, - "accounting key with version=0"); - - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_nr_inodes: - end = field_end(acc_k, nr_inodes); - break; - case BCH_DISK_ACCOUNTING_persistent_reserved: - end = field_end(acc_k, persistent_reserved); - break; - case BCH_DISK_ACCOUNTING_replicas: - bkey_fsck_err_on(!acc_k.replicas.nr_devs, - c, accounting_key_replicas_nr_devs_0, - "accounting key replicas entry with nr_devs=0"); - - bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs || - (acc_k.replicas.nr_required > 1 && - acc_k.replicas.nr_required == acc_k.replicas.nr_devs), - c, accounting_key_replicas_nr_required_bad, - "accounting key replicas entry with bad nr_required"); - - for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) - bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1], - c, accounting_key_replicas_devs_unsorted, - "accounting key replicas entry with unsorted devs"); - - end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - end = field_end(acc_k, dev_data_type); - break; - case BCH_DISK_ACCOUNTING_compression: - end = field_end(acc_k, compression); - break; - case BCH_DISK_ACCOUNTING_snapshot: - end = field_end(acc_k, snapshot); - break; - case BCH_DISK_ACCOUNTING_btree: - end = field_end(acc_k, btree); - break; - case BCH_DISK_ACCOUNTING_rebalance_work: - end = field_end(acc_k, rebalance_work); - break; - } - - bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), - c, accounting_key_junk_at_end, - "junk at end of accounting key"); - - bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type], - c, accounting_key_nr_counters_wrong, - "accounting key with %u counters, should be %u", - bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]); -fsck_err: - return ret; -} - -void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) -{ - if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) { - prt_printf(out, "unknown type %u", k->type); - return; - } - - prt_str(out, disk_accounting_type_strs[k->type]); - prt_str(out, " "); - - switch (k->type) { - case BCH_DISK_ACCOUNTING_nr_inodes: - break; - case BCH_DISK_ACCOUNTING_persistent_reserved: - prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas); - break; - case BCH_DISK_ACCOUNTING_replicas: - bch2_replicas_entry_to_text(out, &k->replicas); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev); - bch2_prt_data_type(out, k->dev_data_type.data_type); - break; - case BCH_DISK_ACCOUNTING_compression: - bch2_prt_compression_type(out, k->compression.type); - break; - case BCH_DISK_ACCOUNTING_snapshot: - prt_printf(out, "id=%u", k->snapshot.id); - break; - case BCH_DISK_ACCOUNTING_btree: - prt_str(out, "btree="); - bch2_btree_id_to_text(out, k->btree.id); - break; - } -} - -void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k); - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - - bch2_accounting_key_to_text(out, &acc_k); - - for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++) - prt_printf(out, " %lli", 
acc.v->d[i]); -} - -void bch2_accounting_swab(struct bkey_s k) -{ - for (u64 *p = (u64 *) k.v; - p < (u64 *) bkey_val_end(k); - p++) - *p = swab64(*p); -} - -static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, - struct disk_accounting_pos *acc) -{ - unsafe_memcpy(r, &acc->replicas, - replicas_entry_bytes(&acc->replicas), - "variable length struct"); -} - -static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p) -{ - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, p); - - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_replicas: - __accounting_to_replicas(r, &acc_k); - return true; - default: - return false; - } -} - -static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) -{ - union bch_replicas_padded r; - return accounting_to_replicas(&r.e, p) - ? bch2_mark_replicas(c, &r.e) - : 0; -} - -/* - * Ensure accounting keys being updated are present in the superblock, when - * applicable (i.e. replicas updates) - */ -int bch2_accounting_update_sb(struct btree_trans *trans) -{ - for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); - i != btree_trans_subbuf_top(trans, &trans->accounting); - i = bkey_next(i)) { - int ret = bch2_accounting_update_sb_one(trans->c, i->k.p); - if (ret) - return ret; - } - - return 0; -} - -static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a) -{ - struct bch_accounting_mem *acc = &c->accounting; - - /* raced with another insert, already present: */ - if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &a.k->p) < acc->k.nr) - return 0; - - struct accounting_mem_entry n = { - .pos = a.k->p, - .bversion = a.k->bversion, - .nr_counters = bch2_accounting_counters(a.k), - .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL), - }; - - if (!n.v[0]) - goto err; - - if (acc->gc_running) { - n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!n.v[1]) - goto err; - } - - if (darray_push(&acc->k, n)) - goto err; - - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); - - if (trace_accounting_mem_insert_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_accounting_to_text(&buf, c, a.s_c); - trace_accounting_mem_insert(c, buf.buf); - printbuf_exit(&buf); - } - return 0; -err: - free_percpu(n.v[1]); - free_percpu(n.v[0]); - return bch_err_throw(c, ENOMEM_disk_accounting); -} - -int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, - enum bch_accounting_mode mode) -{ - union bch_replicas_padded r; - - if (mode != BCH_ACCOUNTING_read && - accounting_to_replicas(&r.e, a.k->p) && - !bch2_replicas_marked_locked(c, &r.e)) - return bch_err_throw(c, btree_insert_need_mark_replicas); - - percpu_up_read(&c->mark_lock); - percpu_down_write(&c->mark_lock); - int ret = __bch2_accounting_mem_insert(c, a); - percpu_up_write(&c->mark_lock); - percpu_down_read(&c->mark_lock); - return ret; -} - -int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a, - enum bch_accounting_mode mode) -{ - union bch_replicas_padded r; - - if (mode != BCH_ACCOUNTING_read && - accounting_to_replicas(&r.e, a.k->p) && - !bch2_replicas_marked_locked(c, &r.e)) - return bch_err_throw(c, btree_insert_need_mark_replicas); - - return __bch2_accounting_mem_insert(c, a); -} - -static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e) -{ - for (unsigned i = 
0; i < e->nr_counters; i++) - if (percpu_u64_get(e->v[0] + i) || - (e->v[1] && - percpu_u64_get(e->v[1] + i))) - return false; - return true; -} - -void bch2_accounting_mem_gc(struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - - percpu_down_write(&c->mark_lock); - struct accounting_mem_entry *dst = acc->k.data; - - darray_for_each(acc->k, src) { - if (accounting_mem_entry_is_zero(src)) { - free_percpu(src->v[0]); - free_percpu(src->v[1]); - } else { - *dst++ = *src; - } - } - - acc->k.nr = dst - acc->k.data; - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); - percpu_up_write(&c->mark_lock); -} - -/* - * Read out accounting keys for replicas entries, as an array of - * bch_replicas_usage entries. - * - * Note: this may be deprecated/removed at some point in the future and replaced - * with something more general, it exists to support the ioctl used by the - * 'bcachefs fs usage' command. - */ -int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) -{ - struct bch_accounting_mem *acc = &c->accounting; - int ret = 0; - - darray_init(usage); - - percpu_down_read(&c->mark_lock); - darray_for_each(acc->k, i) { - union { - u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs, - BCH_BKEY_PTRS_MAX)]; - struct bch_replicas_usage r; - } u; - u.r.r.nr_devs = BCH_BKEY_PTRS_MAX; - - if (!accounting_to_replicas(&u.r.r, i->pos)) - continue; - - u64 sectors; - bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false); - u.r.sectors = sectors; - - ret = darray_make_room(usage, replicas_usage_bytes(&u.r)); - if (ret) - break; - - memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r)); - usage->nr += replicas_usage_bytes(&u.r); - } - percpu_up_read(&c->mark_lock); - - if (ret) - darray_exit(usage); - return ret; -} - -int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask) -{ - - struct bch_accounting_mem *acc = &c->accounting; - int ret = 0; - - darray_init(out_buf); - - percpu_down_read(&c->mark_lock); - darray_for_each(acc->k, i) { - struct disk_accounting_pos a_p; - bpos_to_disk_accounting_pos(&a_p, i->pos); - - if (!(accounting_types_mask & BIT(a_p.type))) - continue; - - ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) + - sizeof(u64) * i->nr_counters); - if (ret) - break; - - struct bkey_i_accounting *a_out = - bkey_accounting_init((void *) &darray_top(*out_buf)); - set_bkey_val_u64s(&a_out->k, i->nr_counters); - a_out->k.p = i->pos; - bch2_accounting_mem_read_counters(acc, i - acc->k.data, - a_out->v.d, i->nr_counters, false); - - if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out))) - out_buf->nr += bkey_bytes(&a_out->k); - } - - percpu_up_read(&c->mark_lock); - - if (ret) - darray_exit(out_buf); - return ret; -} - -static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) -{ - darray_for_each(acc->k, e) { - free_percpu(e->v[gc]); - e->v[gc] = NULL; - } -} - -int bch2_gc_accounting_start(struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - int ret = 0; - - percpu_down_write(&c->mark_lock); - darray_for_each(acc->k, e) { - e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!e->v[1]) { - bch2_accounting_free_counters(acc, true); - ret = bch_err_throw(c, ENOMEM_disk_accounting); - break; - } - } - - acc->gc_running = !ret; - percpu_up_write(&c->mark_lock); - - return ret; -} - -int bch2_gc_accounting_done(struct bch_fs *c) -{ - struct
bch_accounting_mem *acc = &c->accounting; - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; - struct bpos pos = POS_MIN; - int ret = 0; - - percpu_down_write(&c->mark_lock); - while (1) { - unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &pos); - - if (idx >= acc->k.nr) - break; - - struct accounting_mem_entry *e = acc->k.data + idx; - pos = bpos_successor(e->pos); - - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, e->pos); - - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - continue; - - u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; - u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; - - unsigned nr = e->nr_counters; - bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false); - bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true); - - if (memcmp(dst_v, src_v, nr * sizeof(u64))) { - printbuf_reset(&buf); - prt_str(&buf, "accounting mismatch for "); - bch2_accounting_key_to_text(&buf, &acc_k); - - prt_str(&buf, ":\n got"); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", dst_v[j]); - - prt_str(&buf, "\nshould be"); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", src_v[j]); - - for (unsigned j = 0; j < nr; j++) - src_v[j] -= dst_v[j]; - - bch2_trans_unlock_long(trans); - - if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) { - percpu_up_write(&c->mark_lock); - ret = commit_do(trans, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); - percpu_down_write(&c->mark_lock); - if (ret) - goto err; - - if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { - memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); - struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; - - accounting_key_init(&k_i.k, &acc_k, src_v, nr); - bch2_accounting_mem_mod_locked(trans, - bkey_i_to_s_c_accounting(&k_i.k), - BCH_ACCOUNTING_normal, true); - - preempt_disable(); - struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); - struct bch_fs_usage_base *src = &trans->fs_usage_delta; - acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); - preempt_enable(); - } - } - } - } -err: -fsck_err: - percpu_up_write(&c->mark_lock); - printbuf_exit(&buf); - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - - if (k.k->type != KEY_TYPE_accounting) - return 0; - - percpu_down_read(&c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), - BCH_ACCOUNTING_read, false); - percpu_up_read(&c->mark_lock); - return ret; -} - -static int bch2_disk_accounting_validate_late(struct btree_trans *trans, - struct disk_accounting_pos *acc, - u64 *v, unsigned nr) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0, invalid_dev = -1; - - switch (acc->type) { - case BCH_DISK_ACCOUNTING_replicas: { - union bch_replicas_padded r; - __accounting_to_replicas(&r.e, acc); - - for (unsigned i = 0; i < r.e.nr_devs; i++) - if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && - !bch2_dev_exists(c, r.e.devs[i])) { - invalid_dev = r.e.devs[i]; - goto invalid_device; - } - - /* - * All replicas entry checks except for invalid device are done - * in bch2_accounting_validate - */ - BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf)); - - if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), - trans, accounting_replicas_not_marked, - "accounting not marked in superblock replicas\n%s", - 
(printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, acc), - buf.buf))) { - /* - * We're not RW yet and still single threaded, dropping - * and retaking lock is ok: - */ - percpu_up_write(&c->mark_lock); - ret = bch2_mark_replicas(c, &r.e); - if (ret) - goto fsck_err; - percpu_down_write(&c->mark_lock); - } - break; - } - - case BCH_DISK_ACCOUNTING_dev_data_type: - if (!bch2_dev_exists(c, acc->dev_data_type.dev)) { - invalid_dev = acc->dev_data_type.dev; - goto invalid_device; - } - break; - } - -fsck_err: - printbuf_exit(&buf); - return ret; -invalid_device: - if (fsck_err(trans, accounting_to_invalid_device, - "accounting entry points to invalid device %i\n%s", - invalid_dev, - (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, acc), - buf.buf))) { - for (unsigned i = 0; i < nr; i++) - v[i] = -v[i]; - - ret = commit_do(trans, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?: - -BCH_ERR_remove_disk_accounting_entry; - } else { - ret = bch_err_throw(c, remove_disk_accounting_entry); - } - goto fsck_err; -} - -/* - * At startup time, initialize the in memory accounting from the btree (and - * journal) - */ -int bch2_accounting_read(struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; - - /* - * We might run more than once if we rewind to start topology repair or - * btree node scan - and those might cause us to get different results, - * so we can't just skip if we've already run. - * - * Instead, zero out any accounting we have: - */ - percpu_down_write(&c->mark_lock); - darray_for_each(acc->k, e) - percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); - for_each_member_device(c, ca) - percpu_memset(ca->usage, 0, sizeof(*ca->usage)); - percpu_memset(c->usage, 0, sizeof(*c->usage)); - percpu_up_write(&c->mark_lock); - - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); - iter.flags &= ~BTREE_ITER_with_journal; - int ret = for_each_btree_key_continue(trans, iter, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); - - if (k.k->type != KEY_TYPE_accounting) - continue; - - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - break; - - if (!bch2_accounting_is_mem(&acc_k)) { - struct disk_accounting_pos next; - memset(&next, 0, sizeof(next)); - next.type = acc_k.type + 1; - bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); - continue; - } - - accounting_read_key(trans, k); - })); - if (ret) - goto err; - - struct journal_keys *keys = &c->journal_keys; - struct journal_key *dst = keys->data; - move_gap(keys, keys->nr); - - darray_for_each(*keys, i) { - if (i->k->k.type == KEY_TYPE_accounting) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); - - if (!bch2_accounting_is_mem(&acc_k)) - continue; - - struct bkey_s_c k = bkey_i_to_s_c(i->k); - unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, - sizeof(acc->k.data[0]), - accounting_pos_cmp, &k.k->p); - - bool applied = idx < acc->k.nr && - bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0; - - if (applied) - continue; - - if (i + 1 < &darray_top(*keys) && - i[1].k->k.type == KEY_TYPE_accounting && - !journal_key_cmp(i, i + 1)) { - 
WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0); - - i[1].journal_seq = i[0].journal_seq; - - bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k), - bkey_s_c_to_accounting(k)); - continue; - } - - ret = accounting_read_key(trans, k); - if (ret) - goto err; - } - - *dst++ = *i; - } - keys->gap = keys->nr = dst - keys->data; - - percpu_down_write(&c->mark_lock); - - darray_for_each_reverse(acc->k, i) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, i->pos); - - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - memset(v, 0, sizeof(v)); - - for (unsigned j = 0; j < i->nr_counters; j++) - v[j] = percpu_u64_get(i->v[0] + j); - - /* - * If the entry counters are zeroed, it should be treated as - * nonexistent - it might point to an invalid device. - * - * Remove it, so that if it's re-added it gets re-marked in the - * superblock: - */ - ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) - ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); - - if (ret == -BCH_ERR_remove_disk_accounting_entry) { - free_percpu(i->v[0]); - free_percpu(i->v[1]); - darray_remove_item(&acc->k, i); - ret = 0; - continue; - } - - if (ret) - goto fsck_err; - } - - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); - - preempt_disable(); - struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); - - for (unsigned i = 0; i < acc->k.nr; i++) { - struct disk_accounting_pos k; - bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); - - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); - - switch (k.type) { - case BCH_DISK_ACCOUNTING_persistent_reserved: - usage->reserved += v[0] * k.persistent_reserved.nr_replicas; - break; - case BCH_DISK_ACCOUNTING_replicas: - fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: { - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); - if (ca) { - struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; - percpu_u64_set(&d->buckets, v[0]); - percpu_u64_set(&d->sectors, v[1]); - percpu_u64_set(&d->fragmented, v[2]); - - if (k.dev_data_type.data_type == BCH_DATA_sb || - k.dev_data_type.data_type == BCH_DATA_journal) - usage->hidden += v[0] * ca->mi.bucket_size; - } - break; - } - } - } - preempt_enable(); -fsck_err: - percpu_up_write(&c->mark_lock); -err: - printbuf_exit(&buf); - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) -{ - return bch2_trans_run(c, - bch2_btree_write_buffer_flush_sync(trans) ?: - for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN, - BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({ - struct disk_accounting_pos acc; - bpos_to_disk_accounting_pos(&acc, k.k->p); - - acc.type == BCH_DISK_ACCOUNTING_dev_data_type && - acc.dev_data_type.dev == dev - ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0) - : 0; - })) ?: - bch2_btree_write_buffer_flush_sync(trans)); -} - -int bch2_dev_usage_init(struct bch_dev *ca, bool gc) -{ - struct bch_fs *c = ca->fs; - u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; - - int ret = bch2_trans_do(c, ({ - bch2_disk_accounting_mod2(trans, gc, - v, dev_data_type, - .dev = ca->dev_idx, - .data_type = BCH_DATA_free) ?: - (!gc ? 
bch2_trans_commit(trans, NULL, NULL, 0) : 0); - })); - bch_err_fn(c, ret); - return ret; -} - -void bch2_verify_accounting_clean(struct bch_fs *c) -{ - bool mismatch = false; - struct bch_fs_usage_base base = {}, base_inmem = {}; - - bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_accounting, POS_MIN, - BTREE_ITER_all_snapshots, k, ({ - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); - unsigned nr = bch2_accounting_counters(k.k); - - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - break; - - if (!bch2_accounting_is_mem(&acc_k)) { - struct disk_accounting_pos next; - memset(&next, 0, sizeof(next)); - next.type = acc_k.type + 1; - bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); - continue; - } - - bch2_accounting_mem_read(c, k.k->p, v, nr); - - if (memcmp(a.v->d, v, nr * sizeof(u64))) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, " !="); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", v[j]); - - pr_err("%s", buf.buf); - printbuf_exit(&buf); - mismatch = true; - } - - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_persistent_reserved: - base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; - break; - case BCH_DISK_ACCOUNTING_replicas: - fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - { - guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */ - struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); - if (!ca) - continue; - - v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); - v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); - v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); - } - - if (memcmp(a.v->d, v, 3 * sizeof(u64))) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, " in mem"); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", v[j]); - - pr_err("dev accounting mismatch: %s", buf.buf); - printbuf_exit(&buf); - mismatch = true; - } - } - - 0; - }))); - - acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64)); - -#define check(x) \ - if (base.x != base_inmem.x) { \ - pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \ - mismatch = true; \ - } - - //check(hidden); - check(btree); - check(data); - check(cached); - check(reserved); - check(nr_inodes); - - WARN_ON(mismatch); -} - -void bch2_accounting_gc_free(struct bch_fs *c) -{ - lockdep_assert_held(&c->mark_lock); - - struct bch_accounting_mem *acc = &c->accounting; - - bch2_accounting_free_counters(acc, true); - acc->gc_running = false; -} - -void bch2_fs_accounting_exit(struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - - bch2_accounting_free_counters(acc, false); - darray_exit(&acc->k); -} diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h deleted file mode 100644 index d61abebf3e0b..000000000000 --- a/fs/bcachefs/disk_accounting.h +++ /dev/null @@ -1,301 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_ACCOUNTING_H -#define _BCACHEFS_DISK_ACCOUNTING_H - -#include "btree_update.h" -#include "eytzinger.h" -#include "sb-members.h" - -static inline void bch2_u64s_neg(u64 *v, unsigned nr) -{ - for (unsigned i = 
0; i < nr; i++) - v[i] = -v[i]; -} - -static inline unsigned bch2_accounting_counters(const struct bkey *k) -{ - return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64); -} - -static inline void bch2_accounting_neg(struct bkey_s_accounting a) -{ - bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k)); -} - -static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a) -{ - for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) - if (a.v->d[i]) - return false; - return true; -} - -static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, - struct bkey_s_c_accounting src) -{ - for (unsigned i = 0; - i < min(bch2_accounting_counters(&dst->k), - bch2_accounting_counters(src.k)); - i++) - dst->v.d[i] += src.v->d[i]; - - if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0) - dst->k.bversion = src.k->bversion; -} - -static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, - enum bch_data_type data_type, - s64 sectors) -{ - switch (data_type) { - case BCH_DATA_btree: - fs_usage->btree += sectors; - break; - case BCH_DATA_user: - case BCH_DATA_parity: - fs_usage->data += sectors; - break; - case BCH_DATA_cached: - fs_usage->cached += sectors; - break; - default: - break; - } -} - -static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) -{ - BUILD_BUG_ON(sizeof(*acc) != sizeof(p)); - -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - acc->_pad = p; -#else - memcpy_swab(acc, &p, sizeof(p)); -#endif -} - -static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc) -{ - struct bpos p; -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - p = acc->_pad; -#else - memcpy_swab(&p, acc, sizeof(p)); -#endif - return p; -} - -int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, - s64 *, unsigned, bool); - -#define disk_accounting_key_init(_k, _type, ...) \ -do { \ - memset(&(_k), 0, sizeof(_k)); \ - (_k).type = BCH_DISK_ACCOUNTING_##_type; \ - (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \ -} while (0) - -#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \ -({ \ - struct disk_accounting_pos pos; \ - disk_accounting_key_init(pos, __VA_ARGS__); \ - bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \ -}) - -#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) 
\ - bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__) - -int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); - -int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); -void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -void bch2_accounting_swab(struct bkey_s); - -#define bch2_bkey_ops_accounting ((struct bkey_ops) { \ - .key_validate = bch2_accounting_validate, \ - .val_to_text = bch2_accounting_to_text, \ - .swab = bch2_accounting_swab, \ - .min_val_size = 8, \ -}) - -int bch2_accounting_update_sb(struct btree_trans *); - -static inline int accounting_pos_cmp(const void *_l, const void *_r) -{ - const struct bpos *l = _l, *r = _r; - - return bpos_cmp(*l, *r); -} - -enum bch_accounting_mode { - BCH_ACCOUNTING_normal, - BCH_ACCOUNTING_gc, - BCH_ACCOUNTING_read, -}; - -int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); -int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); -void bch2_accounting_mem_gc(struct bch_fs *); - -static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc) -{ - return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR && - acc->type != BCH_DISK_ACCOUNTING_inum; -} - -/* - * Update in memory counters so they match the btree update we're doing; called - * from transaction commit path - */ -static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, - struct bkey_s_c_accounting a, - enum bch_accounting_mode mode, - bool write_locked) -{ - struct bch_fs *c = trans->c; - struct bch_accounting_mem *acc = &c->accounting; - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, a.k->p); - bool gc = mode == BCH_ACCOUNTING_gc; - - if (gc && !acc->gc_running) - return 0; - - if (!bch2_accounting_is_mem(&acc_k)) - return 0; - - if (mode == BCH_ACCOUNTING_normal) { - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_persistent_reserved: - trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; - break; - case BCH_DISK_ACCOUNTING_replicas: - fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: { - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); - if (ca) { - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); - } - break; - } - } - } - - unsigned idx; - - while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { - int ret = 0; - if (unlikely(write_locked)) - ret = bch2_accounting_mem_insert_locked(c, a, mode); - else - ret = bch2_accounting_mem_insert(c, a, mode); - if (ret) - return ret; - } - - struct accounting_mem_entry *e = &acc->k.data[idx]; - - EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); - - for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) - this_cpu_add(e->v[gc][i], a.v->d[i]); - return 0; -} - -static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) -{ - percpu_down_read(&trans->c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, a, 
gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false); - percpu_up_read(&trans->c->mark_lock); - return ret; -} - -static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc, - unsigned idx, u64 *v, unsigned nr, bool gc) -{ - memset(v, 0, sizeof(*v) * nr); - - if (unlikely(idx >= acc->k.nr)) - return; - - struct accounting_mem_entry *e = &acc->k.data[idx]; - - nr = min_t(unsigned, nr, e->nr_counters); - - for (unsigned i = 0; i < nr; i++) - v[i] = percpu_u64_get(e->v[gc] + i); -} - -static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, - u64 *v, unsigned nr) -{ - percpu_down_read(&c->mark_lock); - struct bch_accounting_mem *acc = &c->accounting; - unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &p); - - bch2_accounting_mem_read_counters(acc, idx, v, nr, false); - percpu_up_read(&c->mark_lock); -} - -static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) -{ - EBUG_ON(!res->ref); - - return (struct bversion) { - .hi = res->seq >> 32, - .lo = (res->seq << 32) | (res->offset + offset), - }; -} - -static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, - struct bkey_i_accounting *a, - unsigned commit_flags) -{ - u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); - a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base); - - EBUG_ON(bversion_zero(a->k.bversion)); - - return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply)) - ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false) - : 0; -} - -static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans, - struct bkey_i_accounting *a_i, - unsigned commit_flags) -{ - if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { - struct bkey_s_accounting a = accounting_i_to_s(a_i); - - bch2_accounting_neg(a); - bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false); - bch2_accounting_neg(a); - } -} - -int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); -int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); - -int bch2_gc_accounting_start(struct bch_fs *); -int bch2_gc_accounting_done(struct bch_fs *); - -int bch2_accounting_read(struct bch_fs *); - -int bch2_dev_usage_remove(struct bch_fs *, unsigned); -int bch2_dev_usage_init(struct bch_dev *, bool); - -void bch2_verify_accounting_clean(struct bch_fs *c); - -void bch2_accounting_gc_free(struct bch_fs *); -void bch2_fs_accounting_exit(struct bch_fs *); - -#endif /* _BCACHEFS_DISK_ACCOUNTING_H */ diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h deleted file mode 100644 index 8269af1dbe2a..000000000000 --- a/fs/bcachefs/disk_accounting_format.h +++ /dev/null @@ -1,225 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H -#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H - -#include "replicas_format.h" - -/* - * Disk accounting - KEY_TYPE_accounting - on disk format: - * - * Here, the key has considerably more structure than a typical key (bpos); an - * accounting key is 'struct disk_accounting_pos', which is a union of bpos. - * - * More specifically: a key is just a muliword integer (where word endianness - * matches native byte order), so we're treating bpos as an opaque 20 byte - * integer and mapping bch_accounting_key to that. 
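A sketch of what that byte swab accomplishes on little-endian hosts: reversing the bytes moves the type tag at the start of struct disk_accounting_pos into the most-significant bytes of the bpos integer, so accounting keys sort by type first. The helper below is an illustrative stand-in with the same behaviour as the memcpy_swab() utility used above, not the in-tree implementation:

	static void memcpy_swab_sketch(void *_dst, const void *_src, size_t len)
	{
		u8 *dst = _dst;
		const u8 *src = _src;

		/* copy while reversing byte order */
		for (size_t i = 0; i < len; i++)
			dst[i] = src[len - 1 - i];
	}
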
- * - * This is a type-tagged union of all our various subtypes; a disk accounting - * key can be device counters, replicas counters, et cetera - it's extensible. - * - * The value is a list of u64s or s64s; the number of counters is specific to a - * given accounting type. - * - * Unlike with other key types, updates are _deltas_, and the deltas are not - * resolved until the update to the underlying btree, done by btree write buffer - * flush or journal replay. - * - * Journal replay in particular requires special handling. The journal tracks a - * range of entries which may possibly have not yet been applied to the btree - * yet - it does not know definitively whether individual entries are dirty and - * still need to be applied. - * - * To handle this, we use the version field of struct bkey, and give every - * accounting update a unique version number - a total ordering in time; the - * version number is derived from the key's position in the journal. Then - * journal replay can compare the version number of the key from the journal - * with the version number of the key in the btree to determine if a key needs - * to be replayed. - * - * For this to work, we must maintain this strict time ordering of updates as - * they are flushed to the btree, both via write buffer flush and via journal - * replay. This has complications for the write buffer code while journal replay - * is still in progress; the write buffer cannot flush any accounting keys to - * the btree until journal replay has finished replaying its accounting keys, or - * the (newer) version number of the keys from the write buffer will cause - * updates from journal replay to be lost. - */ - -struct bch_accounting { - struct bch_val v; - __u64 d[]; -}; - -#define BCH_ACCOUNTING_MAX_COUNTERS 3 - -#define BCH_DATA_TYPES() \ - x(free, 0) \ - x(sb, 1) \ - x(journal, 2) \ - x(btree, 3) \ - x(user, 4) \ - x(cached, 5) \ - x(parity, 6) \ - x(stripe, 7) \ - x(need_gc_gens, 8) \ - x(need_discard, 9) \ - x(unstriped, 10) - -enum bch_data_type { -#define x(t, n) BCH_DATA_##t, - BCH_DATA_TYPES() -#undef x - BCH_DATA_NR -}; - -static inline bool data_type_is_empty(enum bch_data_type type) -{ - switch (type) { - case BCH_DATA_free: - case BCH_DATA_need_gc_gens: - case BCH_DATA_need_discard: - return true; - default: - return false; - } -} - -static inline bool data_type_is_hidden(enum bch_data_type type) -{ - switch (type) { - case BCH_DATA_sb: - case BCH_DATA_journal: - return true; - default: - return false; - } -} - -/* - * field 1: name - * field 2: id - * field 3: number of counters (max 3) - */ - -#define BCH_DISK_ACCOUNTING_TYPES() \ - x(nr_inodes, 0, 1) \ - x(persistent_reserved, 1, 1) \ - x(replicas, 2, 1) \ - x(dev_data_type, 3, 3) \ - x(compression, 4, 3) \ - x(snapshot, 5, 1) \ - x(btree, 6, 1) \ - x(rebalance_work, 7, 1) \ - x(inum, 8, 3) - -enum disk_accounting_type { -#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr, - BCH_DISK_ACCOUNTING_TYPES() -#undef x - BCH_DISK_ACCOUNTING_TYPE_NR, -}; - -/* - * No subtypes - number of inodes in the entire filesystem - * - * XXX: perhaps we could add a per-subvolume counter? 
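To make the type table above concrete: a counter update under this scheme is issued as a delta through the disk_accounting_key_init()/bch2_disk_accounting_mod2() macros from disk_accounting.h. A minimal sketch with a hypothetical caller (dev_data_type carries the three counters listed above):

	static int account_user_sectors_sketch(struct btree_trans *trans,
					       struct bch_dev *ca, s64 sectors)
	{
		/* deltas for { buckets, sectors, fragmented } */
		s64 v[3] = { 0, sectors, 0 };

		return bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
						 .dev	    = ca->dev_idx,
						 .data_type = BCH_DATA_user);
	}
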
- */ -struct bch_acct_nr_inodes { -}; - -/* - * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the - * reservation: - */ -struct bch_acct_persistent_reserved { - __u8 nr_replicas; -}; - -/* - * device, data type counter fields: - * [ - * nr_buckets - * live sectors (in buckets of that data type) - * sectors of internal fragmentation - * ] - * - * XXX: live sectors should've been done differently, you can have multiple data - * types in the same bucket (user, stripe, cached) and this collapses them to - * the bucket data type, and makes the internal fragmentation counter redundant - */ -struct bch_acct_dev_data_type { - __u8 dev; - __u8 data_type; -}; - -/* - * Compression type fields: - * [ - * number of extents - * uncompressed size - * compressed size - * ] - * - * Compression ratio, average extent size (fragmentation). - */ -struct bch_acct_compression { - __u8 type; -}; - -/* - * On disk usage by snapshot id; counts same values as replicas counter, but - * aggregated differently - */ -struct bch_acct_snapshot { - __u32 id; -} __packed; - -struct bch_acct_btree { - __u32 id; -} __packed; - -/* - * inum counter fields: - * [ - * number of extents - * sum of extent sizes - bkey size - * this field is similar to inode.bi_sectors, except here extents in - * different snapshots but the same inode number are all collapsed to the - * same counter - * sum of on disk size - same values tracked by replicas counters - * ] - * - * This tracks on disk fragmentation. - */ -struct bch_acct_inum { - __u64 inum; -} __packed; - -/* - * Simple counter of the amount of data (on disk sectors) rebalance needs to - * move, extents counted here are also in the rebalance_work btree. - */ -struct bch_acct_rebalance_work { -}; - -struct disk_accounting_pos { - union { - struct { - __u8 type; - union { - struct bch_acct_nr_inodes nr_inodes; - struct bch_acct_persistent_reserved persistent_reserved; - struct bch_replicas_entry_v1 replicas; - struct bch_acct_dev_data_type dev_data_type; - struct bch_acct_compression compression; - struct bch_acct_snapshot snapshot; - struct bch_acct_btree btree; - struct bch_acct_rebalance_work rebalance_work; - struct bch_acct_inum inum; - } __packed; - } __packed; - struct bpos _pad; - }; -}; - -#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */ diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h deleted file mode 100644 index b1982131b206..000000000000 --- a/fs/bcachefs/disk_accounting_types.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H -#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H - -#include "darray.h" - -struct accounting_mem_entry { - struct bpos pos; - struct bversion bversion; - unsigned nr_counters; - u64 __percpu *v[2]; -}; - -struct bch_accounting_mem { - DARRAY(struct accounting_mem_entry) k; - bool gc_running; -}; - -#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c deleted file mode 100644 index cde842ac1886..000000000000 --- a/fs/bcachefs/disk_groups.c +++ /dev/null @@ -1,591 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "disk_groups.h" -#include "sb-members.h" -#include "super-io.h" - -#include <linux/sort.h> - -static int group_cmp(const void *_l, const void *_r) -{ - const struct bch_disk_group *l = _l; - const struct bch_disk_group *r = _r; - - return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - - (BCH_GROUP_DELETED(l) < 
BCH_GROUP_DELETED(r))) ?: - ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - - (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: - strncmp(l->label, r->label, sizeof(l->label)); -} - -static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_disk_groups *groups = - field_to_type(f, disk_groups); - struct bch_disk_group *g, *sorted = NULL; - unsigned nr_groups = disk_groups_nr(groups); - unsigned i, len; - int ret = 0; - - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member m = bch2_sb_member_get(sb, i); - unsigned group_id; - - if (!BCH_MEMBER_GROUP(&m)) - continue; - - group_id = BCH_MEMBER_GROUP(&m) - 1; - - if (group_id >= nr_groups) { - prt_printf(err, "disk %u has invalid label %u (have %u)", - i, group_id, nr_groups); - return -BCH_ERR_invalid_sb_disk_groups; - } - - if (BCH_GROUP_DELETED(&groups->entries[group_id])) { - prt_printf(err, "disk %u has deleted label %u", i, group_id); - return -BCH_ERR_invalid_sb_disk_groups; - } - } - - if (!nr_groups) - return 0; - - for (i = 0; i < nr_groups; i++) { - g = groups->entries + i; - - if (BCH_GROUP_DELETED(g)) - continue; - - len = strnlen(g->label, sizeof(g->label)); - if (!len) { - prt_printf(err, "label %u empty", i); - return -BCH_ERR_invalid_sb_disk_groups; - } - } - - sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); - if (!sorted) - return -BCH_ERR_ENOMEM_disk_groups_validate; - - memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); - sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); - - for (g = sorted; g + 1 < sorted + nr_groups; g++) - if (!BCH_GROUP_DELETED(g) && - !group_cmp(&g[0], &g[1])) { - prt_printf(err, "duplicate label %llu.%.*s", - BCH_GROUP_PARENT(g), - (int) sizeof(g->label), g->label); - ret = -BCH_ERR_invalid_sb_disk_groups; - goto err; - } -err: - kfree(sorted); - return ret; -} - -static void bch2_sb_disk_groups_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_disk_groups *groups = - field_to_type(f, disk_groups); - struct bch_disk_group *g; - unsigned nr_groups = disk_groups_nr(groups); - - for (g = groups->entries; - g < groups->entries + nr_groups; - g++) { - if (g != groups->entries) - prt_printf(out, " "); - - if (BCH_GROUP_DELETED(g)) - prt_printf(out, "[deleted]"); - else - prt_printf(out, "[parent %llu name %s]", - BCH_GROUP_PARENT(g), g->label); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { - .validate = bch2_sb_disk_groups_validate, - .to_text = bch2_sb_disk_groups_to_text -}; - -int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) -{ - struct bch_sb_field_disk_groups *groups; - struct bch_disk_groups_cpu *cpu_g, *old_g; - unsigned i, g, nr_groups; - - lockdep_assert_held(&c->sb_lock); - - groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups); - nr_groups = disk_groups_nr(groups); - - if (!groups) - return 0; - - cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL); - if (!cpu_g) - return bch_err_throw(c, ENOMEM_disk_groups_to_cpu); - - cpu_g->nr = nr_groups; - - for (i = 0; i < nr_groups; i++) { - struct bch_disk_group *src = &groups->entries[i]; - struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; - - dst->deleted = BCH_GROUP_DELETED(src); - dst->parent = BCH_GROUP_PARENT(src); - memcpy(dst->label, src->label, sizeof(dst->label)); - } - - for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { - struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i); - struct 
bch_disk_group_cpu *dst; - - if (!bch2_member_alive(&m)) - continue; - - g = BCH_MEMBER_GROUP(&m); - while (g) { - dst = &cpu_g->entries[g - 1]; - __set_bit(i, dst->devs.d); - g = dst->parent; - } - } - - old_g = rcu_dereference_protected(c->disk_groups, - lockdep_is_held(&c->sb_lock)); - rcu_assign_pointer(c->disk_groups, cpu_g); - if (old_g) - kfree_rcu(old_g, rcu); - - return 0; -} - -const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) -{ - struct target t = target_decode(target); - - guard(rcu)(); - - switch (t.type) { - case TARGET_NULL: - return NULL; - case TARGET_DEV: { - struct bch_dev *ca = t.dev < c->sb.nr_devices - ? rcu_dereference(c->devs[t.dev]) - : NULL; - return ca ? &ca->self : NULL; - } - case TARGET_GROUP: { - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - - return g && t.group < g->nr && !g->entries[t.group].deleted - ? &g->entries[t.group].devs - : NULL; - } - default: - BUG(); - } -} - -bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) -{ - struct target t = target_decode(target); - - switch (t.type) { - case TARGET_NULL: - return false; - case TARGET_DEV: - return dev == t.dev; - case TARGET_GROUP: { - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - const struct bch_devs_mask *m = - g && t.group < g->nr && !g->entries[t.group].deleted - ? &g->entries[t.group].devs - : NULL; - - return m ? test_bit(dev, m->d) : false; - } - default: - BUG(); - } -} - -static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, - unsigned parent, - const char *name, unsigned namelen) -{ - unsigned i, nr_groups = disk_groups_nr(groups); - - if (!namelen || namelen > BCH_SB_LABEL_SIZE) - return -EINVAL; - - for (i = 0; i < nr_groups; i++) { - struct bch_disk_group *g = groups->entries + i; - - if (BCH_GROUP_DELETED(g)) - continue; - - if (!BCH_GROUP_DELETED(g) && - BCH_GROUP_PARENT(g) == parent && - strnlen(g->label, sizeof(g->label)) == namelen && - !memcmp(name, g->label, namelen)) - return i; - } - - return -1; -} - -static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, - const char *name, unsigned namelen) -{ - struct bch_sb_field_disk_groups *groups = - bch2_sb_field_get(sb->sb, disk_groups); - unsigned i, nr_groups = disk_groups_nr(groups); - struct bch_disk_group *g; - - if (!namelen || namelen > BCH_SB_LABEL_SIZE) - return -EINVAL; - - for (i = 0; - i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); - i++) - ; - - if (i == nr_groups) { - unsigned u64s = - (sizeof(struct bch_sb_field_disk_groups) + - sizeof(struct bch_disk_group) * (nr_groups + 1)) / - sizeof(u64); - - groups = bch2_sb_field_resize(sb, disk_groups, u64s); - if (!groups) - return -BCH_ERR_ENOSPC_disk_label_add; - - nr_groups = disk_groups_nr(groups); - } - - BUG_ON(i >= nr_groups); - - g = &groups->entries[i]; - - memcpy(g->label, name, namelen); - if (namelen < sizeof(g->label)) - g->label[namelen] = '\0'; - SET_BCH_GROUP_DELETED(g, 0); - SET_BCH_GROUP_PARENT(g, parent); - SET_BCH_GROUP_DATA_ALLOWED(g, ~0); - - return i; -} - -int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) -{ - struct bch_sb_field_disk_groups *groups = - bch2_sb_field_get(sb->sb, disk_groups); - int v = -1; - - do { - const char *next = strchrnul(name, '.'); - unsigned len = next - name; - - if (*next == '.') - next++; - - v = __bch2_disk_group_find(groups, v + 1, name, len); - name = next; - } while (*name && v >= 0); - - return v; -} - -int bch2_disk_path_find_or_create(struct 
bch_sb_handle *sb, const char *name) -{ - struct bch_sb_field_disk_groups *groups; - unsigned parent = 0; - int v = -1; - - do { - const char *next = strchrnul(name, '.'); - unsigned len = next - name; - - if (*next == '.') - next++; - - groups = bch2_sb_field_get(sb->sb, disk_groups); - - v = __bch2_disk_group_find(groups, parent, name, len); - if (v < 0) - v = __bch2_disk_group_add(sb, parent, name, len); - if (v < 0) - return v; - - parent = v + 1; - name = next; - } while (*name && v >= 0); - - return v; -} - -static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g, - unsigned v) -{ - u16 path[32]; - unsigned nr = 0; - - while (1) { - if (nr == ARRAY_SIZE(path)) - goto invalid; - - if (v >= (g ? g->nr : 0)) - goto invalid; - - struct bch_disk_group_cpu *e = g->entries + v; - - if (e->deleted) - goto invalid; - - path[nr++] = v; - - if (!e->parent) - break; - - v = e->parent - 1; - } - - while (nr) { - struct bch_disk_group_cpu *e = g->entries + path[--nr]; - - prt_printf(out, "%.*s", (int) sizeof(e->label), e->label); - if (nr) - prt_printf(out, "."); - } - return; -invalid: - prt_printf(out, "invalid label %u", v); -} - -void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) -{ - bch2_printbuf_make_room(out, 4096); - - out->atomic++; - guard(rcu)(); - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - - for (unsigned i = 0; i < (g ? g->nr : 0); i++) { - prt_printf(out, "%2u: ", i); - - if (g->entries[i].deleted) { - prt_printf(out, "[deleted]"); - goto next; - } - - __bch2_disk_path_to_text(out, g, i); - - prt_printf(out, " devs"); - - for_each_member_device_rcu(c, ca, &g->entries[i].devs) - prt_printf(out, " %s", ca->name); -next: - prt_newline(out); - } - - out->atomic--; -} - -void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) -{ - out->atomic++; - guard(rcu)(); - __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v), - --out->atomic; -} - -void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) -{ - struct bch_sb_field_disk_groups *groups = - bch2_sb_field_get(sb, disk_groups); - struct bch_disk_group *g; - unsigned nr = 0; - u16 path[32]; - - while (1) { - if (nr == ARRAY_SIZE(path)) - goto inval; - - if (v >= disk_groups_nr(groups)) - goto inval; - - g = groups->entries + v; - - if (BCH_GROUP_DELETED(g)) - goto inval; - - path[nr++] = v; - - if (!BCH_GROUP_PARENT(g)) - break; - - v = BCH_GROUP_PARENT(g) - 1; - } - - while (nr) { - v = path[--nr]; - g = groups->entries + v; - - prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); - if (nr) - prt_printf(out, "."); - } - return; -inval: - prt_printf(out, "invalid label %u", v); -} - -int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) -{ - lockdep_assert_held(&c->sb_lock); - - - if (!strlen(name) || !strcmp(name, "none")) { - struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_GROUP(mi, 0); - } else { - int v = bch2_disk_path_find_or_create(&c->disk_sb, name); - if (v < 0) - return v; - - struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_GROUP(mi, v + 1); - } - - return bch2_sb_disk_groups_to_cpu(c); -} - -int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) -{ - int ret; - - mutex_lock(&c->sb_lock); - ret = __bch2_dev_group_set(c, ca, name) ?: - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; -} - -int bch2_opt_target_parse(struct bch_fs 
*c, const char *val, u64 *res, - struct printbuf *err) -{ - struct bch_dev *ca; - int g; - - if (!val) - return -EINVAL; - - if (!c) - return -BCH_ERR_option_needs_open_fs; - - if (!strlen(val) || !strcmp(val, "none")) { - *res = 0; - return 0; - } - - /* Is it a device? */ - ca = bch2_dev_lookup(c, val); - if (!IS_ERR(ca)) { - *res = dev_to_target(ca->dev_idx); - bch2_dev_put(ca); - return 0; - } - - mutex_lock(&c->sb_lock); - g = bch2_disk_path_find(&c->disk_sb, val); - mutex_unlock(&c->sb_lock); - - if (g >= 0) { - *res = group_to_target(g); - return 0; - } - - return -EINVAL; -} - -void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) -{ - struct target t = target_decode(v); - - switch (t.type) { - case TARGET_NULL: - prt_printf(out, "none"); - return; - case TARGET_DEV: { - out->atomic++; - guard(rcu)(); - struct bch_dev *ca = t.dev < c->sb.nr_devices - ? rcu_dereference(c->devs[t.dev]) - : NULL; - - if (ca && ca->disk_sb.bdev) - prt_printf(out, "/dev/%s", ca->name); - else if (ca) - prt_printf(out, "offline device %u", t.dev); - else - prt_printf(out, "invalid device %u", t.dev); - - out->atomic--; - return; - } - case TARGET_GROUP: - bch2_disk_path_to_text(out, c, t.group); - return; - default: - BUG(); - } -} - -static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) -{ - struct target t = target_decode(v); - - switch (t.type) { - case TARGET_NULL: - prt_printf(out, "none"); - break; - case TARGET_DEV: { - struct bch_member m = bch2_sb_member_get(sb, t.dev); - - if (bch2_member_exists(sb, t.dev)) { - prt_printf(out, "Device "); - pr_uuid(out, m.uuid.b); - prt_printf(out, " (%u)", t.dev); - } else { - prt_printf(out, "Bad device %u", t.dev); - } - break; - } - case TARGET_GROUP: - bch2_disk_path_to_text_sb(out, sb, t.group); - break; - default: - BUG(); - } -} - -void bch2_opt_target_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_sb *sb, - u64 v) -{ - if (c) - bch2_target_to_text(out, c, v); - else - bch2_target_to_text_sb(out, sb, v); -} diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h deleted file mode 100644 index 441826fff224..000000000000 --- a/fs/bcachefs/disk_groups.h +++ /dev/null @@ -1,111 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_GROUPS_H -#define _BCACHEFS_DISK_GROUPS_H - -#include "disk_groups_types.h" - -extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; - -static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) -{ - return groups - ? 
(vstruct_end(&groups->field) - - (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) - : 0; -} - -struct target { - enum { - TARGET_NULL, - TARGET_DEV, - TARGET_GROUP, - } type; - union { - unsigned dev; - unsigned group; - }; -}; - -#define TARGET_DEV_START 1 -#define TARGET_GROUP_START (256 + TARGET_DEV_START) - -static inline u16 dev_to_target(unsigned dev) -{ - return TARGET_DEV_START + dev; -} - -static inline u16 group_to_target(unsigned group) -{ - return TARGET_GROUP_START + group; -} - -static inline struct target target_decode(unsigned target) -{ - if (target >= TARGET_GROUP_START) - return (struct target) { - .type = TARGET_GROUP, - .group = target - TARGET_GROUP_START - }; - - if (target >= TARGET_DEV_START) - return (struct target) { - .type = TARGET_DEV, - .group = target - TARGET_DEV_START - }; - - return (struct target) { .type = TARGET_NULL }; -} - -const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); - -static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, - enum bch_data_type data_type, - u16 target) -{ - struct bch_devs_mask devs = c->rw_devs[data_type]; - const struct bch_devs_mask *t = bch2_target_to_mask(c, target); - - if (t) - bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); - return devs; -} - -static inline bool bch2_target_accepts_data(struct bch_fs *c, - enum bch_data_type data_type, - u16 target) -{ - struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target); - return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX); -} - -bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); - -int bch2_disk_path_find(struct bch_sb_handle *, const char *); - -/* Exported for userspace bcachefs-tools: */ -int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); - -void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned); -void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned); - -void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned); - -int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); -void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); - -#define bch2_opt_target (struct bch_opt_fn) { \ - .parse = bch2_opt_target_parse, \ - .to_text = bch2_opt_target_to_text, \ -} - -int bch2_sb_disk_groups_to_cpu(struct bch_fs *); - -int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); -int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); - -const char *bch2_sb_validate_disk_groups(struct bch_sb *, - struct bch_sb_field *); - -void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *); - -#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h deleted file mode 100644 index 698990bbf1d2..000000000000 --- a/fs/bcachefs/disk_groups_format.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H -#define _BCACHEFS_DISK_GROUPS_FORMAT_H - -#define BCH_SB_LABEL_SIZE 32 - -struct bch_disk_group { - __u8 label[BCH_SB_LABEL_SIZE]; - __le64 flags[2]; -} __packed __aligned(8); - -LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) -LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) -LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) - -struct bch_sb_field_disk_groups { - struct bch_sb_field field; - struct bch_disk_group entries[]; -} __packed __aligned(8); - -#endif 
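The target encoding declared in disk_groups.h above packs "none", a single device, or a label group into one small integer namespace, with devices starting at TARGET_DEV_START (1) and groups at TARGET_GROUP_START (257). A sketch of the round trip (note that target_decode() assigns through the .group union member in both arms, which aliases .dev):

	static bool target_roundtrip_sketch(unsigned dev)
	{
		u16 target = dev_to_target(dev);	/* TARGET_DEV_START + dev */
		struct target t = target_decode(target);

		return t.type == TARGET_DEV && t.dev == dev;
	}
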
/* _BCACHEFS_DISK_GROUPS_FORMAT_H */ diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h deleted file mode 100644 index a54ef085b13d..000000000000 --- a/fs/bcachefs/disk_groups_types.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H -#define _BCACHEFS_DISK_GROUPS_TYPES_H - -struct bch_disk_group_cpu { - bool deleted; - u16 parent; - u8 label[BCH_SB_LABEL_SIZE]; - struct bch_devs_mask devs; -}; - -struct bch_disk_groups_cpu { - struct rcu_head rcu; - unsigned nr; - struct bch_disk_group_cpu entries[] __counted_by(nr); -}; - -#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c deleted file mode 100644 index 543dbba9b14f..000000000000 --- a/fs/bcachefs/ec.c +++ /dev/null @@ -1,2405 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* erasure coding */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "bset.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "checksum.h" -#include "disk_accounting.h" -#include "disk_groups.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "io_read.h" -#include "io_write.h" -#include "keylist.h" -#include "lru.h" -#include "recovery.h" -#include "replicas.h" -#include "super-io.h" -#include "util.h" - -#include <linux/sort.h> -#include <linux/string_choices.h> - -#ifdef __KERNEL__ - -#include <linux/raid/pq.h> -#include <linux/raid/xor.h> - -static void raid5_recov(unsigned disks, unsigned failed_idx, - size_t size, void **data) -{ - unsigned i = 2, nr; - - BUG_ON(failed_idx >= disks); - - swap(data[0], data[failed_idx]); - memcpy(data[0], data[1], size); - - while (i < disks) { - nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); - xor_blocks(nr, size, data[0], data + i); - i += nr; - } - - swap(data[0], data[failed_idx]); -} - -static void raid_gen(int nd, int np, size_t size, void **v) -{ - if (np >= 1) - raid5_recov(nd + np, nd, size, v); - if (np >= 2) - raid6_call.gen_syndrome(nd + np, size, v); - BUG_ON(np > 2); -} - -static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) -{ - switch (nr) { - case 0: - break; - case 1: - if (ir[0] < nd + 1) - raid5_recov(nd + 1, ir[0], size, v); - else - raid6_call.gen_syndrome(nd + np, size, v); - break; - case 2: - if (ir[1] < nd) { - /* data+data failure. 
*/ - raid6_2data_recov(nd + np, size, ir[0], ir[1], v); - } else if (ir[0] < nd) { - /* data + p/q failure */ - - if (ir[1] == nd) /* data + p failure */ - raid6_datap_recov(nd + np, size, ir[0], v); - else { /* data + q failure */ - raid5_recov(nd + 1, ir[0], size, v); - raid6_call.gen_syndrome(nd + np, size, v); - } - } else { - raid_gen(nd, np, size, v); - } - break; - default: - BUG(); - } -} - -#else - -#include <raid/raid.h> - -#endif - -struct ec_bio { - struct bch_dev *ca; - struct ec_stripe_buf *buf; - size_t idx; - int rw; - u64 submit_time; - struct bio bio; -}; - -/* Stripes btree keys: */ - -int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - int ret = 0; - - bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || - bpos_gt(k.k->p, POS(0, U32_MAX)), - c, stripe_pos_bad, - "stripe at bad pos"); - - bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), - c, stripe_val_size_bad, - "incorrect value size (%zu < %u)", - bkey_val_u64s(k.k), stripe_val_u64s(s)); - - bkey_fsck_err_on(s->csum_granularity_bits >= 64, - c, stripe_csum_granularity_bad, - "invalid csum granularity (%u >= 64)", - s->csum_granularity_bits); - - ret = bch2_bkey_ptrs_validate(c, k, from); -fsck_err: - return ret; -} - -void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v; - struct bch_stripe s = {}; - - memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k))); - - unsigned nr_data = s.nr_blocks - s.nr_redundant; - - prt_printf(out, "algo %u sectors %u blocks %u:%u csum ", - s.algorithm, - le16_to_cpu(s.sectors), - nr_data, - s.nr_redundant); - bch2_prt_csum_type(out, s.csum_type); - prt_str(out, " gran "); - if (s.csum_granularity_bits < 64) - prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits); - else - prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits); - - if (s.disk_label) { - prt_str(out, " label"); - bch2_disk_path_to_text(out, c, s.disk_label - 1); - } - - for (unsigned i = 0; i < s.nr_blocks; i++) { - const struct bch_extent_ptr *ptr = sp->ptrs + i; - - if ((void *) ptr >= bkey_val_end(k)) - break; - - prt_char(out, ' '); - bch2_extent_ptr_to_text(out, c, ptr); - - if (s.csum_type < BCH_CSUM_NR && - i < nr_data && - stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k)) - prt_printf(out, "#%u", stripe_blockcount_get(sp, i)); - } -} - -/* Triggers: */ - -static int __mark_stripe_bucket(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c_stripe s, - unsigned ptr_idx, bool deleting, - struct bpos bucket, - struct bch_alloc_v4 *a, - enum btree_iter_update_trigger_flags flags) -{ - const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; - unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant; - bool parity = ptr_idx >= nr_data; - enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; - s64 sectors = parity ? 
le16_to_cpu(s.v->sectors) : 0; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bch_fs *c = trans->c; - if (deleting) - sectors = -sectors; - - if (!deleting) { - if (bch2_trans_inconsistent_on(a->stripe || - a->stripe_redundancy, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s", - bucket.inode, bucket.offset, a->gen, - bch2_data_type_str(a->data_type), - a->dirty_sectors, - a->stripe, s.k->p.offset, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s", - bucket.inode, bucket.offset, a->gen, - bch2_data_type_str(a->data_type), - a->dirty_sectors, - a->cached_sectors, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - } else { - if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset || - a->stripe_redundancy != s.v->nr_redundant, trans, - "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s", - bucket.inode, bucket.offset, a->gen, - a->stripe, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - if (bch2_trans_inconsistent_on(a->data_type != data_type, trans, - "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s", - bucket.inode, bucket.offset, a->gen, - bch2_data_type_str(a->data_type), - bch2_data_type_str(data_type), - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - if (bch2_trans_inconsistent_on(parity && - (a->dirty_sectors != -sectors || - a->cached_sectors), trans, - "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s", - bucket.inode, bucket.offset, a->gen, - a->dirty_sectors, - a->cached_sectors, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - } - - if (sectors) { - ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type, - a->gen, a->data_type, &a->dirty_sectors); - if (ret) - goto err; - } - - if (!deleting) { - a->stripe = s.k->p.offset; - a->stripe_redundancy = s.v->nr_redundant; - alloc_data_type_set(a, data_type); - } else { - a->stripe = 0; - a->stripe_redundancy = 0; - alloc_data_type_set(a, BCH_DATA_user); - } -err: - printbuf_exit(&buf); - return ret; -} - -static int mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c_stripe s, - unsigned ptr_idx, bool deleting, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); - if (unlikely(!ca)) { - if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite)) - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - if (flags & BTREE_TRIGGER_transactional) { - struct extent_ptr_decoded p = { - .ptr = *ptr, - .crc = bch2_extent_crc_unpack(s.k, NULL), - }; - struct bkey_i_backpointer bp; - bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p, - (const union bch_extent_entry *) ptr, &bp); - - struct bkey_i_alloc_v4 *a = - bch2_trans_start_alloc_update(trans, 
bucket, 0); - ret = PTR_ERR_OR_ZERO(a) ?: - __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?: - bch2_bucket_backpointer_mod(trans, s.s_c, &bp, - !(flags & BTREE_TRIGGER_overwrite)); - if (ret) - goto err; - } - - if (flags & BTREE_TRIGGER_gc) { - struct bucket *g = gc_bucket(ca, bucket.offset); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s", - ptr->dev, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - bucket_lock(g); - struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); - alloc_to_bucket(g, new); - bucket_unlock(g); - - if (!ret) - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); - } -err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -static int mark_stripe_buckets(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - enum btree_iter_update_trigger_flags flags) -{ - const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(old).v : NULL; - const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(new).v : NULL; - - BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks); - - unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; - - for (unsigned i = 0; i < nr_blocks; i++) { - if (new_s && old_s && - !memcmp(&new_s->ptrs[i], - &old_s->ptrs[i], - sizeof(new_s->ptrs[i]))) - continue; - - if (new_s) { - int ret = mark_stripe_bucket(trans, - bkey_s_c_to_stripe(new), i, false, flags); - if (ret) - return ret; - } - - if (old_s) { - int ret = mark_stripe_bucket(trans, - bkey_s_c_to_stripe(old), i, true, flags); - if (ret) - return ret; - } - } - - return 0; -} - -int bch2_trigger_stripe(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s _new, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_s_c new = _new.s_c; - struct bch_fs *c = trans->c; - u64 idx = new.k->p.offset; - const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(old).v : NULL; - const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe - ? 
bkey_s_c_to_stripe(new).v : NULL; - - if (unlikely(flags & BTREE_TRIGGER_check_repair)) - return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags); - - BUG_ON(new_s && old_s && - (new_s->nr_blocks != old_s->nr_blocks || - new_s->nr_redundant != old_s->nr_redundant)); - - if (flags & BTREE_TRIGGER_transactional) { - int ret = bch2_lru_change(trans, - BCH_LRU_STRIPE_FRAGMENTATION, - idx, - stripe_lru_pos(old_s), - stripe_lru_pos(new_s)); - if (ret) - return ret; - } - - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - /* - * If the pointers aren't changing, we don't need to do anything: - */ - if (new_s && old_s && - new_s->nr_blocks == old_s->nr_blocks && - new_s->nr_redundant == old_s->nr_redundant && - !memcmp(old_s->ptrs, new_s->ptrs, - new_s->nr_blocks * sizeof(struct bch_extent_ptr))) - return 0; - - struct gc_stripe *gc = NULL; - if (flags & BTREE_TRIGGER_gc) { - gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); - if (!gc) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); - return bch_err_throw(c, ENOMEM_mark_stripe); - } - - /* - * This will be wrong when we bring back runtime gc: we should - * be unmarking the old key and then marking the new key - * - * Also: when we bring back runtime gc, locking - */ - gc->alive = true; - gc->sectors = le16_to_cpu(new_s->sectors); - gc->nr_blocks = new_s->nr_blocks; - gc->nr_redundant = new_s->nr_redundant; - - for (unsigned i = 0; i < new_s->nr_blocks; i++) - gc->ptrs[i] = new_s->ptrs[i]; - - /* - * gc recalculates this field from stripe ptr - * references: - */ - memset(gc->block_sectors, 0, sizeof(gc->block_sectors)); - } - - if (new_s) { - s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant; - - struct disk_accounting_pos acc; - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, new); - int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc); - if (ret) - return ret; - - if (gc) - unsafe_memcpy(&gc->r.e, &acc.replicas, - replicas_entry_bytes(&acc.replicas), "VLA"); - } - - if (old_s) { - s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant; - - struct disk_accounting_pos acc; - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, old); - int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc); - if (ret) - return ret; - } - - int ret = mark_stripe_buckets(trans, old, new, flags); - if (ret) - return ret; - } - - return 0; -} - -/* returns blocknr in stripe that we matched: */ -static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, - struct bkey_s_c k, unsigned *block) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i, nr_data = s->nr_blocks - s->nr_redundant; - - bkey_for_each_ptr(ptrs, ptr) - for (i = 0; i < nr_data; i++) - if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, - le16_to_cpu(s->sectors))) { - *block = i; - return ptr; - } - - return NULL; -} - -static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - - bkey_extent_entry_for_each(ptrs, entry) - if (extent_entry_type(entry) == - BCH_EXTENT_ENTRY_stripe_ptr && - entry->stripe_ptr.idx == idx) - return true; - - return false; -} - -/* Stripe bufs: */ - -static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) -{ - if (buf->key.k.type == KEY_TYPE_stripe) { - struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key); - unsigned i; - - for
(i = 0; i < s->v.nr_blocks; i++) { - kvfree(buf->data[i]); - buf->data[i] = NULL; - } - } -} - -/* XXX: this is a non-mempoolified memory allocation: */ -static int ec_stripe_buf_init(struct bch_fs *c, - struct ec_stripe_buf *buf, - unsigned offset, unsigned size) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned csum_granularity = 1U << v->csum_granularity_bits; - unsigned end = offset + size; - unsigned i; - - BUG_ON(end > le16_to_cpu(v->sectors)); - - offset = round_down(offset, csum_granularity); - end = min_t(unsigned, le16_to_cpu(v->sectors), - round_up(end, csum_granularity)); - - buf->offset = offset; - buf->size = end - offset; - - memset(buf->valid, 0xFF, sizeof(buf->valid)); - - for (i = 0; i < v->nr_blocks; i++) { - buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL); - if (!buf->data[i]) - goto err; - } - - return 0; -err: - ec_stripe_buf_exit(buf); - return bch_err_throw(c, ENOMEM_stripe_buf); -} - -/* Checksumming: */ - -static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, - unsigned block, unsigned offset) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned csum_granularity = 1 << v->csum_granularity_bits; - unsigned end = buf->offset + buf->size; - unsigned len = min(csum_granularity, end - offset); - - BUG_ON(offset >= end); - BUG_ON(offset < buf->offset); - BUG_ON(offset & (csum_granularity - 1)); - BUG_ON(offset + len != le16_to_cpu(v->sectors) && - (len & (csum_granularity - 1))); - - return bch2_checksum(NULL, v->csum_type, - null_nonce(), - buf->data[block] + ((offset - buf->offset) << 9), - len << 9); -} - -static void ec_generate_checksums(struct ec_stripe_buf *buf) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned i, j, csums_per_device = stripe_csums_per_device(v); - - if (!v->csum_type) - return; - - BUG_ON(buf->offset); - BUG_ON(buf->size != le16_to_cpu(v->sectors)); - - for (i = 0; i < v->nr_blocks; i++) - for (j = 0; j < csums_per_device; j++) - stripe_csum_set(v, i, j, - ec_block_checksum(buf, i, j << v->csum_granularity_bits)); -} - -static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned csum_granularity = 1 << v->csum_granularity_bits; - unsigned i; - - if (!v->csum_type) - return; - - for (i = 0; i < v->nr_blocks; i++) { - unsigned offset = buf->offset; - unsigned end = buf->offset + buf->size; - - if (!test_bit(i, buf->valid)) - continue; - - while (offset < end) { - unsigned j = offset >> v->csum_granularity_bits; - unsigned len = min(csum_granularity, end - offset); - struct bch_csum want = stripe_csum_get(v, i, j); - struct bch_csum got = ec_block_checksum(buf, i, offset); - - if (bch2_crc_cmp(want, got)) { - struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev); - if (ca) { - struct printbuf err = PRINTBUF; - - prt_str(&err, "stripe "); - bch2_csum_err_msg(&err, v->csum_type, want, got); - prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); - bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); - bch_err_ratelimited(ca, "%s", err.buf); - printbuf_exit(&err); - - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - } - - clear_bit(i, buf->valid); - break; - } - - offset += len; - } - } -} - -/* Erasure coding: */ - -static void ec_generate_ec(struct ec_stripe_buf *buf) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned nr_data = v->nr_blocks - v->nr_redundant; - unsigned bytes = le16_to_cpu(v->sectors) << 9; - - raid_gen(nr_data, 
v->nr_redundant, bytes, buf->data); -} - -static unsigned ec_nr_failed(struct ec_stripe_buf *buf) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - - return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks); -} - -static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; - unsigned nr_data = v->nr_blocks - v->nr_redundant; - unsigned bytes = buf->size << 9; - - if (ec_nr_failed(buf) > v->nr_redundant) { - bch_err_ratelimited(c, - "error doing reconstruct read: unable to read enough blocks"); - return -1; - } - - for (i = 0; i < nr_data; i++) - if (!test_bit(i, buf->valid)) - failed[nr_failed++] = i; - - raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); - return 0; -} - -/* IO: */ - -static void ec_block_endio(struct bio *bio) -{ - struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); - struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v; - struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; - struct bch_dev *ca = ec_bio->ca; - struct closure *cl = bio->bi_private; - int rw = ec_bio->rw; - unsigned ref = rw == READ - ? BCH_DEV_READ_REF_ec_block - : BCH_DEV_WRITE_REF_ec_block; - - bch2_account_io_completion(ca, bio_data_dir(bio), - ec_bio->submit_time, !bio->bi_status); - - if (bio->bi_status) { - bch_err_dev_ratelimited(ca, "erasure coding %s error: %s", - str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status)); - clear_bit(ec_bio->idx, ec_bio->buf->valid); - } - - int stale = dev_ptr_stale(ca, ptr); - if (stale) { - bch_err_ratelimited(ca->fs, - "error %s stripe: stale/invalid pointer (%i) after io", - bio_data_dir(bio) == READ ? "reading from" : "writing to", - stale); - clear_bit(ec_bio->idx, ec_bio->buf->valid); - } - - bio_put(&ec_bio->bio); - enumerated_ref_put(&ca->io_ref[rw], ref); - closure_put(cl); -} - -static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, - blk_opf_t opf, unsigned idx, struct closure *cl) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned offset = 0, bytes = buf->size << 9; - struct bch_extent_ptr *ptr = &v->ptrs[idx]; - enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant - ? BCH_DATA_user - : BCH_DATA_parity; - int rw = op_is_write(opf); - unsigned ref = rw == READ - ? BCH_DEV_READ_REF_ec_block - : BCH_DEV_WRITE_REF_ec_block; - - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref); - if (!ca) { - clear_bit(idx, buf->valid); - return; - } - - int stale = dev_ptr_stale(ca, ptr); - if (stale) { - bch_err_ratelimited(c, - "error %s stripe: stale pointer (%i)", - rw == READ ? 
"reading from" : "writing to", - stale); - clear_bit(idx, buf->valid); - return; - } - - - this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); - - while (offset < bytes) { - unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, - DIV_ROUND_UP(bytes, PAGE_SIZE)); - unsigned b = min_t(size_t, bytes - offset, - nr_iovecs << PAGE_SHIFT); - struct ec_bio *ec_bio; - - ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, - nr_iovecs, - opf, - GFP_KERNEL, - &c->ec_bioset), - struct ec_bio, bio); - - ec_bio->ca = ca; - ec_bio->buf = buf; - ec_bio->idx = idx; - ec_bio->rw = rw; - ec_bio->submit_time = local_clock(); - - ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); - ec_bio->bio.bi_end_io = ec_block_endio; - ec_bio->bio.bi_private = cl; - - bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); - - closure_get(cl); - enumerated_ref_get(&ca->io_ref[rw], ref); - - submit_bio(&ec_bio->bio); - - offset += b; - } - - enumerated_ref_put(&ca->io_ref[rw], ref); -} - -static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, - struct ec_stripe_buf *stripe) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - POS(0, idx), BTREE_ITER_slots); - ret = bkey_err(k); - if (ret) - goto err; - if (k.k->type != KEY_TYPE_stripe) { - ret = -ENOENT; - goto err; - } - bkey_reassemble(&stripe->key, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* recovery read path: */ -int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, - struct bkey_s_c orig_k) -{ - struct bch_fs *c = trans->c; - struct ec_stripe_buf *buf = NULL; - struct closure cl; - struct bch_stripe *v; - unsigned i, offset; - const char *msg = NULL; - struct printbuf msgbuf = PRINTBUF; - int ret = 0; - - closure_init_stack(&cl); - - BUG_ON(!rbio->pick.has_ec); - - buf = kzalloc(sizeof(*buf), GFP_NOFS); - if (!buf) - return bch_err_throw(c, ENOMEM_ec_read_extent); - - ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); - if (ret) { - msg = "stripe not found"; - goto err; - } - - v = &bkey_i_to_stripe(&buf->key)->v; - - if (!bch2_ptr_matches_stripe(v, rbio->pick)) { - msg = "pointer doesn't match stripe"; - goto err; - } - - offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; - if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { - msg = "read is bigger than stripe"; - goto err; - } - - ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio)); - if (ret) { - msg = "-ENOMEM"; - goto err; - } - - for (i = 0; i < v->nr_blocks; i++) - ec_block_io(c, buf, REQ_OP_READ, i, &cl); - - closure_sync(&cl); - - if (ec_nr_failed(buf) > v->nr_redundant) { - msg = "unable to read enough blocks"; - goto err; - } - - ec_validate_checksums(c, buf); - - ret = ec_do_recov(c, buf); - if (ret) - goto err; - - memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, - buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); -out: - ec_stripe_buf_exit(buf); - kfree(buf); - return ret; -err: - bch2_bkey_val_to_text(&msgbuf, c, orig_k); - bch_err_ratelimited(c, - "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); - printbuf_exit(&msgbuf); - ret = bch_err_throw(c, stripe_reconstruct); - goto out; -} - -/* stripe bucket accounting: */ - -static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) -{ - if (c->gc_pos.phase != GC_PHASE_not_running && - !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) - return bch_err_throw(c, 
ENOMEM_ec_stripe_mem_alloc); - - return 0; -} - -static int ec_stripe_mem_alloc(struct btree_trans *trans, - struct btree_iter *iter) -{ - return allocate_dropping_locks_errcode(trans, - __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp)); -} - -/* - * Hash table of open stripes: - * Stripes that are being created or modified are kept in a hash table, so that - * stripe deletion can skip them. - */ - -static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) -{ - unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); - struct ec_stripe_new *s; - - hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash) - if (s->idx == idx) - return true; - return false; -} - -static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx) -{ - bool ret = false; - - spin_lock(&c->ec_stripes_new_lock); - ret = __bch2_stripe_is_open(c, idx); - spin_unlock(&c->ec_stripes_new_lock); - - return ret; -} - -static bool bch2_try_open_stripe(struct bch_fs *c, - struct ec_stripe_new *s, - u64 idx) -{ - bool ret; - - spin_lock(&c->ec_stripes_new_lock); - ret = !__bch2_stripe_is_open(c, idx); - if (ret) { - unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); - - s->idx = idx; - hlist_add_head(&s->hash, &c->ec_stripes_new[hash]); - } - spin_unlock(&c->ec_stripes_new_lock); - - return ret; -} - -static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) -{ - BUG_ON(!s->idx); - - spin_lock(&c->ec_stripes_new_lock); - hlist_del_init(&s->hash); - spin_unlock(&c->ec_stripes_new_lock); - - s->idx = 0; -} - -/* stripe deletion */ - -static int ec_stripe_delete(struct btree_trans *trans, u64 idx) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_intent); - int ret = bkey_err(k); - if (ret) - goto err; - - /* - * We expect write buffer races here - * Important: check stripe_is_open with stripe key locked: - */ - if (k.k->type == KEY_TYPE_stripe && - !bch2_stripe_is_open(trans->c, idx) && - stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1) - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * XXX - * can we kill this and delete stripes from the trigger? - */ -static void ec_stripe_delete_work(struct work_struct *work) -{ - struct bch_fs *c = - container_of(work, struct bch_fs, ec_stripe_delete_work); - - bch2_trans_run(c, - bch2_btree_write_buffer_tryflush(trans) ?: - for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru, - lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0), - lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX), - 0, lru_k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, ({ - ec_stripe_delete(trans, lru_k.k->p.offset); - }))); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); -} - -void bch2_do_stripe_deletes(struct bch_fs *c) -{ - if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) && - !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); -} - -/* stripe creation: */ - -static int ec_stripe_key_update(struct btree_trans *trans, - struct bkey_i_stripe *old, - struct bkey_i_stripe *new) -{ - struct bch_fs *c = trans->c; - bool create = !old; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - new->k.p, BTREE_ITER_intent); - int ret = bkey_err(k); - if (ret) - goto err; - - if (bch2_fs_inconsistent_on(k.k->type != (create ? 
KEY_TYPE_deleted : KEY_TYPE_stripe), - c, "error %s stripe: got existing key type %s", - create ? "creating" : "updating", - bch2_bkey_types[k.k->type])) { - ret = -EINVAL; - goto err; - } - - if (k.k->type == KEY_TYPE_stripe) { - const struct bch_stripe *v = bkey_s_c_to_stripe(k).v; - - BUG_ON(old->v.nr_blocks != new->v.nr_blocks); - BUG_ON(old->v.nr_blocks != v->nr_blocks); - - for (unsigned i = 0; i < new->v.nr_blocks; i++) { - unsigned sectors = stripe_blockcount_get(v, i); - - if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) { - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "stripe changed nonempty block %u", i); - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i)); - bch2_fs_inconsistent(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = -EINVAL; - goto err; - } - - /* - * If the stripe ptr changed underneath us, it must have - * been dev_remove_stripes() -> * invalidate_stripe_to_dev() - */ - if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) { - BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID); - - if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i])) - new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID; - } - - stripe_blockcount_set(&new->v, i, sectors); - } - } - - ret = bch2_trans_update(trans, &iter, &new->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int ec_stripe_update_extent(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, u8 gen, - struct ec_stripe_buf *s, - struct bkey_s_c_backpointer bp, - struct bkey_buf *last_flushed) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - const struct bch_extent_ptr *ptr_c; - struct bch_extent_ptr *ec_ptr = NULL; - struct bch_extent_stripe_ptr stripe_ptr; - struct bkey_i *n; - int ret, dev, block; - - if (bp.v->level) { - struct printbuf buf = PRINTBUF; - struct btree_iter node_iter; - struct btree *b; - - b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed); - bch2_trans_iter_exit(trans, &node_iter); - - if (!b) - return 0; - - prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); - bch2_bkey_val_to_text(&buf, c, bp.s_c); - - bch2_fs_inconsistent(c, "%s", buf.buf); - printbuf_exit(&buf); - return bch_err_throw(c, erasure_coding_found_btree_node); - } - - k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); - ret = bkey_err(k); - if (ret) - return ret; - if (!k.k) { - /* - * extent no longer exists - we could flush the btree - * write buffer and retry to verify, but no need: - */ - return 0; - } - - if (extent_has_stripe_ptr(k, s->key.k.p.offset)) - goto out; - - ptr_c = bkey_matches_stripe(v, k, &block); - /* - * It doesn't generally make sense to erasure code cached ptrs: - * XXX: should we be incrementing a counter? 
- */ - if (!ptr_c || ptr_c->cached) - goto out; - - dev = v->ptrs[block].dev; - - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto out; - - bkey_reassemble(n, k); - - bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev); - ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); - BUG_ON(!ec_ptr); - - stripe_ptr = (struct bch_extent_stripe_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, - .block = block, - .redundancy = v->nr_redundant, - .idx = s->key.k.p.offset, - }; - - __extent_entry_insert(n, - (union bch_extent_entry *) ec_ptr, - (union bch_extent_entry *) &stripe_ptr); - - ret = bch2_trans_update(trans, &iter, n, 0); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s, - unsigned block) -{ - struct bch_fs *c = trans->c; - struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - struct bch_extent_ptr ptr = v->ptrs[block]; - int ret = 0; - - struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); - if (!ca) - return bch_err_throw(c, ENOENT_dev_not_found); - - struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); - - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp_start(ca, bucket_pos), - bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, - NULL, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc, ({ - if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0))) - break; - - if (bp_k.k->type != KEY_TYPE_backpointer) - continue; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); - if (bp.v->btree_id == BTREE_ID_stripes) - continue; - - ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, - bp, &last_flushed); - })); - - bch2_bkey_buf_exit(&last_flushed, c); - bch2_dev_put(ca); - return ret; -} - -static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - unsigned nr_data = v->nr_blocks - v->nr_redundant; - - int ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - - for (unsigned i = 0; i < nr_data; i++) { - ret = ec_stripe_update_bucket(trans, s, i); - if (ret) - break; - } -err: - bch2_trans_put(trans); - return ret; -} - -static void zero_out_rest_of_ec_bucket(struct bch_fs *c, - struct ec_stripe_new *s, - unsigned block, - struct open_bucket *ob) -{ - struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE, - BCH_DEV_WRITE_REF_ec_bucket_zero); - if (!ca) { - s->err = bch_err_throw(c, erofs_no_writes); - return; - } - - unsigned offset = ca->mi.bucket_size - ob->sectors_free; - memset(s->new_stripe.data[block] + (offset << 9), - 0, - ob->sectors_free << 9); - - int ret = blkdev_issue_zeroout(ca->disk_sb.bdev, - ob->bucket * ca->mi.bucket_size + offset, - ob->sectors_free, - GFP_KERNEL, 0); - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero); - - if (ret) - s->err = ret; -} - -void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s) -{ - if (s->idx) - bch2_stripe_close(c, s); - kfree(s); -} - -/* - * data buckets of new stripe all written: create the stripe - */ -static void ec_stripe_create(struct ec_stripe_new *s) -{ - struct bch_fs *c = s->c; - struct open_bucket *ob; - struct bch_stripe *v = 
&bkey_i_to_stripe(&s->new_stripe.key)->v; - unsigned i, nr_data = v->nr_blocks - v->nr_redundant; - int ret; - - BUG_ON(s->h->s == s); - - closure_sync(&s->iodone); - - if (!s->err) { - for (i = 0; i < nr_data; i++) - if (s->blocks[i]) { - ob = c->open_buckets + s->blocks[i]; - - if (ob->sectors_free) - zero_out_rest_of_ec_bucket(c, s, i, ob); - } - } - - if (s->err) { - if (!bch2_err_matches(s->err, EROFS)) - bch_err(c, "error creating stripe: error writing data buckets"); - ret = s->err; - goto err; - } - - if (s->have_existing_stripe) { - ec_validate_checksums(c, &s->existing_stripe); - - if (ec_do_recov(c, &s->existing_stripe)) { - bch_err(c, "error creating stripe: error reading existing stripe"); - ret = bch_err_throw(c, ec_block_read); - goto err; - } - - for (i = 0; i < nr_data; i++) - if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i)) - swap(s->new_stripe.data[i], - s->existing_stripe.data[i]); - - ec_stripe_buf_exit(&s->existing_stripe); - } - - BUG_ON(!s->allocated); - BUG_ON(!s->idx); - - ec_generate_ec(&s->new_stripe); - - ec_generate_checksums(&s->new_stripe); - - /* write p/q: */ - for (i = nr_data; i < v->nr_blocks; i++) - ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); - closure_sync(&s->iodone); - - if (ec_nr_failed(&s->new_stripe)) { - bch_err(c, "error creating stripe: error writing redundancy buckets"); - ret = bch_err_throw(c, ec_block_write); - goto err; - } - - ret = bch2_trans_commit_do(c, &s->res, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc, - ec_stripe_key_update(trans, - s->have_existing_stripe - ? bkey_i_to_stripe(&s->existing_stripe.key) - : NULL, - bkey_i_to_stripe(&s->new_stripe.key))); - bch_err_msg(c, ret, "creating stripe key"); - if (ret) { - goto err; - } - - ret = ec_stripe_update_extents(c, &s->new_stripe); - bch_err_msg(c, ret, "error updating extents"); - if (ret) - goto err; -err: - trace_stripe_create(c, s->idx, ret); - - bch2_disk_reservation_put(c, &s->res); - - for (i = 0; i < v->nr_blocks; i++) - if (s->blocks[i]) { - ob = c->open_buckets + s->blocks[i]; - - if (i < nr_data) { - ob->ec = NULL; - __bch2_open_bucket_put(c, ob); - } else { - bch2_open_bucket_put(c, ob); - } - } - - mutex_lock(&c->ec_stripe_new_lock); - list_del(&s->list); - mutex_unlock(&c->ec_stripe_new_lock); - wake_up(&c->ec_stripe_new_wait); - - ec_stripe_buf_exit(&s->existing_stripe); - ec_stripe_buf_exit(&s->new_stripe); - closure_debug_destroy(&s->iodone); - - ec_stripe_new_put(c, s, STRIPE_REF_stripe); -} - -static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) -{ - struct ec_stripe_new *s; - - mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry(s, &c->ec_stripe_new_list, list) - if (!atomic_read(&s->ref[STRIPE_REF_io])) - goto out; - s = NULL; -out: - mutex_unlock(&c->ec_stripe_new_lock); - - return s; -} - -static void ec_stripe_create_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, - struct bch_fs, ec_stripe_create_work); - struct ec_stripe_new *s; - - while ((s = get_pending_stripe(c))) - ec_stripe_create(s); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); -} - -void bch2_ec_do_stripe_creates(struct bch_fs *c) -{ - enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create); - - if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); -} - -static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h) -{ - struct ec_stripe_new *s = h->s; - - 
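	/*
	 * (Editor's note: descriptive comment, not part of the original diff.)
	 * ec_stripe_new_set_pending() is the handoff point in the stripe
	 * lifecycle visible in this file: the stripe is detached from its
	 * ec_stripe_head, marked pending, and queued on the filesystem-wide
	 * ec_stripe_new_list; dropping the STRIPE_REF_io reference below is
	 * what eventually lets ec_stripe_create_work() pick it up, since
	 * get_pending_stripe() only returns stripes whose io refcount has
	 * reached zero.
	 */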
lockdep_assert_held(&h->lock); - - BUG_ON(!s->allocated && !s->err); - - h->s = NULL; - s->pending = true; - - mutex_lock(&c->ec_stripe_new_lock); - list_add(&s->list, &c->ec_stripe_new_list); - mutex_unlock(&c->ec_stripe_new_lock); - - ec_stripe_new_put(c, s, STRIPE_REF_io); -} - -static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err) -{ - h->s->err = err; - ec_stripe_new_set_pending(c, h); -} - -void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err) -{ - struct ec_stripe_new *s = ob->ec; - - s->err = err; -} - -void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) -{ - struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); - if (!ob) - return NULL; - - BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); - - struct bch_dev *ca = ob_dev(c, ob); - unsigned offset = ca->mi.bucket_size - ob->sectors_free; - - return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); -} - -static int unsigned_cmp(const void *_l, const void *_r) -{ - unsigned l = *((const unsigned *) _l); - unsigned r = *((const unsigned *) _r); - - return cmp_int(l, r); -} - -/* pick most common bucket size: */ -static unsigned pick_blocksize(struct bch_fs *c, - struct bch_devs_mask *devs) -{ - unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX]; - struct { - unsigned nr, size; - } cur = { 0, 0 }, best = { 0, 0 }; - - for_each_member_device_rcu(c, ca, devs) - sizes[nr++] = ca->mi.bucket_size; - - sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); - - for (unsigned i = 0; i < nr; i++) { - if (sizes[i] != cur.size) { - if (cur.nr > best.nr) - best = cur; - - cur.nr = 0; - cur.size = sizes[i]; - } - - cur.nr++; - } - - if (cur.nr > best.nr) - best = cur; - - return best.size; -} - -static bool may_create_new_stripe(struct bch_fs *c) -{ - return false; -} - -static void ec_stripe_key_init(struct bch_fs *c, - struct bkey_i *k, - unsigned nr_data, - unsigned nr_parity, - unsigned stripe_size, - unsigned disk_label) -{ - struct bkey_i_stripe *s = bkey_stripe_init(k); - unsigned u64s; - - s->v.sectors = cpu_to_le16(stripe_size); - s->v.algorithm = 0; - s->v.nr_blocks = nr_data + nr_parity; - s->v.nr_redundant = nr_parity; - s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); - s->v.csum_type = BCH_CSUM_crc32c; - s->v.disk_label = disk_label; - - while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { - BUG_ON(1 << s->v.csum_granularity_bits >= - le16_to_cpu(s->v.sectors) || - s->v.csum_granularity_bits == U8_MAX); - s->v.csum_granularity_bits++; - } - - set_bkey_val_u64s(&s->k, u64s); -} - -static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) -{ - struct ec_stripe_new *s; - - lockdep_assert_held(&h->lock); - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return NULL; - - mutex_init(&s->lock); - closure_init(&s->iodone, NULL); - atomic_set(&s->ref[STRIPE_REF_stripe], 1); - atomic_set(&s->ref[STRIPE_REF_io], 1); - s->c = c; - s->h = h; - s->nr_data = min_t(unsigned, h->nr_active_devs, - BCH_BKEY_PTRS_MAX) - h->redundancy; - s->nr_parity = h->redundancy; - - ec_stripe_key_init(c, &s->new_stripe.key, - s->nr_data, s->nr_parity, - h->blocksize, h->disk_label); - return s; -} - -static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) -{ - struct bch_devs_mask devs = h->devs; - unsigned nr_devs, nr_devs_with_durability; - - scoped_guard(rcu) { - h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label - ? 
group_to_target(h->disk_label - 1) - : 0); - nr_devs = dev_mask_nr(&h->devs); - - for_each_member_device_rcu(c, ca, &h->devs) - if (!ca->mi.durability) - __clear_bit(ca->dev_idx, h->devs.d); - nr_devs_with_durability = dev_mask_nr(&h->devs); - - h->blocksize = pick_blocksize(c, &h->devs); - - h->nr_active_devs = 0; - for_each_member_device_rcu(c, ca, &h->devs) - if (ca->mi.bucket_size == h->blocksize) - h->nr_active_devs++; - } - - /* - * If we only have redundancy + 1 devices, we're better off with just - * replication: - */ - h->insufficient_devs = h->nr_active_devs < h->redundancy + 2; - - if (h->insufficient_devs) { - const char *err; - - if (nr_devs < h->redundancy + 2) - err = NULL; - else if (nr_devs_with_durability < h->redundancy + 2) - err = "cannot use durability=0 devices"; - else - err = "mismatched bucket sizes"; - - if (err) - bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s", - h->nr_active_devs, h->redundancy + 2, err); - } - - struct bch_devs_mask devs_leaving; - bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX); - - if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving)) - ec_stripe_new_cancel(c, h, -EINTR); - - h->rw_devs_change_count = c->rw_devs_change_count; -} - -static struct ec_stripe_head * -ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label, - unsigned algo, unsigned redundancy, - enum bch_watermark watermark) -{ - struct ec_stripe_head *h; - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return NULL; - - mutex_init(&h->lock); - BUG_ON(!mutex_trylock(&h->lock)); - - h->disk_label = disk_label; - h->algo = algo; - h->redundancy = redundancy; - h->watermark = watermark; - - list_add(&h->list, &c->ec_stripe_head_list); - return h; -} - -void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) -{ - if (h->s && - h->s->allocated && - bitmap_weight(h->s->blocks_allocated, - h->s->nr_data) == h->s->nr_data) - ec_stripe_new_set_pending(c, h); - - mutex_unlock(&h->lock); -} - -static struct ec_stripe_head * -__bch2_ec_stripe_head_get(struct btree_trans *trans, - unsigned disk_label, - unsigned algo, - unsigned redundancy, - enum bch_watermark watermark) -{ - struct bch_fs *c = trans->c; - struct ec_stripe_head *h; - int ret; - - if (!redundancy) - return NULL; - - ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock); - if (ret) - return ERR_PTR(ret); - - if (test_bit(BCH_FS_going_ro, &c->flags)) { - h = ERR_PTR(-BCH_ERR_erofs_no_writes); - goto err; - } - - list_for_each_entry(h, &c->ec_stripe_head_list, list) - if (h->disk_label == disk_label && - h->algo == algo && - h->redundancy == redundancy && - h->watermark == watermark) { - ret = bch2_trans_mutex_lock(trans, &h->lock); - if (ret) { - h = ERR_PTR(ret); - goto err; - } - goto found; - } - - h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark); - if (!h) { - h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc); - goto err; - } -found: - if (h->rw_devs_change_count != c->rw_devs_change_count) - ec_stripe_head_devs_update(c, h); - - if (h->insufficient_devs) { - mutex_unlock(&h->lock); - h = NULL; - } -err: - mutex_unlock(&c->ec_stripe_head_lock); - return h; -} - -static int new_stripe_alloc_buckets(struct btree_trans *trans, - struct alloc_request *req, - struct ec_stripe_head *h, struct ec_stripe_new *s, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - struct open_bucket *ob; - struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; - unsigned i, j, nr_have_parity = 0, nr_have_data = 0; - 
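	/*
	 * (Editor's note: descriptive comment, not part of the original diff.)
	 * Summary of new_stripe_alloc_buckets(), from the body below: the
	 * allocation request's fields (data_type, ptrs, nr_replicas,
	 * nr_effective, have_cache, devs_may_alloc) are saved to the scratch
	 * copies and restored on exit; devices already holding a block of this
	 * stripe are masked out of devs_may_alloc; then
	 * bch2_bucket_alloc_set_trans() runs up to twice, first to fill in
	 * missing parity blocks, then missing data blocks.
	 */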
int ret = 0; - - req->scratch_data_type = req->data_type; - req->scratch_ptrs = req->ptrs; - req->scratch_nr_replicas = req->nr_replicas; - req->scratch_nr_effective = req->nr_effective; - req->scratch_have_cache = req->have_cache; - req->scratch_devs_may_alloc = req->devs_may_alloc; - - req->devs_may_alloc = h->devs; - req->have_cache = true; - - BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); - BUG_ON(v->nr_redundant != s->nr_parity); - - /* * We bypass the sector allocator which normally does this: */ - bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d, - c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); - - for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { - /* - * Note: we don't yet repair invalid blocks (failed/removed - * devices) when reusing stripes - we still need a codepath to - * walk backpointers and update all extents that point to that - * block when updating the stripe - */ - if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) - __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d); - - if (i < s->nr_data) - nr_have_data++; - else - nr_have_parity++; - } - - BUG_ON(nr_have_data > s->nr_data); - BUG_ON(nr_have_parity > s->nr_parity); - - req->ptrs.nr = 0; - if (nr_have_parity < s->nr_parity) { - req->nr_replicas = s->nr_parity; - req->nr_effective = nr_have_parity; - req->data_type = BCH_DATA_parity; - - ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl); - - open_bucket_for_each(c, &req->ptrs, ob, i) { - j = find_next_zero_bit(s->blocks_gotten, - s->nr_data + s->nr_parity, - s->nr_data); - BUG_ON(j >= s->nr_data + s->nr_parity); - - s->blocks[j] = req->ptrs.v[i]; - v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, s->blocks_gotten); - } - - if (ret) - goto err; - } - - req->ptrs.nr = 0; - if (nr_have_data < s->nr_data) { - req->nr_replicas = s->nr_data; - req->nr_effective = nr_have_data; - req->data_type = BCH_DATA_user; - - ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl); - - open_bucket_for_each(c, &req->ptrs, ob, i) { - j = find_next_zero_bit(s->blocks_gotten, - s->nr_data, 0); - BUG_ON(j >= s->nr_data); - - s->blocks[j] = req->ptrs.v[i]; - v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, s->blocks_gotten); - } - - if (ret) - goto err; - } -err: - req->data_type = req->scratch_data_type; - req->ptrs = req->scratch_ptrs; - req->nr_replicas = req->scratch_nr_replicas; - req->nr_effective = req->scratch_nr_effective; - req->have_cache = req->scratch_have_cache; - req->devs_may_alloc = req->scratch_devs_may_alloc; - return ret; -} - -static int __get_existing_stripe(struct btree_trans *trans, - struct ec_stripe_head *head, - struct ec_stripe_buf *stripe, - u64 idx) -{ - struct bch_fs *c = trans->c; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_stripes, POS(0, idx), 0); - int ret = bkey_err(k); - if (ret) - goto err; - - /* We expect write buffer races here */ - if (k.k->type != KEY_TYPE_stripe) - goto out; - - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - if (stripe_lru_pos(s.v) <= 1) - goto out; - - if (s.v->disk_label == head->disk_label && - s.v->algorithm == head->algo && - s.v->nr_redundant == head->redundancy && - le16_to_cpu(s.v->sectors) == head->blocksize && - bch2_try_open_stripe(c, head->s, idx)) { - bkey_reassemble(&stripe->key, k); - ret = 1; - } -out: - bch2_set_btree_iter_dontneed(trans, &iter); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s) -{ - struct bch_stripe 
*new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v; - struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v; - unsigned i; - - BUG_ON(existing_v->nr_redundant != s->nr_parity); - s->nr_data = existing_v->nr_blocks - - existing_v->nr_redundant; - - int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); - if (ret) { - bch2_stripe_close(c, s); - return ret; - } - - BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); - - /* - * Free buckets we initially allocated - they might conflict with - * blocks from the stripe we're reusing: - */ - for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) { - bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]); - s->blocks[i] = 0; - } - memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten)); - memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated)); - - for (unsigned i = 0; i < existing_v->nr_blocks; i++) { - if (stripe_blockcount_get(existing_v, i)) { - __set_bit(i, s->blocks_gotten); - __set_bit(i, s->blocks_allocated); - } - - ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone); - } - - bkey_copy(&s->new_stripe.key, &s->existing_stripe.key); - s->have_existing_stripe = true; - - return 0; -} - -static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h, - struct ec_stripe_new *s) -{ - struct bch_fs *c = trans->c; - - /* - * If we can't allocate a new stripe, and there's no stripes with empty - * blocks for us to reuse, that means we have to wait on copygc: - */ - if (may_create_new_stripe(c)) - return -1; - - struct btree_iter lru_iter; - struct bkey_s_c lru_k; - int ret = 0; - - for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru, - lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0), - lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX), - 0, lru_k, ret) { - ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &lru_iter); - if (!ret) - ret = bch_err_throw(c, stripe_alloc_blocked); - if (ret == 1) - ret = 0; - if (ret) - return ret; - - return init_new_stripe_from_existing(c, s); -} - -static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h, - struct ec_stripe_new *s) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bpos min_pos = POS(0, 1); - struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); - int ret; - - if (!s->res.sectors) { - ret = bch2_disk_reservation_get(c, &s->res, - h->blocksize, - s->nr_parity, - BCH_DISK_RESERVATION_NOFAIL); - if (ret) - return ret; - } - - /* - * Allocate stripe slot - * XXX: we're going to need a bitrange btree of free stripes - */ - for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, - BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { - if (bkey_gt(k.k->p, POS(0, U32_MAX))) { - if (start_pos.offset) { - start_pos = min_pos; - bch2_btree_iter_set_pos(trans, &iter, start_pos); - continue; - } - - ret = bch_err_throw(c, ENOSPC_stripe_create); - break; - } - - if (bkey_deleted(k.k) && - bch2_try_open_stripe(c, s, k.k->p.offset)) - break; - } - - c->ec_stripe_hint = iter.pos.offset; - - if (ret) - goto err; - - ret = ec_stripe_mem_alloc(trans, &iter); - if (ret) { - bch2_stripe_close(c, s); - goto err; - } - - s->new_stripe.key.k.p = iter.pos; -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -err: - bch2_disk_reservation_put(c, &s->res); - goto out; -} - -struct ec_stripe_head 
*bch2_ec_stripe_head_get(struct btree_trans *trans, - struct alloc_request *req, - unsigned algo, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - unsigned redundancy = req->nr_replicas - 1; - unsigned disk_label = 0; - struct target t = target_decode(req->target); - bool waiting = false; - int ret; - - if (t.type == TARGET_GROUP) { - if (t.group > U8_MAX) { - bch_err(c, "cannot create a stripe when disk_label > U8_MAX"); - return NULL; - } - disk_label = t.group + 1; /* 0 == no label */ - } - - struct ec_stripe_head *h = - __bch2_ec_stripe_head_get(trans, disk_label, algo, - redundancy, req->watermark); - if (IS_ERR_OR_NULL(h)) - return h; - - if (!h->s) { - h->s = ec_new_stripe_alloc(c, h); - if (!h->s) { - ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc); - bch_err(c, "failed to allocate new stripe"); - goto err; - } - - h->nr_created++; - } - - struct ec_stripe_new *s = h->s; - - if (s->allocated) - goto allocated; - - if (s->have_existing_stripe) - goto alloc_existing; - - /* First, try to allocate a full stripe: */ - enum bch_watermark saved_watermark = BCH_WATERMARK_stripe; - swap(req->watermark, saved_watermark); - ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h, s); - swap(req->watermark, saved_watermark); - - if (!ret) - goto allocate_buf; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, ENOMEM)) - goto err; - - /* - * Not enough buckets available for a full stripe: we must reuse an - * existing stripe: - */ - while (1) { - ret = __bch2_ec_stripe_head_reuse(trans, h, s); - if (!ret) - break; - if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) - goto err; - - if (req->watermark == BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h, s); - if (ret) - goto err; - goto allocate_buf; - } - - /* XXX freelist_wait? */ - closure_wait(&c->freelist_wait, cl); - waiting = true; - } - - if (waiting) - closure_wake_up(&c->freelist_wait); -alloc_existing: - /* - * Retry allocating buckets, with the watermark for this - * particular write: - */ - ret = new_stripe_alloc_buckets(trans, req, h, s, cl); - if (ret) - goto err; - -allocate_buf: - ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize); - if (ret) - goto err; - - s->allocated = true; -allocated: - BUG_ON(!s->idx); - BUG_ON(!s->new_stripe.data[0]); - BUG_ON(trans->restarted); - return h; -err: - bch2_ec_stripe_head_put(c, h); - return ERR_PTR(ret); -} - -/* device removal */ - -int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - unsigned dev_idx, - unsigned flags) -{ - if (k.k->type != KEY_TYPE_stripe) - return 0; - - struct bch_fs *c = trans->c; - struct bkey_i_stripe *s = - bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe); - int ret = PTR_ERR_OR_ZERO(s); - if (ret) - return ret; - - struct disk_accounting_pos acc; - - s64 sectors = 0; - for (unsigned i = 0; i < s->v.nr_blocks; i++) - sectors -= stripe_blockcount_get(&s->v, i); - - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); - acc.replicas.data_type = BCH_DATA_user; - ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); - if (ret) - return ret; - - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); - - /* XXX: how much redundancy do we still have? 
check degraded flags */ - - unsigned nr_good = 0; - - scoped_guard(rcu) - bkey_for_each_ptr(ptrs, ptr) { - if (ptr->dev == dev_idx) - ptr->dev = BCH_SB_MEMBER_INVALID; - - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; - } - - if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) - return bch_err_throw(c, remove_would_lose_data); - - unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant; - - if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST)) - return bch_err_throw(c, remove_would_lose_data); - - sectors = -sectors; - - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); - acc.replicas.data_type = BCH_DATA_user; - return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); -} - -static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a, - unsigned flags) -{ - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); - - if (!a->stripe) - return 0; - - if (a->stripe_sectors) { - struct bch_fs *c = trans->c; - bch_err(c, "trying to invalidate device in stripe when bucket has stripe data"); - return bch_err_throw(c, invalidate_stripe_to_dev); - } - - struct btree_iter iter; - struct bkey_s_c_stripe s = - bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), - BTREE_ITER_slots, stripe); - int ret = bkey_err(s); - if (ret) - return ret; - - ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_max_commit(trans, iter, - BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), - BTREE_ITER_intent, k, - NULL, NULL, 0, ({ - bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags); - }))); - bch_err_fn(c, ret); - return ret; -} - -/* startup/shutdown */ - -static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) -{ - struct ec_stripe_head *h; - struct open_bucket *ob; - unsigned i; - - mutex_lock(&c->ec_stripe_head_lock); - list_for_each_entry(h, &c->ec_stripe_head_list, list) { - mutex_lock(&h->lock); - if (!h->s) - goto unlock; - - if (!ca) - goto found; - - for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { - if (!h->s->blocks[i]) - continue; - - ob = c->open_buckets + h->s->blocks[i]; - if (ob->dev == ca->dev_idx) - goto found; - } - goto unlock; -found: - ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes); -unlock: - mutex_unlock(&h->lock); - } - mutex_unlock(&c->ec_stripe_head_lock); -} - -void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) -{ - __bch2_ec_stop(c, ca); -} - -void bch2_fs_ec_stop(struct bch_fs *c) -{ - __bch2_ec_stop(c, NULL); -} - -static bool bch2_fs_ec_flush_done(struct bch_fs *c) -{ - sched_annotate_sleep(); - - mutex_lock(&c->ec_stripe_new_lock); - bool ret = list_empty(&c->ec_stripe_new_list); - mutex_unlock(&c->ec_stripe_new_lock); - - return ret; -} - -void bch2_fs_ec_flush(struct bch_fs *c) -{ - wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c)); -} - -int bch2_stripes_read(struct bch_fs *c) -{ - return 0; -} - -static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, - struct ec_stripe_new *s) -{ - prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs", - s->idx, s->nr_data, s->nr_parity, - 
bitmap_weight(s->blocks_allocated, s->nr_data), - atomic_read(&s->ref[STRIPE_REF_io]), - atomic_read(&s->ref[STRIPE_REF_stripe]), - bch2_watermarks[s->h->watermark]); - - struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; - unsigned i; - for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) - prt_printf(out, " %u", s->blocks[i]); - prt_newline(out); - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key)); - prt_newline(out); -} - -void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct ec_stripe_head *h; - struct ec_stripe_new *s; - - mutex_lock(&c->ec_stripe_head_lock); - list_for_each_entry(h, &c->ec_stripe_head_list, list) { - prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n", - h->disk_label, h->algo, h->redundancy, - bch2_watermarks[h->watermark], - h->nr_created); - - if (h->s) - bch2_new_stripe_to_text(out, c, h->s); - } - mutex_unlock(&c->ec_stripe_head_lock); - - prt_printf(out, "in flight:\n"); - - mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry(s, &c->ec_stripe_new_list, list) - bch2_new_stripe_to_text(out, c, s); - mutex_unlock(&c->ec_stripe_new_lock); -} - -void bch2_fs_ec_exit(struct bch_fs *c) -{ - struct ec_stripe_head *h; - unsigned i; - - while (1) { - mutex_lock(&c->ec_stripe_head_lock); - h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list); - mutex_unlock(&c->ec_stripe_head_lock); - - if (!h) - break; - - if (h->s) { - for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) - BUG_ON(h->s->blocks[i]); - - kfree(h->s); - } - kfree(h); - } - - BUG_ON(!list_empty(&c->ec_stripe_new_list)); - - bioset_exit(&c->ec_bioset); -} - -void bch2_fs_ec_init_early(struct bch_fs *c) -{ - spin_lock_init(&c->ec_stripes_new_lock); - - INIT_LIST_HEAD(&c->ec_stripe_head_list); - mutex_init(&c->ec_stripe_head_lock); - - INIT_LIST_HEAD(&c->ec_stripe_new_list); - mutex_init(&c->ec_stripe_new_lock); - init_waitqueue_head(&c->ec_stripe_new_wait); - - INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); - INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); -} - -int bch2_fs_ec_init(struct bch_fs *c) -{ - return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), - BIOSET_NEED_BVECS); -} - -static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans, - struct bkey_s_c k, - struct bkey_buf *last_flushed) -{ - if (k.k->type != KEY_TYPE_stripe) - return 0; - - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - u64 lru_idx = stripe_lru_pos(s.v); - if (lru_idx) { - int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION, - k.k->p.offset, lru_idx, k, last_flushed); - if (ret) - return ret; - } - return 0; -} - -int bch2_check_stripe_to_lru_refs(struct bch_fs *c) -{ - struct bkey_buf last_flushed; - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, - POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_stripe_to_lru_ref(trans, k, &last_flushed))); - - bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); - return ret; -} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h deleted file mode 100644 index 548048adf0d5..000000000000 --- a/fs/bcachefs/ec.h +++ /dev/null @@ -1,309 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EC_H -#define _BCACHEFS_EC_H - -#include "ec_types.h" -#include "buckets_types.h" -#include "extents_types.h" - -int bch2_stripe_validate(struct bch_fs *, 
struct bkey_s_c, - struct bkey_validate_context); -void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); -int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_stripe ((struct bkey_ops) { \ - .key_validate = bch2_stripe_validate, \ - .val_to_text = bch2_stripe_to_text, \ - .swab = bch2_ptr_swab, \ - .trigger = bch2_trigger_stripe, \ - .min_val_size = 8, \ -}) - -static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) -{ - return DIV_ROUND_UP(le16_to_cpu(s->sectors), - 1 << s->csum_granularity_bits); -} - -static inline unsigned stripe_csum_offset(const struct bch_stripe *s, - unsigned dev, unsigned csum_idx) -{ - EBUG_ON(s->csum_type >= BCH_CSUM_NR); - - unsigned csum_bytes = bch_crc_bytes[s->csum_type]; - - return sizeof(struct bch_stripe) + - sizeof(struct bch_extent_ptr) * s->nr_blocks + - (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; -} - -static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, - unsigned idx) -{ - return stripe_csum_offset(s, s->nr_blocks, 0) + - sizeof(u16) * idx; -} - -static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, - unsigned idx) -{ - return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); -} - -static inline void stripe_blockcount_set(struct bch_stripe *s, - unsigned idx, unsigned v) -{ - __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); - - *p = cpu_to_le16(v); -} - -static inline unsigned stripe_val_u64s(const struct bch_stripe *s) -{ - return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), - sizeof(u64)); -} - -static inline void *stripe_csum(struct bch_stripe *s, - unsigned block, unsigned csum_idx) -{ - EBUG_ON(block >= s->nr_blocks); - EBUG_ON(csum_idx >= stripe_csums_per_device(s)); - - return (void *) s + stripe_csum_offset(s, block, csum_idx); -} - -static inline struct bch_csum stripe_csum_get(struct bch_stripe *s, - unsigned block, unsigned csum_idx) -{ - struct bch_csum csum = { 0 }; - - memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]); - return csum; -} - -static inline void stripe_csum_set(struct bch_stripe *s, - unsigned block, unsigned csum_idx, - struct bch_csum csum) -{ - memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); -} - -#define STRIPE_LRU_POS_EMPTY 1 - -static inline u64 stripe_lru_pos(const struct bch_stripe *s) -{ - if (!s) - return 0; - - unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0; - - for (unsigned i = 0; i < nr_data; i++) - blocks_empty += !stripe_blockcount_get(s, i); - - /* Will be picked up by the stripe_delete worker */ - if (blocks_empty == nr_data) - return STRIPE_LRU_POS_EMPTY; - - if (!blocks_empty) - return 0; - - /* invert: more blocks empty = reuse first */ - return LRU_TIME_MAX - blocks_empty; -} - -static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, - const struct bch_extent_ptr *data_ptr, - unsigned sectors) -{ - return (data_ptr->dev == stripe_ptr->dev || - data_ptr->dev == BCH_SB_MEMBER_INVALID || - stripe_ptr->dev == BCH_SB_MEMBER_INVALID) && - data_ptr->gen == stripe_ptr->gen && - data_ptr->offset >= stripe_ptr->offset && - data_ptr->offset < stripe_ptr->offset + sectors; -} - -static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, - struct extent_ptr_decoded p) -{ - unsigned nr_data = s->nr_blocks - s->nr_redundant; - - BUG_ON(!p.has_ec); - - 
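	/*
	 * (Editor's note: descriptive comment, not part of the original diff.)
	 * Per __bch2_ptr_matches_stripe() above, a data pointer matches a
	 * stripe block when the two devices agree (or either side has been set
	 * to BCH_SB_MEMBER_INVALID), the bucket generations match, and the
	 * data pointer's offset lands within the stripe block's sector range.
	 */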
if (p.ec.block >= nr_data) - return false; - - return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr, - le16_to_cpu(s->sectors)); -} - -static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, - struct extent_ptr_decoded p) -{ - unsigned nr_data = m->nr_blocks - m->nr_redundant; - - BUG_ON(!p.has_ec); - - if (p.ec.block >= nr_data) - return false; - - return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr, - m->sectors); -} - -static inline void gc_stripe_unlock(struct gc_stripe *s) -{ - BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); - - clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock); - smp_mb__after_atomic(); - wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR); -} - -static inline void gc_stripe_lock(struct gc_stripe *s) -{ - wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR, - TASK_UNINTERRUPTIBLE); -} - -struct bch_read_bio; - -struct ec_stripe_buf { - /* might not be buffering the entire stripe: */ - unsigned offset; - unsigned size; - unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; - - void *data[BCH_BKEY_PTRS_MAX]; - - __BKEY_PADDED(key, 255); -}; - -struct ec_stripe_head; - -enum ec_stripe_ref { - STRIPE_REF_io, - STRIPE_REF_stripe, - STRIPE_REF_NR -}; - -struct ec_stripe_new { - struct bch_fs *c; - struct ec_stripe_head *h; - struct mutex lock; - struct list_head list; - - struct hlist_node hash; - u64 idx; - - struct closure iodone; - - atomic_t ref[STRIPE_REF_NR]; - - int err; - - u8 nr_data; - u8 nr_parity; - bool allocated; - bool pending; - bool have_existing_stripe; - - unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; - unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; - open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; - struct disk_reservation res; - - struct ec_stripe_buf new_stripe; - struct ec_stripe_buf existing_stripe; -}; - -struct ec_stripe_head { - struct list_head list; - struct mutex lock; - - unsigned disk_label; - unsigned algo; - unsigned redundancy; - enum bch_watermark watermark; - bool insufficient_devs; - - unsigned long rw_devs_change_count; - - u64 nr_created; - - struct bch_devs_mask devs; - unsigned nr_active_devs; - - unsigned blocksize; - - struct dev_stripe_state block_stripe; - struct dev_stripe_state parity_stripe; - - struct ec_stripe_new *s; -}; - -int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c); - -void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); - -void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); - -int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); - -void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); - -struct alloc_request; -struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, - struct alloc_request *, unsigned, struct closure *); - -void bch2_do_stripe_deletes(struct bch_fs *); -void bch2_ec_do_stripe_creates(struct bch_fs *); -void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); - -static inline void ec_stripe_new_get(struct ec_stripe_new *s, - enum ec_stripe_ref ref) -{ - atomic_inc(&s->ref[ref]); -} - -static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, - enum ec_stripe_ref ref) -{ - BUG_ON(atomic_read(&s->ref[ref]) <= 0); - - if (atomic_dec_and_test(&s->ref[ref])) - switch (ref) { - case STRIPE_REF_stripe: - bch2_ec_stripe_new_free(c, s); - break; - case STRIPE_REF_io: - bch2_ec_do_stripe_creates(c); - break; - default: - BUG(); - } -} - -int 
bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *, - struct bkey_s_c, unsigned, unsigned); -int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned); - -void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); -void bch2_fs_ec_stop(struct bch_fs *); -void bch2_fs_ec_flush(struct bch_fs *); - -int bch2_stripes_read(struct bch_fs *); - -void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); - -void bch2_fs_ec_exit(struct bch_fs *); -void bch2_fs_ec_init_early(struct bch_fs *); -int bch2_fs_ec_init(struct bch_fs *); - -int bch2_check_stripe_to_lru_refs(struct bch_fs *); - -#endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h deleted file mode 100644 index b9770f24f213..000000000000 --- a/fs/bcachefs/ec_format.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EC_FORMAT_H -#define _BCACHEFS_EC_FORMAT_H - -struct bch_stripe { - struct bch_val v; - __le16 sectors; - __u8 algorithm; - __u8 nr_blocks; - __u8 nr_redundant; - - __u8 csum_granularity_bits; - __u8 csum_type; - - /* - * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2 - * - * we can manage with this because this only needs to point to a - * disk label, not a target: - */ - __u8 disk_label; - - /* - * Variable length sections: - * - Pointers - * - Checksums - * 2D array of [stripe block/device][csum block], with checksum block - * size given by csum_granularity_bits - * - Block sector counts: per-block array of u16s - * - * XXX: - * Either checksums should have come last, or we should have included a - * checksum_size field (the size in bytes of the checksum itself, not - * the blocksize the checksum covers). - * - * Currently we aren't able to access the block sector counts if the - * checksum type is unknown. - */ - - struct bch_extent_ptr ptrs[]; -} __packed __aligned(8); - -#endif /* _BCACHEFS_EC_FORMAT_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h deleted file mode 100644 index 809446c78951..000000000000 --- a/fs/bcachefs/ec_types.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EC_TYPES_H -#define _BCACHEFS_EC_TYPES_H - -#include "bcachefs_format.h" - -union bch_replicas_padded { - u8 bytes[struct_size_t(struct bch_replicas_entry_v1, - devs, BCH_BKEY_PTRS_MAX)]; - struct bch_replicas_entry_v1 e; -}; - -struct stripe { - size_t heap_idx; - u16 sectors; - u8 algorithm; - u8 nr_blocks; - u8 nr_redundant; - u8 blocks_nonempty; - u8 disk_label; -}; - -struct gc_stripe { - u8 lock; - unsigned alive:1; /* does a corresponding key exist in stripes btree? 
*/ - u16 sectors; - u8 nr_blocks; - u8 nr_redundant; - u16 block_sectors[BCH_BKEY_PTRS_MAX]; - struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; - - union bch_replicas_padded r; -}; - -#endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c deleted file mode 100644 index 56ab430f209f..000000000000 --- a/fs/bcachefs/enumerated_ref.c +++ /dev/null @@ -1,144 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "enumerated_ref.h" -#include "util.h" - -#include <linux/completion.h> - -#ifdef ENUMERATED_REF_DEBUG -void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx) -{ - BUG_ON(idx >= ref->nr); - atomic_long_inc(&ref->refs[idx]); -} - -bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) -{ - BUG_ON(idx >= ref->nr); - return atomic_long_inc_not_zero(&ref->refs[idx]); -} - -bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) -{ - BUG_ON(idx >= ref->nr); - return !ref->dying && - atomic_long_inc_not_zero(&ref->refs[idx]); -} - -void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx) -{ - BUG_ON(idx >= ref->nr); - long v = atomic_long_dec_return(&ref->refs[idx]); - - BUG_ON(v < 0); - if (v) - return; - - for (unsigned i = 0; i < ref->nr; i++) - if (atomic_long_read(&ref->refs[i])) - return; - - if (ref->stop_fn) - ref->stop_fn(ref); - complete(&ref->stop_complete); -} -#endif - -#ifndef ENUMERATED_REF_DEBUG -static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref) -{ - struct enumerated_ref *ref = - container_of(percpu_ref, struct enumerated_ref, ref); - - if (ref->stop_fn) - ref->stop_fn(ref); - complete(&ref->stop_complete); -} -#endif - -void enumerated_ref_stop_async(struct enumerated_ref *ref) -{ - reinit_completion(&ref->stop_complete); - -#ifndef ENUMERATED_REF_DEBUG - percpu_ref_kill(&ref->ref); -#else - ref->dying = true; - for (unsigned i = 0; i < ref->nr; i++) - enumerated_ref_put(ref, i); -#endif -} - -void enumerated_ref_stop(struct enumerated_ref *ref, - const char * const names[]) -{ - enumerated_ref_stop_async(ref); - while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n"); - prt_str(&buf, "Outstanding refs:\n"); - enumerated_ref_to_text(&buf, ref, names); - printk(KERN_ERR "%s", buf.buf); - printbuf_exit(&buf); - } -} - -void enumerated_ref_start(struct enumerated_ref *ref) -{ -#ifndef ENUMERATED_REF_DEBUG - percpu_ref_reinit(&ref->ref); -#else - ref->dying = false; - for (unsigned i = 0; i < ref->nr; i++) { - BUG_ON(atomic_long_read(&ref->refs[i])); - atomic_long_inc(&ref->refs[i]); - } -#endif -} - -void enumerated_ref_exit(struct enumerated_ref *ref) -{ -#ifndef ENUMERATED_REF_DEBUG - percpu_ref_exit(&ref->ref); -#else - kfree(ref->refs); - ref->refs = NULL; - ref->nr = 0; -#endif -} - -int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr, - void (*stop_fn)(struct enumerated_ref *)) -{ - init_completion(&ref->stop_complete); - ref->stop_fn = stop_fn; - -#ifndef ENUMERATED_REF_DEBUG - return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb, - PERCPU_REF_INIT_DEAD, GFP_KERNEL); -#else - ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL); - if (!ref->refs) - return -ENOMEM; - - ref->nr = nr; - return 0; -#endif -} - -void enumerated_ref_to_text(struct printbuf *out, - struct enumerated_ref *ref, - const char * const names[]) -{ -#ifdef ENUMERATED_REF_DEBUG - bch2_printbuf_tabstop_push(out, 32); - - for 
(unsigned i = 0; i < ref->nr; i++) - prt_printf(out, "%s\t%li\n", names[i], - atomic_long_read(&ref->refs[i])); -#else - prt_str(out, "(not in debug mode)\n"); -#endif -} diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h deleted file mode 100644 index ec01cf59ef80..000000000000 --- a/fs/bcachefs/enumerated_ref.h +++ /dev/null @@ -1,66 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ENUMERATED_REF_H -#define _BCACHEFS_ENUMERATED_REF_H - -#include "enumerated_ref_types.h" - -/* - * A refcount where the users are enumerated: in debug mode, we create separate - * refcounts for each user, to make leaks and refcount errors easy to track - * down: - */ - -#ifdef ENUMERATED_REF_DEBUG -void enumerated_ref_get(struct enumerated_ref *, unsigned); -bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned); -bool enumerated_ref_tryget(struct enumerated_ref *, unsigned); -void enumerated_ref_put(struct enumerated_ref *, unsigned); -#else - -static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx) -{ - percpu_ref_get(&ref->ref); -} - -static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) -{ - return percpu_ref_tryget(&ref->ref); -} - -static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) -{ - return percpu_ref_tryget_live(&ref->ref); -} - -static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx) -{ - percpu_ref_put(&ref->ref); -} -#endif - -static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref) -{ -#ifndef ENUMERATED_REF_DEBUG - return percpu_ref_is_zero(&ref->ref); -#else - for (unsigned i = 0; i < ref->nr; i++) - if (atomic_long_read(&ref->refs[i])) - return false; - return true; -#endif -} - -void enumerated_ref_stop_async(struct enumerated_ref *); -void enumerated_ref_stop(struct enumerated_ref *, const char * const[]); -void enumerated_ref_start(struct enumerated_ref *); - -void enumerated_ref_exit(struct enumerated_ref *); -int enumerated_ref_init(struct enumerated_ref *, unsigned, - void (*stop_fn)(struct enumerated_ref *)); - -struct printbuf; -void enumerated_ref_to_text(struct printbuf *, - struct enumerated_ref *, - const char * const[]); - -#endif /* _BCACHEFS_ENUMERATED_REF_H */ diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h deleted file mode 100644 index 0e6076f466d3..000000000000 --- a/fs/bcachefs/enumerated_ref_types.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H -#define _BCACHEFS_ENUMERATED_REF_TYPES_H - -#include <linux/percpu-refcount.h> - -struct enumerated_ref { -#ifdef ENUMERATED_REF_DEBUG - unsigned nr; - bool dying; - atomic_long_t *refs; -#else - struct percpu_ref ref; -#endif - void (*stop_fn)(struct enumerated_ref *); - struct completion stop_complete; -}; - -#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */ diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c deleted file mode 100644 index c39cf304c681..000000000000 --- a/fs/bcachefs/errcode.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "errcode.h" -#include "trace.h" - -#include <linux/errname.h> - -static const char * const bch2_errcode_strs[] = { -#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err, - BCH_ERRCODES() -#undef x - NULL -}; - -static const unsigned bch2_errcode_parents[] = { -#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, - BCH_ERRCODES() -#undef x -}; - 
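/*
 * (Editor's sketch, not part of the original diff.) The two tables above are
 * generated with an X-macro: every x(class, err) entry in BCH_ERRCODES()
 * names an error code and its parent class, and re-expanding the list under
 * different definitions of x() produces the name table and the parent table
 * from the same source. A minimal, self-contained illustration of the same
 * pattern, using hypothetical DEMO_* names rather than anything from bcachefs:
 */

#define DEMO_ERRCODES()					\
	x(0,			demo_io_err)		\
	x(DEMO_ERR_demo_io_err,	demo_read_err)		\
	x(DEMO_ERR_demo_io_err,	demo_write_err)

enum {
	DEMO_ERR_START = 2048,
#define x(class, err)	DEMO_ERR_##err,
	DEMO_ERRCODES()
#undef x
	DEMO_ERR_MAX
};

static const char * const demo_errcode_strs[] = {
#define x(class, err)	[DEMO_ERR_##err - DEMO_ERR_START] = #err,
	DEMO_ERRCODES()
#undef x
};

static const unsigned demo_errcode_parents[] = {
#define x(class, err)	[DEMO_ERR_##err - DEMO_ERR_START] = class,
	DEMO_ERRCODES()
#undef x
};

/*
 * Class matching walks the parent chain until it reaches the target class or
 * falls below DEMO_ERR_START, mirroring __bch2_err_matches() below (minus the
 * negative-errno handling there), so e.g.
 * demo_err_matches(DEMO_ERR_demo_read_err, DEMO_ERR_demo_io_err) is true:
 */
static bool demo_err_matches(unsigned err, unsigned class)
{
	while (err >= DEMO_ERR_START && err != class)
		err = demo_errcode_parents[err - DEMO_ERR_START];

	return err == class;
}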
-__attribute__((const)) -const char *bch2_err_str(int err) -{ - const char *errstr; - - err = abs(err); - - BUG_ON(err >= BCH_ERR_MAX); - - if (err >= BCH_ERR_START) - errstr = bch2_errcode_strs[err - BCH_ERR_START]; - else if (err) - errstr = errname(err); - else - errstr = "(No error)"; - return errstr ?: "(Invalid error)"; -} - -__attribute__((const)) -bool __bch2_err_matches(int err, int class) -{ - err = abs(err); - class = abs(class); - - BUG_ON(err >= BCH_ERR_MAX); - BUG_ON(class >= BCH_ERR_MAX); - - while (err >= BCH_ERR_START && err != class) - err = bch2_errcode_parents[err - BCH_ERR_START]; - - return err == class; -} - -int __bch2_err_class(int bch_err) -{ - int std_err = -bch_err; - BUG_ON((unsigned) std_err >= BCH_ERR_MAX); - - while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START]) - std_err = bch2_errcode_parents[std_err - BCH_ERR_START]; - - trace_error_downcast(bch_err, std_err, _RET_IP_); - - return -std_err; -} - -const char *bch2_blk_status_to_str(blk_status_t status) -{ - if (status == BLK_STS_REMOVED) - return "device removed"; - return blk_status_to_str(status); -} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h deleted file mode 100644 index acc3b7b67704..000000000000 --- a/fs/bcachefs/errcode.h +++ /dev/null @@ -1,387 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ERRCODE_H -#define _BCACHEFS_ERRCODE_H - -#define BCH_ERRCODES() \ - x(ERANGE, ERANGE_option_too_small) \ - x(ERANGE, ERANGE_option_too_big) \ - x(EINVAL, injected) \ - x(BCH_ERR_injected, injected_fs_start) \ - x(EINVAL, mount_option) \ - x(BCH_ERR_mount_option, option_name) \ - x(BCH_ERR_mount_option, option_value) \ - x(BCH_ERR_mount_option, option_not_bool) \ - x(ENOMEM, ENOMEM_stripe_buf) \ - x(ENOMEM, ENOMEM_replicas_table) \ - x(ENOMEM, ENOMEM_cpu_replicas) \ - x(ENOMEM, ENOMEM_replicas_gc) \ - x(ENOMEM, ENOMEM_disk_groups_validate) \ - x(ENOMEM, ENOMEM_disk_groups_to_cpu) \ - x(ENOMEM, ENOMEM_mark_snapshot) \ - x(ENOMEM, ENOMEM_mark_stripe) \ - x(ENOMEM, ENOMEM_mark_stripe_ptr) \ - x(ENOMEM, ENOMEM_btree_key_cache_create) \ - x(ENOMEM, ENOMEM_btree_key_cache_fill) \ - x(ENOMEM, ENOMEM_btree_key_cache_insert) \ - x(ENOMEM, ENOMEM_trans_kmalloc) \ - x(ENOMEM, ENOMEM_trans_log_msg) \ - x(ENOMEM, ENOMEM_do_encrypt) \ - x(ENOMEM, ENOMEM_ec_read_extent) \ - x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \ - x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \ - x(ENOMEM, ENOMEM_fs_btree_cache_init) \ - x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \ - x(ENOMEM, ENOMEM_fs_counters_init) \ - x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \ - x(ENOMEM, ENOMEM_io_clock_init) \ - x(ENOMEM, ENOMEM_blacklist_table_init) \ - x(ENOMEM, ENOMEM_sb_realloc_injected) \ - x(ENOMEM, ENOMEM_sb_bio_realloc) \ - x(ENOMEM, ENOMEM_sb_buf_realloc) \ - x(ENOMEM, ENOMEM_sb_journal_validate) \ - x(ENOMEM, ENOMEM_sb_journal_v2_validate) \ - x(ENOMEM, ENOMEM_journal_entry_add) \ - x(ENOMEM, ENOMEM_journal_read_buf_realloc) \ - x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\ - x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \ - x(ENOMEM, ENOMEM_bio_read_init) \ - x(ENOMEM, ENOMEM_bio_read_split_init) \ - x(ENOMEM, ENOMEM_bio_write_init) \ - x(ENOMEM, ENOMEM_bio_bounce_pages_init) \ - x(ENOMEM, ENOMEM_writepage_bioset_init) \ - x(ENOMEM, ENOMEM_dio_read_bioset_init) \ - x(ENOMEM, ENOMEM_dio_write_bioset_init) \ - x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \ - x(ENOMEM, ENOMEM_promote_table_init) \ - x(ENOMEM, ENOMEM_async_obj_init) \ - x(ENOMEM, ENOMEM_compression_bounce_read_init) \ - x(ENOMEM, 
ENOMEM_compression_bounce_write_init) \ - x(ENOMEM, ENOMEM_compression_workspace_init) \ - x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \ - x(EIO, compression_workspace_not_initialized) \ - x(ENOMEM, ENOMEM_bucket_gens) \ - x(ENOMEM, ENOMEM_buckets_nouse) \ - x(ENOMEM, ENOMEM_usage_init) \ - x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \ - x(ENOMEM, ENOMEM_btree_node_reclaim) \ - x(ENOMEM, ENOMEM_btree_node_mem_alloc) \ - x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \ - x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\ - x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \ - x(ENOMEM, ENOMEM_set_nr_journal_buckets) \ - x(ENOMEM, ENOMEM_dev_journal_init) \ - x(ENOMEM, ENOMEM_journal_pin_fifo) \ - x(ENOMEM, ENOMEM_journal_buf) \ - x(ENOMEM, ENOMEM_gc_start) \ - x(ENOMEM, ENOMEM_gc_alloc_start) \ - x(ENOMEM, ENOMEM_gc_reflink_start) \ - x(ENOMEM, ENOMEM_gc_gens) \ - x(ENOMEM, ENOMEM_gc_repair_key) \ - x(ENOMEM, ENOMEM_fsck_extent_ends_at) \ - x(ENOMEM, ENOMEM_fsck_add_nlink) \ - x(ENOMEM, ENOMEM_journal_key_insert) \ - x(ENOMEM, ENOMEM_journal_keys_sort) \ - x(ENOMEM, ENOMEM_read_superblock_clean) \ - x(ENOMEM, ENOMEM_fs_alloc) \ - x(ENOMEM, ENOMEM_fs_name_alloc) \ - x(ENOMEM, ENOMEM_fs_other_alloc) \ - x(ENOMEM, ENOMEM_dev_alloc) \ - x(ENOMEM, ENOMEM_disk_accounting) \ - x(ENOMEM, ENOMEM_stripe_head_alloc) \ - x(ENOMEM, ENOMEM_journal_read_bucket) \ - x(ENOSPC, ENOSPC_disk_reservation) \ - x(ENOSPC, ENOSPC_bucket_alloc) \ - x(ENOSPC, ENOSPC_disk_label_add) \ - x(ENOSPC, ENOSPC_stripe_create) \ - x(ENOSPC, ENOSPC_inode_create) \ - x(ENOSPC, ENOSPC_str_hash_create) \ - x(ENOSPC, ENOSPC_snapshot_create) \ - x(ENOSPC, ENOSPC_subvolume_create) \ - x(ENOSPC, ENOSPC_sb) \ - x(ENOSPC, ENOSPC_sb_journal) \ - x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \ - x(ENOSPC, ENOSPC_sb_quota) \ - x(ENOSPC, ENOSPC_sb_replicas) \ - x(ENOSPC, ENOSPC_sb_members) \ - x(ENOSPC, ENOSPC_sb_members_v2) \ - x(ENOSPC, ENOSPC_sb_crypt) \ - x(ENOSPC, ENOSPC_sb_downgrade) \ - x(ENOSPC, ENOSPC_btree_slot) \ - x(ENOSPC, ENOSPC_snapshot_tree) \ - x(ENOENT, ENOENT_bkey_type_mismatch) \ - x(ENOENT, ENOENT_str_hash_lookup) \ - x(ENOENT, ENOENT_str_hash_set_must_replace) \ - x(ENOENT, ENOENT_inode) \ - x(ENOENT, ENOENT_not_subvol) \ - x(ENOENT, ENOENT_not_directory) \ - x(ENOENT, ENOENT_directory_dead) \ - x(ENOENT, ENOENT_subvolume) \ - x(ENOENT, ENOENT_snapshot_tree) \ - x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ - x(ENOENT, ENOENT_dev_not_found) \ - x(ENOENT, ENOENT_dev_bucket_not_found) \ - x(ENOENT, ENOENT_dev_idx_not_found) \ - x(ENOENT, ENOENT_inode_no_backpointer) \ - x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ - x(ENOENT, btree_node_dying) \ - x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ - x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ - x(EEXIST, EEXIST_str_hash_set) \ - x(EEXIST, EEXIST_discard_in_flight_add) \ - x(EEXIST, EEXIST_subvolume_create) \ - x(ENOSPC, open_buckets_empty) \ - x(ENOSPC, freelist_empty) \ - x(BCH_ERR_freelist_empty, no_buckets_found) \ - x(0, transaction_restart) \ - x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ - x(BCH_ERR_transaction_restart, transaction_restart_relock) \ - x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \ - x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \ - x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \ - x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \ - x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \ - x(BCH_ERR_transaction_restart, 
transaction_restart_fill_mem_alloc_fail)\ - x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \ - x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \ - x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \ - x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ - x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\ - x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ - x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ - x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ - x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ - x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ - x(BCH_ERR_transaction_restart, transaction_restart_nested) \ - x(BCH_ERR_transaction_restart, transaction_restart_commit) \ - x(0, no_btree_node) \ - x(BCH_ERR_no_btree_node, no_btree_node_relock) \ - x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ - x(BCH_ERR_no_btree_node, no_btree_node_drop) \ - x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \ - x(BCH_ERR_no_btree_node, no_btree_node_up) \ - x(BCH_ERR_no_btree_node, no_btree_node_down) \ - x(BCH_ERR_no_btree_node, no_btree_node_init) \ - x(BCH_ERR_no_btree_node, no_btree_node_cached) \ - x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \ - x(0, btree_insert_fail) \ - x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \ - x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ - x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ - x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ - x(0, backpointer_to_overwritten_btree_node) \ - x(0, journal_reclaim_would_deadlock) \ - x(EINVAL, fsck) \ - x(BCH_ERR_fsck, fsck_ask) \ - x(BCH_ERR_fsck, fsck_fix) \ - x(BCH_ERR_fsck, fsck_delete_bkey) \ - x(BCH_ERR_fsck, fsck_ignore) \ - x(BCH_ERR_fsck, fsck_errors_not_fixed) \ - x(BCH_ERR_fsck, fsck_repair_unimplemented) \ - x(BCH_ERR_fsck, fsck_repair_impossible) \ - x(EINVAL, recovery_will_run) \ - x(BCH_ERR_recovery_will_run, restart_recovery) \ - x(BCH_ERR_recovery_will_run, cannot_rewind_recovery) \ - x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \ - x(0, data_update_done) \ - x(0, bkey_was_deleted) \ - x(BCH_ERR_data_update_done, data_update_done_would_block) \ - x(BCH_ERR_data_update_done, data_update_done_unwritten) \ - x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ - x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ - x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ - x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \ - x(EINVAL, device_state_not_allowed) \ - x(EINVAL, member_info_missing) \ - x(EINVAL, mismatched_block_size) \ - x(EINVAL, block_size_too_small) \ - x(EINVAL, bucket_size_too_small) \ - x(EINVAL, device_size_too_small) \ - x(EINVAL, device_size_too_big) \ - x(EINVAL, device_not_a_member_of_filesystem) \ - x(EINVAL, device_has_been_removed) \ - x(EINVAL, device_splitbrain) \ - x(EINVAL, device_already_online) \ - x(EINVAL, filesystem_uuid_already_open) \ - x(EINVAL, insufficient_devices_to_start) \ - x(EINVAL, invalid) \ - x(EINVAL, internal_fsck_err) \ - x(EINVAL, opt_parse_error) \ - x(EINVAL, remove_with_metadata_missing_unimplemented)\ - x(EINVAL, remove_would_lose_data) \ - x(EINVAL, no_resize_with_buckets_nouse) \ - x(EINVAL, inode_unpack_error) \ - x(EINVAL, inode_not_unlinked) \ - x(EINVAL, inode_has_child_snapshot) \ - x(EINVAL, varint_decode_error) \ - x(EINVAL, 
erasure_coding_found_btree_node) \ - x(EINVAL, option_negative) \ - x(EOPNOTSUPP, may_not_use_incompat_feature) \ - x(EROFS, erofs_trans_commit) \ - x(EROFS, erofs_no_writes) \ - x(EROFS, erofs_journal_err) \ - x(EROFS, erofs_sb_err) \ - x(EROFS, erofs_unfixed_errors) \ - x(EROFS, erofs_norecovery) \ - x(EROFS, erofs_nochanges) \ - x(EROFS, erofs_no_alloc_info) \ - x(EROFS, erofs_filesystem_full) \ - x(EROFS, insufficient_devices) \ - x(0, operation_blocked) \ - x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ - x(BCH_ERR_operation_blocked, journal_res_blocked) \ - x(BCH_ERR_journal_res_blocked, journal_blocked) \ - x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \ - x(BCH_ERR_journal_res_blocked, journal_max_open) \ - x(BCH_ERR_journal_res_blocked, journal_full) \ - x(BCH_ERR_journal_res_blocked, journal_pin_full) \ - x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \ - x(BCH_ERR_journal_res_blocked, journal_stuck) \ - x(BCH_ERR_journal_res_blocked, journal_retry_open) \ - x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \ - x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \ - x(BCH_ERR_invalid, invalid_sb) \ - x(BCH_ERR_invalid_sb, invalid_sb_magic) \ - x(BCH_ERR_invalid_sb, invalid_sb_version) \ - x(BCH_ERR_invalid_sb, invalid_sb_features) \ - x(BCH_ERR_invalid_sb, invalid_sb_too_big) \ - x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \ - x(BCH_ERR_invalid_sb, invalid_sb_csum) \ - x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ - x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ - x(BCH_ERR_invalid_sb, invalid_sb_offset) \ - x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ - x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ - x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ - x(BCH_ERR_invalid_sb, invalid_sb_field_size) \ - x(BCH_ERR_invalid_sb, invalid_sb_layout) \ - x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \ - x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \ - x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \ - x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \ - x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \ - x(BCH_ERR_invalid_sb, invalid_sb_members) \ - x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ - x(BCH_ERR_invalid_sb, invalid_sb_replicas) \ - x(BCH_ERR_invalid_sb, invalid_replicas_entry) \ - x(BCH_ERR_invalid_sb, invalid_sb_journal) \ - x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \ - x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ - x(BCH_ERR_invalid_sb, invalid_sb_clean) \ - x(BCH_ERR_invalid_sb, invalid_sb_quota) \ - x(BCH_ERR_invalid_sb, invalid_sb_errors) \ - x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \ - x(BCH_ERR_invalid_sb, invalid_sb_ext) \ - x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ - x(BCH_ERR_invalid, invalid_bkey) \ - x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ - x(EROFS, journal_shutdown) \ - x(EIO, journal_flush_err) \ - x(EIO, journal_write_err) \ - x(EIO, btree_node_read_err) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ - x(EIO, sb_not_downgraded) \ - x(EIO, btree_node_write_all_failed) \ - x(EIO, btree_node_read_error) \ - x(EIO, btree_need_topology_repair) \ - x(EIO, bucket_ref_update) \ - x(EIO, trigger_alloc) \ - x(EIO, trigger_pointer) \ - x(EIO, trigger_stripe_pointer) \ - x(EIO, metadata_bucket_inconsistency) \ - x(EIO, mark_stripe) \ - x(EIO, stripe_reconstruct) \ - x(EIO, key_type_error) \ - x(EIO, extent_poisoned) \ - x(EIO, missing_indirect_extent) \ - x(EIO, invalidate_stripe_to_dev) \ - x(EIO, no_encryption_key) \ 
- x(EIO, insufficient_journal_devices) \ - x(EIO, device_offline) \ - x(EIO, EIO_fault_injected) \ - x(EIO, ec_block_read) \ - x(EIO, ec_block_write) \ - x(EIO, recompute_checksum) \ - x(EIO, decompress) \ - x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \ - x(BCH_ERR_decompress, decompress_lz4) \ - x(BCH_ERR_decompress, decompress_gzip) \ - x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \ - x(BCH_ERR_decompress, decompress_zstd) \ - x(EIO, data_write) \ - x(BCH_ERR_data_write, data_write_io) \ - x(BCH_ERR_data_write, data_write_csum) \ - x(BCH_ERR_data_write, data_write_invalid_ptr) \ - x(BCH_ERR_data_write, data_write_misaligned) \ - x(BCH_ERR_decompress, data_read) \ - x(BCH_ERR_data_read, no_device_to_read_from) \ - x(BCH_ERR_data_read, no_devices_valid) \ - x(BCH_ERR_data_read, data_read_io_err) \ - x(BCH_ERR_data_read, data_read_csum_err) \ - x(BCH_ERR_data_read, data_read_retry) \ - x(BCH_ERR_data_read_retry, data_read_retry_avoid) \ - x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \ - x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \ - x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \ - x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \ - x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\ - x(BCH_ERR_data_read, data_read_decompress_err) \ - x(BCH_ERR_data_read, data_read_decrypt_err) \ - x(BCH_ERR_data_read, data_read_ptr_stale_race) \ - x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \ - x(BCH_ERR_data_read, data_read_no_encryption_key) \ - x(BCH_ERR_data_read, data_read_buffer_too_small) \ - x(BCH_ERR_data_read, data_read_key_overwritten) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \ - x(0, nopromote) \ - x(BCH_ERR_nopromote, nopromote_may_not) \ - x(BCH_ERR_nopromote, nopromote_already_promoted) \ - x(BCH_ERR_nopromote, nopromote_unwritten) \ - x(BCH_ERR_nopromote, nopromote_congested) \ - x(BCH_ERR_nopromote, nopromote_in_flight) \ - x(BCH_ERR_nopromote, nopromote_no_writes) \ - x(BCH_ERR_nopromote, nopromote_enomem) \ - x(0, invalid_snapshot_node) \ - x(0, option_needs_open_fs) \ - x(0, remove_disk_accounting_entry) - -enum bch_errcode { - BCH_ERR_START = 2048, -#define x(class, err) BCH_ERR_##err, - BCH_ERRCODES() -#undef x - BCH_ERR_MAX -}; - -__attribute__((const)) const char *bch2_err_str(int); - -__attribute__((const)) bool __bch2_err_matches(int, int); - -__attribute__((const)) -static inline bool _bch2_err_matches(int err, int class) -{ - return err < 0 && __bch2_err_matches(err, class); -} - -#define bch2_err_matches(_err, _class) \ -({ \ - BUILD_BUG_ON(!__builtin_constant_p(_class)); \ - unlikely(_bch2_err_matches(_err, _class)); \ -}) - -int __bch2_err_class(int); - -static inline long bch2_err_class(long err) -{ - return err < 0 ? 
__bch2_err_class(err) : err; -} - -#define BLK_STS_REMOVED ((__force blk_status_t)128) - -#include <linux/blk_types.h> -const char *bch2_blk_status_to_str(blk_status_t); - -#endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c deleted file mode 100644 index 267e73d9d7e6..000000000000 --- a/fs/bcachefs/error.c +++ /dev/null @@ -1,771 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "btree_cache.h" -#include "btree_iter.h" -#include "error.h" -#include "journal.h" -#include "namei.h" -#include "recovery_passes.h" -#include "super.h" -#include "thread_with_file.h" - -#define FSCK_ERR_RATELIMIT_NR 10 - -void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out) -{ - printbuf_indent_add_nextline(out, 2); - -#ifdef BCACHEFS_LOG_PREFIX - prt_printf(out, "bcachefs (%s): ", fs_or_dev_name); -#endif -} - -bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) -{ - set_bit(BCH_FS_error, &c->flags); - - switch (c->opts.errors) { - case BCH_ON_ERROR_continue: - return false; - case BCH_ON_ERROR_fix_safe: - case BCH_ON_ERROR_ro: - bch2_fs_emergency_read_only2(c, out); - return true; - case BCH_ON_ERROR_panic: - bch2_print_str(c, KERN_ERR, out->buf); - panic(bch2_fmt(c, "panic after error")); - return true; - default: - BUG(); - } -} - -bool bch2_inconsistent_error(struct bch_fs *c) -{ - struct printbuf buf = PRINTBUF; - buf.atomic++; - - printbuf_indent_add_nextline(&buf, 2); - - bool ret = __bch2_inconsistent_error(c, &buf); - if (ret) - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - return ret; -} - -__printf(3, 0) -static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans, - const char *fmt, va_list args) -{ - struct printbuf buf = PRINTBUF; - buf.atomic++; - - bch2_log_msg_start(c, &buf); - - prt_vprintf(&buf, fmt, args); - prt_newline(&buf); - - if (trans) - bch2_trans_updates_to_text(&buf, trans); - bool ret = __bch2_inconsistent_error(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - - printbuf_exit(&buf); - return ret; -} - -bool bch2_fs_inconsistent(struct bch_fs *c, const char *fmt, ...) -{ - va_list args; - va_start(args, fmt); - bool ret = bch2_fs_trans_inconsistent(c, NULL, fmt, args); - va_end(args); - return ret; -} - -bool bch2_trans_inconsistent(struct btree_trans *trans, const char *fmt, ...) -{ - va_list args; - va_start(args, fmt); - bool ret = bch2_fs_trans_inconsistent(trans->c, trans, fmt, args); - va_end(args); - return ret; -} - -int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) -{ - prt_printf(out, "btree topology error: "); - - set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_in_recovery, &c->flags)) { - __bch2_inconsistent_error(c, out); - return bch_err_throw(c, btree_need_topology_repair); - } else { - return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?: - bch_err_throw(c, btree_need_topology_repair); - } -} - -int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...) 
-{ - struct printbuf buf = PRINTBUF; - - bch2_log_msg_start(c, &buf); - - va_list args; - va_start(args, fmt); - prt_vprintf(&buf, fmt, args); - va_end(args); - - int ret = __bch2_topology_error(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - - printbuf_exit(&buf); - return ret; -} - -void bch2_fatal_error(struct bch_fs *c) -{ - if (bch2_fs_emergency_read_only(c)) - bch_err(c, "fatal error - emergency read only"); -} - -void bch2_io_error_work(struct work_struct *work) -{ - struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); - struct bch_fs *c = ca->fs; - - /* XXX: if it's reads or checksums that are failing, set it to failed */ - - down_write(&c->state_lock); - unsigned long write_errors_start = READ_ONCE(ca->write_errors_start); - - if (write_errors_start && - time_after(jiffies, - write_errors_start + c->opts.write_error_timeout * HZ)) { - if (ca->mi.state >= BCH_MEMBER_STATE_ro) - goto out; - - bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED); - struct printbuf buf = PRINTBUF; - __bch2_log_msg_start(ca->name, &buf); - - prt_printf(&buf, "writes erroring for %u seconds, setting %s ro", - c->opts.write_error_timeout, - dev ? "device" : "filesystem"); - if (!dev) - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } -out: - up_write(&c->state_lock); -} - -void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) -{ - atomic64_inc(&ca->errors[type]); - - if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start) - ca->write_errors_start = jiffies; - - queue_work(system_long_wq, &ca->io_error_work); -} - -enum ask_yn { - YN_NO, - YN_YES, - YN_ALLNO, - YN_ALLYES, -}; - -static enum ask_yn parse_yn_response(char *buf) -{ - buf = strim(buf); - - if (strlen(buf) == 1) - switch (buf[0]) { - case 'n': - return YN_NO; - case 'y': - return YN_YES; - case 'N': - return YN_ALLNO; - case 'Y': - return YN_ALLYES; - } - return -1; -} - -#ifdef __KERNEL__ -static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) -{ - struct stdio_redirect *stdio = c->stdio; - - if (c->stdio_filter && c->stdio_filter != current) - stdio = NULL; - - if (!stdio) - return YN_NO; - - if (trans) - bch2_trans_unlock(trans); - - unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0; - darray_char line = {}; - int ret; - - do { - unsigned long t; - bch2_print(c, " (y,n, or Y,N for all errors of this type) "); -rewait: - t = unlock_long_at - ? 
max_t(long, unlock_long_at - jiffies, 0) - : MAX_SCHEDULE_TIMEOUT; - - int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t); - if (r == -ETIME) { - bch2_trans_unlock_long(trans); - unlock_long_at = 0; - goto rewait; - } - - if (r < 0) { - ret = YN_NO; - break; - } - - darray_last(line) = '\0'; - } while ((ret = parse_yn_response(line.data)) < 0); - - darray_exit(&line); - return ret; -} -#else - -#include "tools-util.h" - -static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) -{ - char *buf = NULL; - size_t buflen = 0; - int ret; - - do { - fputs(" (y,n, or Y,N for all errors of this type) ", stdout); - fflush(stdout); - - if (getline(&buf, &buflen, stdin) < 0) - die("error reading from standard input"); - } while ((ret = parse_yn_response(buf)) < 0); - - free(buf); - return ret; -} - -#endif - -static struct fsck_err_state *fsck_err_get(struct bch_fs *c, - enum bch_sb_error_id id) -{ - struct fsck_err_state *s; - - list_for_each_entry(s, &c->fsck_error_msgs, list) - if (s->id == id) { - /* - * move it to the head of the list: repeated fsck errors - * are common - */ - list_move(&s->list, &c->fsck_error_msgs); - return s; - } - - s = kzalloc(sizeof(*s), GFP_NOFS); - if (!s) { - if (!c->fsck_alloc_msgs_err) - bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); - c->fsck_alloc_msgs_err = true; - return NULL; - } - - INIT_LIST_HEAD(&s->list); - s->id = id; - list_add(&s->list, &c->fsck_error_msgs); - return s; -} - -/* s/fix?/fixing/ s/recreate?/recreating/ */ -static void prt_actioning(struct printbuf *out, const char *action) -{ - unsigned len = strlen(action); - - BUG_ON(action[len - 1] != '?'); - --len; - - if (action[len - 1] == 'e') - --len; - - prt_bytes(out, action, len); - prt_str(out, "ing"); -} - -static const u8 fsck_flags_extra[] = { -#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags, - BCH_SB_ERRS() -#undef x -}; - -static int do_fsck_ask_yn(struct bch_fs *c, - struct btree_trans *trans, - struct printbuf *question, - const char *action) -{ - prt_str(question, ", "); - prt_str(question, action); - - if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s", question->buf); - else - bch2_print_str(c, KERN_ERR, question->buf); - - int ask = bch2_fsck_ask_yn(c, trans); - - if (trans) { - int ret = bch2_trans_relock(trans); - if (ret) - return ret; - } - - return ask; -} - -static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c, - enum bch_sb_error_id id, const char *msg, - bool *repeat, bool *print, bool *suppress) -{ - bch2_sb_error_count(c, id); - - struct fsck_err_state *s = fsck_err_get(c, id); - if (s) { - /* - * We may be called multiple times for the same error on - * transaction restart - this memoizes instead of asking the user - * multiple times for the same error: - */ - if (s->last_msg && !strcmp(msg, s->last_msg)) { - *repeat = true; - *print = false; - return s; - } - - kfree(s->last_msg); - s->last_msg = kstrdup(msg, GFP_KERNEL); - - if (c->opts.ratelimit_errors && - s->nr >= FSCK_ERR_RATELIMIT_NR) { - if (s->nr == FSCK_ERR_RATELIMIT_NR) - *suppress = true; - else - *print = false; - } - - s->nr++; - } - return s; -} - -bool __bch2_count_fsck_err(struct bch_fs *c, - enum bch_sb_error_id id, struct printbuf *msg) -{ - bch2_sb_error_count(c, id); - - mutex_lock(&c->fsck_error_msgs_lock); - bool print = true, repeat = false, suppress = false; - - count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress); - mutex_unlock(&c->fsck_error_msgs_lock); - - if (suppress) - prt_printf(msg, "Ratelimiting new instances of 
previous error\n"); - - return print && !repeat; -} - -int bch2_fsck_err_opt(struct bch_fs *c, - enum bch_fsck_flags flags, - enum bch_sb_error_id err) -{ - if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) - flags |= fsck_flags_extra[err]; - - if (test_bit(BCH_FS_in_fsck, &c->flags)) { - if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) - return bch_err_throw(c, fsck_repair_unimplemented); - - switch (c->opts.fix_errors) { - case FSCK_FIX_exit: - return bch_err_throw(c, fsck_errors_not_fixed); - case FSCK_FIX_yes: - if (flags & FSCK_CAN_FIX) - return bch_err_throw(c, fsck_fix); - fallthrough; - case FSCK_FIX_no: - if (flags & FSCK_CAN_IGNORE) - return bch_err_throw(c, fsck_ignore); - return bch_err_throw(c, fsck_errors_not_fixed); - case FSCK_FIX_ask: - if (flags & FSCK_AUTOFIX) - return bch_err_throw(c, fsck_fix); - return bch_err_throw(c, fsck_ask); - default: - BUG(); - } - } else { - if ((flags & FSCK_AUTOFIX) && - (c->opts.errors == BCH_ON_ERROR_continue || - c->opts.errors == BCH_ON_ERROR_fix_safe)) - return bch_err_throw(c, fsck_fix); - - if (c->opts.errors == BCH_ON_ERROR_continue && - (flags & FSCK_CAN_IGNORE)) - return bch_err_throw(c, fsck_ignore); - return bch_err_throw(c, fsck_errors_not_fixed); - } -} - -int __bch2_fsck_err(struct bch_fs *c, - struct btree_trans *trans, - enum bch_fsck_flags flags, - enum bch_sb_error_id err, - const char *fmt, ...) -{ - va_list args; - struct printbuf buf = PRINTBUF, *out = &buf; - int ret = 0; - const char *action_orig = "fix?", *action = action_orig; - - might_sleep(); - - if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) - flags |= fsck_flags_extra[err]; - - if (!c) - c = trans->c; - - /* - * Ugly: if there's a transaction in the current task it has to be - * passed in to unlock if we prompt for user input. - * - * But, plumbing a transaction and transaction restarts into - * bkey_validate() is problematic. - * - * So: - * - make all bkey errors AUTOFIX, they're simple anyways (we just - * delete the key) - * - and we don't need to warn if we're not prompting - */ - WARN_ON((flags & FSCK_CAN_FIX) && - !(flags & FSCK_AUTOFIX) && - !trans && - bch2_current_has_btree_trans(c)); - - if (test_bit(err, c->sb.errors_silent)) - return flags & FSCK_CAN_FIX - ? bch_err_throw(c, fsck_fix) - : bch_err_throw(c, fsck_ignore); - - printbuf_indent_add_nextline(out, 2); - -#ifdef BCACHEFS_LOG_PREFIX - if (strncmp(fmt, "bcachefs", 8)) - prt_printf(out, bch2_log_msg(c, "")); -#endif - - va_start(args, fmt); - prt_vprintf(out, fmt, args); - va_end(args); - - /* Custom fix/continue/recreate/etc.? 
*/ - if (out->buf[out->pos - 1] == '?') { - const char *p = strrchr(out->buf, ','); - if (p) { - out->pos = p - out->buf; - action = kstrdup(p + 2, GFP_KERNEL); - if (!action) { - ret = -ENOMEM; - goto err; - } - } - } - - mutex_lock(&c->fsck_error_msgs_lock); - bool repeat = false, print = true, suppress = false; - bool inconsistent = false, exiting = false; - struct fsck_err_state *s = - count_fsck_err_locked(c, err, buf.buf, &repeat, &print, &suppress); - if (repeat) { - ret = s->ret; - goto err_unlock; - } - - if ((flags & FSCK_AUTOFIX) && - (c->opts.errors == BCH_ON_ERROR_continue || - c->opts.errors == BCH_ON_ERROR_fix_safe)) { - prt_str(out, ", "); - if (flags & FSCK_CAN_FIX) { - prt_actioning(out, action); - ret = bch_err_throw(c, fsck_fix); - } else { - prt_str(out, ", continuing"); - ret = bch_err_throw(c, fsck_ignore); - } - - goto print; - } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) { - if (c->opts.errors != BCH_ON_ERROR_continue || - !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { - prt_str_indented(out, ", shutting down\n" - "error not marked as autofix and not in fsck\n" - "run fsck, and forward to devs so error can be marked for self-healing"); - inconsistent = true; - print = true; - ret = bch_err_throw(c, fsck_errors_not_fixed); - } else if (flags & FSCK_CAN_FIX) { - prt_str(out, ", "); - prt_actioning(out, action); - ret = bch_err_throw(c, fsck_fix); - } else { - prt_str(out, ", continuing"); - ret = bch_err_throw(c, fsck_ignore); - } - } else if (c->opts.fix_errors == FSCK_FIX_exit) { - prt_str(out, ", exiting"); - ret = bch_err_throw(c, fsck_errors_not_fixed); - } else if (flags & FSCK_CAN_FIX) { - int fix = s && s->fix - ? s->fix - : c->opts.fix_errors; - - if (fix == FSCK_FIX_ask) { - print = false; - - ret = do_fsck_ask_yn(c, trans, out, action); - if (ret < 0) - goto err_unlock; - - if (ret >= YN_ALLNO && s) - s->fix = ret == YN_ALLNO - ? FSCK_FIX_no - : FSCK_FIX_yes; - - ret = ret & 1 - ? 
bch_err_throw(c, fsck_fix) - : bch_err_throw(c, fsck_ignore); - } else if (fix == FSCK_FIX_yes || - (c->opts.nochanges && - !(flags & FSCK_CAN_IGNORE))) { - prt_str(out, ", "); - prt_actioning(out, action); - ret = bch_err_throw(c, fsck_fix); - } else { - prt_str(out, ", not "); - prt_actioning(out, action); - ret = bch_err_throw(c, fsck_ignore); - } - } else { - if (flags & FSCK_CAN_IGNORE) { - prt_str(out, ", continuing"); - ret = bch_err_throw(c, fsck_ignore); - } else { - prt_str(out, " (repair unimplemented)"); - ret = bch_err_throw(c, fsck_repair_unimplemented); - } - } - - if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) && - (c->opts.fix_errors == FSCK_FIX_exit || - !(flags & FSCK_CAN_IGNORE))) - ret = bch_err_throw(c, fsck_errors_not_fixed); - - if (test_bit(BCH_FS_in_fsck, &c->flags) && - (!bch2_err_matches(ret, BCH_ERR_fsck_fix) && - !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) { - exiting = true; - print = true; - } -print: - prt_newline(out); - - if (inconsistent) - __bch2_inconsistent_error(c, out); - else if (exiting) - prt_printf(out, "Unable to continue, halting\n"); - else if (suppress) - prt_printf(out, "Ratelimiting new instances of previous error\n"); - - if (print) { - /* possibly strip an empty line, from printbuf_indent_add */ - while (out->pos && out->buf[out->pos - 1] == ' ') - --out->pos; - printbuf_nul_terminate(out); - - if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s", out->buf); - else - bch2_print_str(c, KERN_ERR, out->buf); - } - - if (s) - s->ret = ret; - - if (trans && - !(flags & FSCK_ERR_NO_LOG) && - ret == -BCH_ERR_fsck_fix) - ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret; -err_unlock: - mutex_unlock(&c->fsck_error_msgs_lock); -err: - /* - * We don't yet track whether the filesystem currently has errors, for - * log_fsck_err()s: that would require us to track for every error type - * which recovery pass corrects it, to get the fsck exit status correct: - */ - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - /* nothing */ - } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) { - set_bit(BCH_FS_errors_fixed, &c->flags); - } else { - set_bit(BCH_FS_errors_not_fixed, &c->flags); - set_bit(BCH_FS_error, &c->flags); - } - - if (action != action_orig) - kfree(action); - printbuf_exit(&buf); - - BUG_ON(!ret); - return ret; -} - -static const char * const bch2_bkey_validate_contexts[] = { -#define x(n) #n, - BKEY_VALIDATE_CONTEXTS() -#undef x - NULL -}; - -int __bch2_bkey_fsck_err(struct bch_fs *c, - struct bkey_s_c k, - struct bkey_validate_context from, - enum bch_sb_error_id err, - const char *fmt, ...) 
-{ - if (from.flags & BCH_VALIDATE_silent) - return bch_err_throw(c, fsck_delete_bkey); - - unsigned fsck_flags = 0; - if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { - if (test_bit(err, c->sb.errors_silent)) - return bch_err_throw(c, fsck_delete_bkey); - - fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; - } - if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) - fsck_flags |= fsck_flags_extra[err]; - - struct printbuf buf = PRINTBUF; - prt_printf(&buf, "invalid bkey in %s", - bch2_bkey_validate_contexts[from.from]); - - if (from.from == BKEY_VALIDATE_journal) - prt_printf(&buf, " journal seq=%llu offset=%u", - from.journal_seq, from.journal_offset); - - prt_str(&buf, " btree="); - bch2_btree_id_to_text(&buf, from.btree); - prt_printf(&buf, " level=%u: ", from.level); - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - va_list args; - va_start(args, fmt); - prt_vprintf(&buf, fmt, args); - va_end(args); - - int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf); - printbuf_exit(&buf); - return ret; -} - -static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print) -{ - struct fsck_err_state *s, *n; - - mutex_lock(&c->fsck_error_msgs_lock); - - list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { - if (print && s->ratelimited && s->last_msg) - bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); - - list_del(&s->list); - kfree(s->last_msg); - kfree(s); - } - - mutex_unlock(&c->fsck_error_msgs_lock); -} - -void bch2_flush_fsck_errs(struct bch_fs *c) -{ - __bch2_flush_fsck_errs(c, true); -} - -void bch2_free_fsck_errs(struct bch_fs *c) -{ - __bch2_flush_fsck_errs(c, false); -} - -int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - subvol_inum inum, u64 offset) -{ - u32 restart_count = trans->restart_count; - int ret = 0; - - if (inum.subvol) { - ret = bch2_inum_to_path(trans, inum, out); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - } - if (!inum.subvol || ret) - prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); - prt_printf(out, " offset %llu: ", offset); - - return trans_was_restarted(trans, restart_count); -} - -void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, - subvol_inum inum, u64 offset) -{ - bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); -} - -int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - struct bpos pos) -{ - int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out); - if (ret) - return ret; - - prt_printf(out, " offset %llu: ", pos.offset << 8); - return 0; -} - -void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out, - struct bpos pos) -{ - bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); -} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h deleted file mode 100644 index 0c3c3a24fc6f..000000000000 --- a/fs/bcachefs/error.h +++ /dev/null @@ -1,258 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ERROR_H -#define _BCACHEFS_ERROR_H - -#include <linux/list.h> -#include <linux/printk.h> -#include "bkey_types.h" -#include "sb-errors.h" - -struct bch_dev; -struct bch_fs; -struct work_struct; - -/* - * XXX: separate out errors that indicate on disk data is inconsistent, and flag - * superblock as such - */ - -/* Error messages: */ - -void __bch2_log_msg_start(const char *, struct printbuf *); - -static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) -{ - 
__bch2_log_msg_start(c->name, out); -} - -/* - * Inconsistency errors: The on disk data is inconsistent. If these occur during - * initial recovery, they don't indicate a bug in the running code - we walk all - * the metadata before modifying anything. If they occur at runtime, they - * indicate either a bug in the running code or (less likely) data is being - * silently corrupted under us. - * - * XXX: audit all inconsistent errors and make sure they're all recoverable, in - * BCH_ON_ERROR_CONTINUE mode - */ - -bool __bch2_inconsistent_error(struct bch_fs *, struct printbuf *); -bool bch2_inconsistent_error(struct bch_fs *); -__printf(2, 3) -bool bch2_fs_inconsistent(struct bch_fs *, const char *, ...); - -#define bch2_fs_inconsistent_on(cond, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - if (_ret) \ - bch2_fs_inconsistent(__VA_ARGS__); \ - _ret; \ -}) - -__printf(2, 3) -bool bch2_trans_inconsistent(struct btree_trans *, const char *, ...); - -#define bch2_trans_inconsistent_on(cond, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - if (_ret) \ - bch2_trans_inconsistent(__VA_ARGS__); \ - _ret; \ -}) - -int __bch2_topology_error(struct bch_fs *, struct printbuf *); -__printf(2, 3) -int bch2_fs_topology_error(struct bch_fs *, const char *, ...); - -/* - * Fsck errors: inconsistency errors we detect at mount time, and should ideally - * be able to repair: - */ - -struct fsck_err_state { - struct list_head list; - enum bch_sb_error_id id; - u64 nr; - bool ratelimited; - int ret; - int fix; - char *last_msg; -}; - -#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) - -bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *); -#define bch2_count_fsck_err(_c, _err, ...) \ - __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__) - -int bch2_fsck_err_opt(struct bch_fs *, - enum bch_fsck_flags, - enum bch_sb_error_id); - -__printf(5, 6) __cold -int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, - enum bch_fsck_flags, - enum bch_sb_error_id, - const char *, ...); -#define bch2_fsck_err(c, _flags, _err_type, ...) \ - __bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\ - type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\ - _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__) - -void bch2_flush_fsck_errs(struct bch_fs *); -void bch2_free_fsck_errs(struct bch_fs *); - -#define fsck_err_wrap(_do) \ -({ \ - int _ret = _do; \ - if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ - !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) { \ - ret = _ret; \ - goto fsck_err; \ - } \ - \ - bch2_err_matches(_ret, BCH_ERR_fsck_fix); \ -}) - -#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) - -/* These macros return true if error should be fixed: */ - -/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ - -#define __fsck_err_on(cond, c, _flags, _err_type, ...) \ -({ \ - might_sleep(); \ - \ - if (type_is(c, struct bch_fs *)) \ - WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\ - \ - (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\ -}) - -#define mustfix_fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) - -#define mustfix_fsck_err_on(cond, c, _err_type, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) - -#define fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) - -#define fsck_err_on(cond, c, _err_type, ...) 
\ - __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) - -#define log_fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) - -#define log_fsck_err_on(cond, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - if (_ret) \ - log_fsck_err(__VA_ARGS__); \ - _ret; \ -}) - -enum bch_validate_flags; -__printf(5, 6) -int __bch2_bkey_fsck_err(struct bch_fs *, - struct bkey_s_c, - struct bkey_validate_context from, - enum bch_sb_error_id, - const char *, ...); - -/* - * for now, bkey fsck errors are always handled by deleting the entire key - - * this will change at some point - */ -#define bkey_fsck_err(c, _err_type, _err_msg, ...) \ -do { \ - int _ret = __bch2_bkey_fsck_err(c, k, from, \ - BCH_FSCK_ERR_##_err_type, \ - _err_msg, ##__VA_ARGS__); \ - if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ - !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \ - ret = _ret; \ - ret = bch_err_throw(c, fsck_delete_bkey); \ - goto fsck_err; \ -} while (0) - -#define bkey_fsck_err_on(cond, ...) \ -do { \ - if (unlikely(cond)) \ - bkey_fsck_err(__VA_ARGS__); \ -} while (0) - -/* - * Fatal errors: these don't indicate a bug, but we can't continue running in RW - * mode - pretty much just due to metadata IO errors: - */ - -void bch2_fatal_error(struct bch_fs *); - -#define bch2_fs_fatal_error(c, _msg, ...) \ -do { \ - bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \ - bch2_fatal_error(c); \ -} while (0) - -#define bch2_fs_fatal_err_on(cond, c, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - \ - if (_ret) \ - bch2_fs_fatal_error(c, __VA_ARGS__); \ - _ret; \ -}) - -/* - * IO errors: either recoverable metadata IO (because we have replicas), or data - * IO - we need to log it and print out a message, but we don't (necessarily) - * want to shut down the fs: - */ - -void bch2_io_error_work(struct work_struct *); - -/* Does the error handling without logging a message */ -void bch2_io_error(struct bch_dev *, enum bch_member_error_type); - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void bch2_latency_acct(struct bch_dev *, u64, int); -#else -static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -#endif - -static inline void bch2_account_io_success_fail(struct bch_dev *ca, - enum bch_member_error_type type, - bool success) -{ - if (likely(success)) { - if (type == BCH_MEMBER_ERROR_write && - ca->write_errors_start) - ca->write_errors_start = 0; - } else { - bch2_io_error(ca, type); - } -} - -static inline void bch2_account_io_completion(struct bch_dev *ca, - enum bch_member_error_type type, - u64 submit_time, bool success) -{ - if (unlikely(!ca)) - return; - - if (type != BCH_MEMBER_ERROR_checksum) - bch2_latency_acct(ca, submit_time, type); - - bch2_account_io_success_fail(ca, type, success); -} - -int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); - -void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); - -int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); -void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos); - -#endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c deleted file mode 100644 index e76e58a568bf..000000000000 --- a/fs/bcachefs/extent_update.c +++ /dev/null @@ -1,155 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include 
"buckets.h" -#include "debug.h" -#include "extents.h" -#include "extent_update.h" - -/* - * This counts the number of iterators to the alloc & ec btrees we'll need - * inserting/removing this extent: - */ -static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - unsigned ret = 0, lru = 0; - - bkey_extent_entry_for_each(ptrs, entry) { - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - /* Might also be updating LRU btree */ - if (entry->ptr.cached) - lru++; - - fallthrough; - case BCH_EXTENT_ENTRY_stripe_ptr: - ret++; - } - } - - /* - * Updating keys in the alloc btree may also update keys in the - * freespace or discard btrees: - */ - return lru + ret * 2; -} - -#define EXTENT_ITERS_MAX 64 - -static int count_iters_for_insert(struct btree_trans *trans, - struct bkey_s_c k, - unsigned offset, - struct bpos *end, - unsigned *nr_iters) -{ - int ret = 0, ret2 = 0; - - if (*nr_iters >= EXTENT_ITERS_MAX) { - *end = bpos_min(*end, k.k->p); - ret = 1; - } - - switch (k.k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - *nr_iters += bch2_bkey_nr_alloc_ptrs(k); - - if (*nr_iters >= EXTENT_ITERS_MAX) { - *end = bpos_min(*end, k.k->p); - ret = 1; - } - - break; - case KEY_TYPE_reflink_p: { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx = REFLINK_P_IDX(p.v); - unsigned sectors = bpos_min(*end, p.k->p).offset - - bkey_start_offset(p.k); - struct btree_iter iter; - struct bkey_s_c r_k; - - for_each_btree_key_norestart(trans, iter, - BTREE_ID_reflink, POS(0, idx + offset), - BTREE_ITER_slots, r_k, ret2) { - if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors))) - break; - - /* extent_update_to_keys(), for the reflink_v update */ - *nr_iters += 1; - - *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); - - if (*nr_iters >= EXTENT_ITERS_MAX) { - struct bpos pos = bkey_start_pos(k.k); - pos.offset += min_t(u64, k.k->size, - r_k.k->p.offset - idx); - - *end = bpos_min(*end, pos); - ret = 1; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - break; - } - } - - return ret2 ?: ret; -} - -int bch2_extent_atomic_end(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos *end) -{ - unsigned nr_iters = 0; - - struct btree_iter copy; - bch2_trans_copy_iter(trans, ©, iter); - - int ret = bch2_btree_iter_traverse(trans, ©); - if (ret) - goto err; - - struct bkey_s_c k; - for_each_btree_key_max_continue_norestart(trans, copy, *end, 0, k, ret) { - unsigned offset = 0; - - if (bkey_gt(iter->pos, bkey_start_pos(k.k))) - offset = iter->pos.offset - bkey_start_offset(k.k); - - ret = count_iters_for_insert(trans, k, offset, end, &nr_iters); - if (ret) - break; - } -err: - bch2_trans_iter_exit(trans, ©); - return ret < 0 ? 
ret : 0; -} - -int bch2_extent_trim_atomic(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *k) -{ - struct bpos end = k->k.p; - int ret = bch2_extent_atomic_end(trans, iter, &end); - if (ret) - return ret; - - /* tracepoint */ - - if (bpos_lt(end, k->k.p)) { - if (trace_extent_trim_atomic_enabled()) { - CLASS(printbuf, buf)(); - bch2_bpos_to_text(&buf, end); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); - trace_extent_trim_atomic(trans->c, buf.buf); - } - bch2_cut_back(end, k); - } - return 0; -} diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h deleted file mode 100644 index 34467db53f45..000000000000 --- a/fs/bcachefs/extent_update.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EXTENT_UPDATE_H -#define _BCACHEFS_EXTENT_UPDATE_H - -#include "bcachefs.h" - -int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, - struct bpos *); -int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, - struct bkey_i *); - -#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c deleted file mode 100644 index 83cbd77dcb9c..000000000000 --- a/fs/bcachefs/extents.c +++ /dev/null @@ -1,1735 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> - * - * Code for managing the extent btree and dynamically updating the writeback - * dirty sector count. - */ - -#include "bcachefs.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_gc.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "buckets.h" -#include "checksum.h" -#include "compress.h" -#include "debug.h" -#include "disk_groups.h" -#include "error.h" -#include "extents.h" -#include "inode.h" -#include "journal.h" -#include "rebalance.h" -#include "replicas.h" -#include "super.h" -#include "super-io.h" -#include "trace.h" -#include "util.h" - -static const char * const bch2_extent_flags_strs[] = { -#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n, - BCH_EXTENT_FLAGS() -#undef x - NULL, -}; - -static unsigned bch2_crc_field_size_max[] = { - [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, -}; - -static void bch2_extent_crc_pack(union bch_extent_crc *, - struct bch_extent_crc_unpacked, - enum bch_extent_entry_type); - -void bch2_io_failures_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_io_failures *failed) -{ - static const char * const error_types[] = { - "btree validate", "io", "checksum", "ec reconstruct", NULL - }; - - for (struct bch_dev_io_failures *f = failed->devs; - f < failed->devs + failed->nr; - f++) { - unsigned errflags = - ((!!f->failed_btree_validate) << 0) | - ((!!f->failed_io) << 1) | - ((!!f->failed_csum_nr) << 2) | - ((!!f->failed_ec) << 3); - - bch2_printbuf_make_room(out, 1024); - out->atomic++; - scoped_guard(rcu) { - struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev); - if (ca) - prt_str(out, ca->name); - else - prt_printf(out, "(invalid device %u)", f->dev); - } - --out->atomic; - - prt_char(out, ' '); - - if (!errflags) { - prt_str(out, "no error - confused"); - } else if (is_power_of_2(errflags)) { - prt_bitflags(out, error_types, errflags); - prt_str(out, " error"); - } else { - prt_str(out, "errors: "); - prt_bitflags(out, error_types, errflags); - } - prt_newline(out); - } -} - -struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, - 
unsigned dev) -{ - struct bch_dev_io_failures *i; - - for (i = f->devs; i < f->devs + f->nr; i++) - if (i->dev == dev) - return i; - - return NULL; -} - -void bch2_mark_io_failure(struct bch_io_failures *failed, - struct extent_ptr_decoded *p, - bool csum_error) -{ - struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); - - if (!f) { - BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); - - f = &failed->devs[failed->nr++]; - memset(f, 0, sizeof(*f)); - f->dev = p->ptr.dev; - } - - if (p->do_ec_reconstruct) - f->failed_ec = true; - else if (!csum_error) - f->failed_io = true; - else - f->failed_csum_nr++; -} - -void bch2_mark_btree_validate_failure(struct bch_io_failures *failed, - unsigned dev) -{ - struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev); - - if (!f) { - BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); - - f = &failed->devs[failed->nr++]; - memset(f, 0, sizeof(*f)); - f->dev = dev; - } - - f->failed_btree_validate = true; -} - -static inline u64 dev_latency(struct bch_dev *ca) -{ - return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; -} - -static inline int dev_failed(struct bch_dev *ca) -{ - return !ca || ca->mi.state == BCH_MEMBER_STATE_failed; -} - -/* - * returns true if p1 is better than p2: - */ -static inline bool ptr_better(struct bch_fs *c, - const struct extent_ptr_decoded p1, - u64 p1_latency, - struct bch_dev *ca1, - const struct extent_ptr_decoded p2, - u64 p2_latency) -{ - struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); - - int failed_delta = dev_failed(ca1) - dev_failed(ca2); - if (unlikely(failed_delta)) - return failed_delta < 0; - - if (static_branch_unlikely(&bch2_force_reconstruct_read)) - return p1.do_ec_reconstruct > p2.do_ec_reconstruct; - - if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) - return p1.do_ec_reconstruct < p2.do_ec_reconstruct; - - int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr; - if (unlikely(crc_retry_delta)) - return crc_retry_delta < 0; - - /* Pick at random, biased in favor of the faster device: */ - - return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency; -} - -/* - * This picks a non-stale pointer, preferably from a device other than @avoid. - * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to - * other devices, it will still pick a pointer from avoid. - */ -int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_failures *failed, - struct extent_ptr_decoded *pick, - int dev) -{ - bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false; - bool have_dirty_ptrs = false, have_pick = false; - - if (k.k->type == KEY_TYPE_error) - return bch_err_throw(c, key_type_error); - - rcu_read_lock(); - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - u64 pick_latency; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - have_dirty_ptrs |= !p.ptr.cached; - - /* - * Unwritten extent: no need to actually read, treat it as a - * hole and return 0s: - */ - if (p.ptr.unwritten) { - rcu_read_unlock(); - return 0; - } - - /* Are we being asked to read from a specific device? 
*/ - if (dev >= 0 && p.ptr.dev != dev) - continue; - - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - - if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) { - rcu_read_unlock(); - int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev); - if (ret) - return ret; - rcu_read_lock(); - } - - if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) - continue; - - struct bch_dev_io_failures *f = - unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; - if (unlikely(f)) { - p.crc_retry_nr = f->failed_csum_nr; - p.has_ec &= ~f->failed_ec; - - if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { - have_io_errors |= f->failed_io; - have_io_errors |= f->failed_btree_validate; - have_io_errors |= f->failed_ec; - } - have_csum_errors |= !!f->failed_csum_nr; - - if (p.has_ec && (f->failed_io || f->failed_csum_nr)) - p.do_ec_reconstruct = true; - else if (f->failed_io || - f->failed_btree_validate || - f->failed_csum_nr > c->opts.checksum_err_retry_nr) - continue; - } - - have_missing_devs |= ca && !bch2_dev_is_online(ca); - - if (!ca || !bch2_dev_is_online(ca)) { - if (!p.has_ec) - continue; - p.do_ec_reconstruct = true; - } - - if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec) - p.do_ec_reconstruct = true; - - u64 p_latency = dev_latency(ca); - /* - * Square the latencies, to bias more in favor of the faster - * device - we never want to stop issuing reads to the slower - * device altogether, so that we can update our latency numbers: - */ - p_latency *= p_latency; - - if (!have_pick || - ptr_better(c, - p, p_latency, ca, - *pick, pick_latency)) { - *pick = p; - pick_latency = p_latency; - have_pick = true; - } - } - rcu_read_unlock(); - - if (have_pick) - return 1; - if (!have_dirty_ptrs) - return 0; - if (have_missing_devs) - return bch_err_throw(c, no_device_to_read_from); - if (have_csum_errors) - return bch_err_throw(c, data_read_csum_err); - if (have_io_errors) - return bch_err_throw(c, data_read_io_err); - - /* - * If we get here, we have pointers (bkey_ptrs_validate() ensures that), - * but they don't point to valid devices: - */ - return bch_err_throw(c, no_devices_valid); -} - -/* KEY_TYPE_btree_ptr: */ - -int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, - c, btree_ptr_val_too_big, - "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - - ret = bch2_bkey_ptrs_validate(c, k, from); -fsck_err: - return ret; -} - -void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); -} - -int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - int ret = 0; - - bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, - c, btree_ptr_v2_val_too_big, - "value too big (%zu > %zu)", - bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); - - bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), - c, btree_ptr_v2_min_key_bad, - "min_key > key"); - - if ((from.flags & BCH_VALIDATE_write) && - c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written) - bkey_fsck_err_on(!bp.v->sectors_written, - c, btree_ptr_v2_written_0, - "sectors_written == 0"); - - ret = bch2_bkey_ptrs_validate(c, k, from); -fsck_err: - return ret; -} - -void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - 
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - - prt_printf(out, "seq %llx written %u min_key %s", - le64_to_cpu(bp.v->seq), - le16_to_cpu(bp.v->sectors_written), - BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); - - bch2_bpos_to_text(out, bp.v->min_key); - prt_printf(out, " "); - bch2_bkey_ptrs_to_text(out, c, k); -} - -void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, - unsigned big_endian, int write, - struct bkey_s k) -{ - struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); - - compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); - - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id_is_extents(btree_id) && - !bkey_eq(bp.v->min_key, POS_MIN)) - bp.v->min_key = write - ? bpos_nosnap_predecessor(bp.v->min_key) - : bpos_nosnap_successor(bp.v->min_key); -} - -/* KEY_TYPE_extent: */ - -bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) -{ - struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); - struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); - union bch_extent_entry *en_l; - const union bch_extent_entry *en_r; - struct extent_ptr_decoded lp, rp; - bool use_right_ptr; - - en_l = l_ptrs.start; - en_r = r_ptrs.start; - while (en_l < l_ptrs.end && en_r < r_ptrs.end) { - if (extent_entry_type(en_l) != extent_entry_type(en_r)) - return false; - - en_l = extent_entry_next(en_l); - en_r = extent_entry_next(en_r); - } - - if (en_l < l_ptrs.end || en_r < r_ptrs.end) - return false; - - en_l = l_ptrs.start; - en_r = r_ptrs.start; - lp.crc = bch2_extent_crc_unpack(l.k, NULL); - rp.crc = bch2_extent_crc_unpack(r.k, NULL); - - guard(rcu)(); - - while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && - __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { - if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != - rp.ptr.offset + rp.crc.offset || - lp.ptr.dev != rp.ptr.dev || - lp.ptr.gen != rp.ptr.gen || - lp.ptr.unwritten != rp.ptr.unwritten || - lp.has_ec != rp.has_ec) - return false; - - /* Extents may not straddle buckets: */ - struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); - bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); - - if (!same_bucket) - return false; - - if (lp.has_ec != rp.has_ec || - (lp.has_ec && - (lp.ec.block != rp.ec.block || - lp.ec.redundancy != rp.ec.redundancy || - lp.ec.idx != rp.ec.idx))) - return false; - - if (lp.crc.compression_type != rp.crc.compression_type || - lp.crc.nonce != rp.crc.nonce) - return false; - - if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= - lp.crc.uncompressed_size) { - /* can use left extent's crc entry */ - } else if (lp.crc.live_size <= rp.crc.offset) { - /* can use right extent's crc entry */ - } else { - /* check if checksums can be merged: */ - if (lp.crc.csum_type != rp.crc.csum_type || - lp.crc.nonce != rp.crc.nonce || - crc_is_compressed(lp.crc) || - !bch2_checksum_mergeable(lp.crc.csum_type)) - return false; - - if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || - rp.crc.offset) - return false; - - if (lp.crc.csum_type && - lp.crc.uncompressed_size + - rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) - return false; - } - - en_l = extent_entry_next(en_l); - en_r = extent_entry_next(en_r); - } - - en_l = l_ptrs.start; - en_r = r_ptrs.start; - while (en_l < l_ptrs.end && en_r < r_ptrs.end) { - if (extent_entry_is_crc(en_l)) { - struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - struct bch_extent_crc_unpacked crc_r = 
bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - if (crc_l.uncompressed_size + crc_r.uncompressed_size > - bch2_crc_field_size_max[extent_entry_type(en_l)]) - return false; - } - - en_l = extent_entry_next(en_l); - en_r = extent_entry_next(en_r); - } - - use_right_ptr = false; - en_l = l_ptrs.start; - en_r = r_ptrs.start; - while (en_l < l_ptrs.end) { - if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && - use_right_ptr) - en_l->ptr = en_r->ptr; - - if (extent_entry_is_crc(en_l)) { - struct bch_extent_crc_unpacked crc_l = - bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - struct bch_extent_crc_unpacked crc_r = - bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - use_right_ptr = false; - - if (crc_l.offset + crc_l.live_size + crc_r.live_size <= - crc_l.uncompressed_size) { - /* can use left extent's crc entry */ - } else if (crc_l.live_size <= crc_r.offset) { - /* can use right extent's crc entry */ - crc_r.offset -= crc_l.live_size; - bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, - extent_entry_type(en_l)); - use_right_ptr = true; - } else { - crc_l.csum = bch2_checksum_merge(crc_l.csum_type, - crc_l.csum, - crc_r.csum, - crc_r.uncompressed_size << 9); - - crc_l.uncompressed_size += crc_r.uncompressed_size; - crc_l.compressed_size += crc_r.compressed_size; - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, - extent_entry_type(en_l)); - } - } - - en_l = extent_entry_next(en_l); - en_r = extent_entry_next(en_r); - } - - bch2_key_resize(l.k, l.k->size + r.k->size); - return true; -} - -/* KEY_TYPE_reservation: */ - -int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - int ret = 0; - - bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, - c, reservation_key_nr_replicas_invalid, - "invalid nr_replicas (%u)", r.v->nr_replicas); -fsck_err: - return ret; -} - -void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - prt_printf(out, "generation %u replicas %u", - le32_to_cpu(r.v->generation), - r.v->nr_replicas); -} - -bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -{ - struct bkey_s_reservation l = bkey_s_to_reservation(_l); - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); - - if (l.v->generation != r.v->generation || - l.v->nr_replicas != r.v->nr_replicas) - return false; - - bch2_key_resize(l.k, l.k->size + r.k->size); - return true; -} - -/* Extent checksum entries: */ - -/* returns true if not equal */ -static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, - struct bch_extent_crc_unpacked r) -{ - return (l.csum_type != r.csum_type || - l.compression_type != r.compression_type || - l.compressed_size != r.compressed_size || - l.uncompressed_size != r.uncompressed_size || - l.offset != r.offset || - l.live_size != r.live_size || - l.nonce != r.nonce || - bch2_crc_cmp(l.csum, r.csum)); -} - -static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, - struct bch_extent_crc_unpacked n) -{ - return !crc_is_compressed(u) && - u.csum_type && - u.uncompressed_size > u.live_size && - bch2_csum_type_is_encryption(u.csum_type) == - bch2_csum_type_is_encryption(n.csum_type); -} - -bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, - struct bch_extent_crc_unpacked n) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union 
bch_extent_entry *i; - - if (!n.csum_type) - return false; - - bkey_for_each_crc(k.k, ptrs, crc, i) - if (can_narrow_crc(crc, n)) - return true; - - return false; -} - -/* - * We're writing another replica for this extent, so while we've got the data in - * memory we'll be computing a new checksum for the currently live data. - * - * If there are other replicas we aren't moving, and they are checksummed but - * not compressed, we can modify them to point to only the data that is - * currently live (so that readers won't have to bounce) while we've got the - * checksum we need: - */ -bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - struct bch_extent_crc_unpacked u; - struct extent_ptr_decoded p; - union bch_extent_entry *i; - bool ret = false; - - /* Find a checksum entry that covers only live data: */ - if (!n.csum_type) { - bkey_for_each_crc(&k->k, ptrs, u, i) - if (!crc_is_compressed(u) && - u.csum_type && - u.live_size == u.uncompressed_size) { - n = u; - goto found; - } - return false; - } -found: - BUG_ON(crc_is_compressed(n)); - BUG_ON(n.offset); - BUG_ON(n.live_size != k->k.size); - -restart_narrow_pointers: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - - bkey_for_each_ptr_decode(&k->k, ptrs, p, i) - if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); - p.ptr.offset += p.crc.offset; - p.crc = n; - bch2_extent_ptr_decoded_append(k, &p); - ret = true; - goto restart_narrow_pointers; - } - - return ret; -} - -static void bch2_extent_crc_pack(union bch_extent_crc *dst, - struct bch_extent_crc_unpacked src, - enum bch_extent_entry_type type) -{ -#define common_fields(_src) \ - .type = BIT(type), \ - .csum_type = _src.csum_type, \ - .compression_type = _src.compression_type, \ - ._compressed_size = _src.compressed_size - 1, \ - ._uncompressed_size = _src.uncompressed_size - 1, \ - .offset = _src.offset - - switch (type) { - case BCH_EXTENT_ENTRY_crc32: - dst->crc32 = (struct bch_extent_crc32) { - common_fields(src), - .csum = (u32 __force) *((__le32 *) &src.csum.lo), - }; - break; - case BCH_EXTENT_ENTRY_crc64: - dst->crc64 = (struct bch_extent_crc64) { - common_fields(src), - .nonce = src.nonce, - .csum_lo = (u64 __force) src.csum.lo, - .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi), - }; - break; - case BCH_EXTENT_ENTRY_crc128: - dst->crc128 = (struct bch_extent_crc128) { - common_fields(src), - .nonce = src.nonce, - .csum = src.csum, - }; - break; - default: - BUG(); - } -#undef set_common_fields -} - -void bch2_extent_crc_append(struct bkey_i *k, - struct bch_extent_crc_unpacked new) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - union bch_extent_crc *crc = (void *) ptrs.end; - enum bch_extent_entry_type type; - - if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size <= CRC32_SIZE_MAX && - new.nonce <= CRC32_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc32; - else if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size <= CRC64_SIZE_MAX && - new.nonce <= CRC64_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc64; - else if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size <= CRC128_SIZE_MAX && - new.nonce <= CRC128_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc128; - else - BUG(); - - bch2_extent_crc_pack(crc, new, type); - - k->k.u64s += extent_entry_u64s(ptrs.end); - - EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); -} - -/* Generic code for keys with pointers: */ - -unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) -{ - 
return bch2_bkey_devs(k).nr; -} - -unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) -{ - return k.k->type == KEY_TYPE_reservation - ? bkey_s_c_to_reservation(k).v->nr_replicas - : bch2_bkey_dirty_devs(k).nr; -} - -unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) -{ - unsigned ret = 0; - - if (k.k->type == KEY_TYPE_reservation) { - ret = bkey_s_c_to_reservation(k).v->nr_replicas; - } else { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - ret += !p.ptr.cached && !crc_is_compressed(p.crc); - } - - return ret; -} - -unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned ret = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && crc_is_compressed(p.crc)) - ret += p.crc.compressed_size; - - return ret; -} - -bool bch2_bkey_is_incompressible(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - - bkey_for_each_crc(k.k, ptrs, crc, entry) - if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) - return true; - return false; -} - -unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p = { 0 }; - unsigned replicas = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.ptr.cached) - continue; - - if (p.has_ec) - replicas += p.ec.redundancy; - - replicas++; - - } - - return replicas; -} - -static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p) -{ - if (p->ptr.cached) - return 0; - - return p->has_ec - ? p->ec.redundancy + 1 - : ca->mi.durability; -} - -unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) -{ - struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - - return ca ? 
__extent_ptr_durability(ca, p) : 0; -} - -unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) -{ - struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - - if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed) - return 0; - - return __extent_ptr_durability(ca, p); -} - -unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned durability = 0; - - guard(rcu)(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c, &p); - return durability; -} - -static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned durability = 0; - - guard(rcu)(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) - durability += bch2_extent_ptr_durability(c, &p); - return durability; -} - -void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) -{ - union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - union bch_extent_entry *next = extent_entry_next(entry); - - memmove_u64s(entry, next, (u64 *) end - (u64 *) next); - k->k.u64s -= extent_entry_u64s(entry); -} - -void bch2_extent_ptr_decoded_append(struct bkey_i *k, - struct extent_ptr_decoded *p) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - struct bch_extent_crc_unpacked crc = - bch2_extent_crc_unpack(&k->k, NULL); - union bch_extent_entry *pos; - - if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = ptrs.start; - goto found; - } - - bkey_for_each_crc(&k->k, ptrs, crc, pos) - if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = extent_entry_next(pos); - goto found; - } - - bch2_extent_crc_append(k, p->crc); - pos = bkey_val_end(bkey_i_to_s(k)); -found: - p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ptr)); - - if (p->has_ec) { - p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ec)); - } -} - -static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, - union bch_extent_entry *entry) -{ - union bch_extent_entry *i = ptrs.start; - - if (i == entry) - return NULL; - - while (extent_entry_next(i) != entry) - i = extent_entry_next(i); - return i; -} - -/* - * Returns pointer to the next entry after the one being dropped: - */ -void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry = to_entry(ptr), *next; - bool drop_crc = true; - - if (k.k->type == KEY_TYPE_stripe) { - ptr->dev = BCH_SB_MEMBER_INVALID; - return; - } - - EBUG_ON(ptr < &ptrs.start->ptr || - ptr >= &ptrs.end->ptr); - EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - - for (next = extent_entry_next(entry); - next != ptrs.end; - next = extent_entry_next(next)) { - if (extent_entry_is_crc(next)) { - break; - } else if (extent_entry_is_ptr(next)) { - drop_crc = false; - break; - } - } - - extent_entry_drop(k, entry); - - while ((entry = extent_entry_prev(ptrs, entry))) { - if (extent_entry_is_ptr(entry)) - break; - - if ((extent_entry_is_crc(entry) && drop_crc) || - extent_entry_is_stripe_ptr(entry)) - extent_entry_drop(k, entry); - } -} - -void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr) -{ - if (k.k->type != KEY_TYPE_stripe) { - struct 
bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == ptr->dev && p.has_ec) { - ptr->dev = BCH_SB_MEMBER_INVALID; - return; - } - } - - bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; - - bch2_bkey_drop_ptr_noerror(k, ptr); - - /* - * If we deleted all the dirty pointers and there's still cached - * pointers, we could set the cached pointers to dirty if they're not - * stale - but to do that correctly we'd need to grab an open_bucket - * reference so that we don't race with bucket reuse: - */ - if (have_dirty && - !bch2_bkey_dirty_devs(k.s_c).nr) { - k.k->type = KEY_TYPE_error; - set_bkey_val_u64s(k.k, 0); - } else if (!bch2_bkey_nr_ptrs(k.s_c)) { - k.k->type = KEY_TYPE_deleted; - set_bkey_val_u64s(k.k, 0); - } -} - -void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) -{ - bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); -} - -void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) -{ - bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev); -} - -const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == dev) - return ptr; - - return NULL; -} - -bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_dev *ca; - - guard(rcu)(); - bkey_for_each_ptr(ptrs, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (ca = bch2_dev_rcu(c, ptr->dev)) && - (!ptr->cached || - !dev_ptr_stale_rcu(ca, ptr))) - return true; - - return false; -} - -bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_extent_ptr m, u64 offset) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == m.dev && - p.ptr.gen == m.gen && - (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == - (s64) m.offset - offset) - return true; - - return false; -} - -/* - * Returns true if two extents refer to the same data: - */ -bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) -{ - if (k1.k->type != k2.k->type) - return false; - - if (bkey_extent_is_direct_data(k1.k)) { - struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); - struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); - const union bch_extent_entry *entry1, *entry2; - struct extent_ptr_decoded p1, p2; - - if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) - return false; - - bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) - bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) - if (p1.ptr.dev == p2.ptr.dev && - p1.ptr.gen == p2.ptr.gen && - - /* - * This checks that the two pointers point - * to the same region on disk - adjusting - * for the difference in where the extents - * start, since one may have been trimmed: - */ - (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && - - /* - * This additionally checks that the - * extents overlap on disk, since the - * previous check may trigger spuriously - * when one extent is immediately partially - * overwritten with another extent (so that - * on disk they are adjacent) and - * compression is in use: - */ - ((p1.ptr.offset >= p2.ptr.offset && - p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || - 
(p2.ptr.offset >= p1.ptr.offset && - p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) - return true; - - return false; - } else { - /* KEY_TYPE_deleted, etc. */ - return true; - } -} - -struct bch_extent_ptr * -bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2) -{ - struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2); - union bch_extent_entry *entry2; - struct extent_ptr_decoded p2; - - bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) - if (p1.ptr.dev == p2.ptr.dev && - p1.ptr.gen == p2.ptr.gen && - (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) - return &entry2->ptr; - - return NULL; -} - -static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, - struct bch_extent_ptr *ptr) -{ - unsigned target = opts->promote_target ?: opts->foreground_target; - - if (target && !bch2_dev_in_target(c, ptr->dev, target)) - return false; - - struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - - return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr); -} - -void bch2_extent_ptr_set_cached(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s k, - struct bch_extent_ptr *ptr) -{ - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bool have_cached_ptr; - unsigned drop_dev = ptr->dev; - - guard(rcu)(); -restart_drop_ptrs: - ptrs = bch2_bkey_ptrs(k); - have_cached_ptr = false; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - /* - * Check if it's erasure coded - stripes can't contain cached - * data. Possibly something we can fix in the future? - */ - if (&entry->ptr == ptr && p.has_ec) - goto drop; - - if (p.ptr.cached) { - if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) { - bch2_bkey_drop_ptr_noerror(k, &entry->ptr); - ptr = NULL; - goto restart_drop_ptrs; - } - - have_cached_ptr = true; - } - } - - if (!ptr) - bkey_for_each_ptr(ptrs, ptr2) - if (ptr2->dev == drop_dev) - ptr = ptr2; - - if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) - goto drop; - - ptr->cached = true; - return; -drop: - bch2_bkey_drop_ptr_noerror(k, ptr); -} - -/* - * bch2_extent_normalize - clean up an extent, dropping stale pointers etc. - * - * Returns true if @k should be dropped entirely - * - * For existing keys, only called when btree nodes are being rewritten, not when - * they're merely being compacted/resorted in memory. - */ -bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) -{ - struct bch_dev *ca; - - guard(rcu)(); - bch2_bkey_drop_ptrs(k, ptr, - ptr->cached && - (!(ca = bch2_dev_rcu(c, ptr->dev)) || - dev_ptr_stale_rcu(ca, ptr) > 0)); - - return bkey_deleted(k.k); -} - -/* - * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc. - * - * Like bch2_extent_normalize(), but also only keeps a single cached pointer on - * the promote target. 
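- *
- * Returns true if @k should be dropped entirely, as with
- * bch2_extent_normalize().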
- */ -bool bch2_extent_normalize_by_opts(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s k) -{ - struct bkey_ptrs ptrs; - bool have_cached_ptr; - - guard(rcu)(); -restart_drop_ptrs: - ptrs = bch2_bkey_ptrs(k); - have_cached_ptr = false; - - bkey_for_each_ptr(ptrs, ptr) - if (ptr->cached) { - if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) { - bch2_bkey_drop_ptr(k, ptr); - goto restart_drop_ptrs; - } - have_cached_ptr = true; - } - - return bkey_deleted(k.k); -} - -void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) -{ - out->atomic++; - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - if (!ca) { - prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : ""); - } else { - u32 offset; - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - - prt_printf(out, "ptr: %u:%llu:%u gen %u", - ptr->dev, b, offset, ptr->gen); - if (ca->mi.durability != 1) - prt_printf(out, " d=%u", ca->mi.durability); - if (ptr->cached) - prt_str(out, " cached"); - if (ptr->unwritten) - prt_str(out, " unwritten"); - int stale = dev_ptr_stale_rcu(ca, ptr); - if (stale > 0) - prt_printf(out, " stale"); - else if (stale) - prt_printf(out, " invalid"); - } - --out->atomic; -} - -void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc) -{ - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", - crc->compressed_size, - crc->uncompressed_size, - crc->offset, crc->nonce); - bch2_prt_csum_type(out, crc->csum_type); - prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo); - prt_str(out, " compress "); - bch2_prt_compression_type(out, crc->compression_type); -} - -static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, - const struct bch_extent_rebalance *r) -{ - prt_str(out, "rebalance:"); - - prt_printf(out, " replicas=%u", r->data_replicas); - if (r->data_replicas_from_inode) - prt_str(out, " (inode)"); - - prt_str(out, " checksum="); - bch2_prt_csum_opt(out, r->data_checksum); - if (r->data_checksum_from_inode) - prt_str(out, " (inode)"); - - if (r->background_compression || r->background_compression_from_inode) { - prt_str(out, " background_compression="); - bch2_compression_opt_to_text(out, r->background_compression); - - if (r->background_compression_from_inode) - prt_str(out, " (inode)"); - } - - if (r->background_target || r->background_target_from_inode) { - prt_str(out, " background_target="); - if (c) - bch2_target_to_text(out, c, r->background_target); - else - prt_printf(out, "%u", r->background_target); - - if (r->background_target_from_inode) - prt_str(out, " (inode)"); - } - - if (r->promote_target || r->promote_target_from_inode) { - prt_str(out, " promote_target="); - if (c) - bch2_target_to_text(out, c, r->promote_target); - else - prt_printf(out, "%u", r->promote_target); - - if (r->promote_target_from_inode) - prt_str(out, " (inode)"); - } - - if (r->erasure_code || r->erasure_code_from_inode) { - prt_printf(out, " ec=%u", r->erasure_code); - if (r->erasure_code_from_inode) - prt_str(out, " (inode)"); - } -} - -void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - bool first = true; - - if (c) - prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k)); - - bkey_extent_entry_for_each(ptrs, entry) { - if (!first) - 
prt_printf(out, " "); - - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry)); - break; - - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: { - struct bch_extent_crc_unpacked crc = - bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - - bch2_extent_crc_unpacked_to_text(out, &crc); - break; - } - case BCH_EXTENT_ENTRY_stripe_ptr: { - const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr; - - prt_printf(out, "ec: idx %llu block %u", - (u64) ec->idx, ec->block); - break; - } - case BCH_EXTENT_ENTRY_rebalance: - bch2_extent_rebalance_to_text(out, c, &entry->rebalance); - break; - - case BCH_EXTENT_ENTRY_flags: - prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags); - break; - - default: - prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); - return; - } - - first = false; - } -} - -static int extent_ptr_validate(struct bch_fs *c, - struct bkey_s_c k, - struct bkey_validate_context from, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) -{ - int ret = 0; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr2) - bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, - c, ptr_to_duplicate_device, - "multiple pointers to same device (%u)", ptr->dev); - - /* bad pointers are repaired by check_fix_ptrs(): */ - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - if (!ca) { - rcu_read_unlock(); - return 0; - } - u32 bucket_offset; - u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - unsigned first_bucket = ca->mi.first_bucket; - u64 nbuckets = ca->mi.nbuckets; - unsigned bucket_size = ca->mi.bucket_size; - rcu_read_unlock(); - - bkey_fsck_err_on(bucket >= nbuckets, - c, ptr_after_last_bucket, - "pointer past last bucket (%llu > %llu)", bucket, nbuckets); - bkey_fsck_err_on(bucket < first_bucket, - c, ptr_before_first_bucket, - "pointer before first bucket (%llu < %u)", bucket, first_bucket); - bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, - c, ptr_spans_multiple_buckets, - "pointer spans multiple buckets (%u + %u > %u)", - bucket_offset, size_ondisk, bucket_size); -fsck_err: - return ret; -} - -int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - unsigned size_ondisk = k.k->size; - unsigned nonce = UINT_MAX; - unsigned nr_ptrs = 0; - bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false; - int ret = 0; - - if (bkey_is_btree_ptr(k.k)) - size_ondisk = btree_sectors(c); - - bkey_extent_entry_for_each(ptrs, entry) { - bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, - c, extent_ptrs_invalid_entry, - "invalid extent entry type (got %u, max %u)", - __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); - - bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && - !extent_entry_is_ptr(entry), - c, btree_ptr_has_non_ptr, - "has non ptr field"); - - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false); - if (ret) - return ret; - - bkey_fsck_err_on(entry->ptr.cached && have_ec, - c, ptr_cached_and_erasure_coded, - "cached, erasure coded ptr"); - - if (!entry->ptr.unwritten) - have_written = true; - else - have_unwritten = true; - - have_ec = false; 
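-			/* this ptr consumed any pending crc/stripe entries */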
- crc_since_last_ptr = false; - nr_ptrs++; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - - bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), - c, ptr_crc_csum_type_unknown, - "invalid checksum type"); - bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, - c, ptr_crc_compression_type_unknown, - "invalid compression type"); - - bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, - c, ptr_crc_uncompressed_size_too_small, - "checksum offset + key size > uncompressed size"); - bkey_fsck_err_on(crc_is_encoded(crc) && - (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), - c, ptr_crc_uncompressed_size_too_big, - "too large encoded extent"); - bkey_fsck_err_on(!crc_is_compressed(crc) && - crc.compressed_size != crc.uncompressed_size, - c, ptr_crc_uncompressed_size_mismatch, - "not compressed but compressed != uncompressed size"); - - if (bch2_csum_type_is_encryption(crc.csum_type)) { - if (nonce == UINT_MAX) - nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - bkey_fsck_err(c, ptr_crc_nonce_mismatch, - "incorrect nonce"); - } - - bkey_fsck_err_on(crc_since_last_ptr, - c, ptr_crc_redundant, - "redundant crc entry"); - crc_since_last_ptr = true; - - size_ondisk = crc.compressed_size; - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - bkey_fsck_err_on(have_ec, - c, ptr_stripe_redundant, - "redundant stripe entry"); - have_ec = true; - break; - case BCH_EXTENT_ENTRY_rebalance: { - /* - * this shouldn't be a fsck error, for forward - * compatibility; the rebalance code should just refetch - * the compression opt if it's unknown - */ -#if 0 - const struct bch_extent_rebalance *r = &entry->rebalance; - - if (!bch2_compression_opt_valid(r->compression)) { - struct bch_compression_opt opt = __bch2_compression_decode(r->compression); - prt_printf(err, "invalid compression opt %u:%u", - opt.type, opt.level); - return bch_err_throw(c, invalid_bkey); - } -#endif - break; - } - case BCH_EXTENT_ENTRY_flags: - bkey_fsck_err_on(entry != ptrs.start, - c, extent_flags_not_at_start, - "extent flags entry not at start"); - break; - } - } - - bkey_fsck_err_on(!nr_ptrs, - c, extent_ptrs_no_ptrs, - "no ptrs"); - bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, - c, extent_ptrs_too_many_ptrs, - "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); - bkey_fsck_err_on(have_written && have_unwritten, - c, extent_ptrs_written_and_unwritten, - "extent with unwritten and written ptrs"); - bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, - c, extent_ptrs_unwritten, - "has unwritten ptrs"); - bkey_fsck_err_on(crc_since_last_ptr, - c, extent_ptrs_redundant_crc, - "redundant crc entry"); - bkey_fsck_err_on(have_ec, - c, extent_ptrs_redundant_stripe, - "redundant stripe entry"); -fsck_err: - return ret; -} - -void bch2_ptr_swab(struct bkey_s k) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - u64 *d; - - for (d = (u64 *) ptrs.start; - d != (u64 *) ptrs.end; - d++) - *d = swab64(*d); - - for (entry = ptrs.start; - entry < ptrs.end; - entry = extent_entry_next(entry)) { - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.csum = swab32(entry->crc32.csum); - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); - entry->crc64.csum_lo 
= swab64(entry->crc64.csum_lo); - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.csum.hi = (__force __le64) - swab64((__force u64) entry->crc128.csum.hi); - entry->crc128.csum.lo = (__force __le64) - swab64((__force u64) entry->crc128.csum.lo); - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; - case BCH_EXTENT_ENTRY_rebalance: - break; - default: - /* Bad entry type: will be caught by validate() */ - return; - } - } -} - -int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags) -{ - int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags); - if (ret) - return ret; - - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - - if (ptrs.start != ptrs.end && - extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) { - ptrs.start->flags.flags = flags; - } else { - struct bch_extent_flags f = { - .type = BIT(BCH_EXTENT_ENTRY_flags), - .flags = flags, - }; - __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f); - } - - return 0; -} - -/* Generic extent code: */ - -int bch2_cut_front_s(struct bpos where, struct bkey_s k) -{ - unsigned new_val_u64s = bkey_val_u64s(k.k); - int val_u64s_delta; - u64 sub; - - if (bkey_le(where, bkey_start_pos(k.k))) - return 0; - - EBUG_ON(bkey_gt(where, k.k->p)); - - sub = where.offset - bkey_start_offset(k.k); - - k.k->size -= sub; - - if (!k.k->size) { - k.k->type = KEY_TYPE_deleted; - new_val_u64s = 0; - } - - switch (k.k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - bool seen_crc = false; - - bkey_extent_entry_for_each(ptrs, entry) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - if (!seen_crc) - entry->ptr.offset += sub; - break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.offset += sub; - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.offset += sub; - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.offset += sub; - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - case BCH_EXTENT_ENTRY_rebalance: - case BCH_EXTENT_ENTRY_flags: - break; - } - - if (extent_entry_is_crc(entry)) - seen_crc = true; - } - - break; - } - case KEY_TYPE_reflink_p: { - struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); - - SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub); - break; - } - case KEY_TYPE_inline_data: - case KEY_TYPE_indirect_inline_data: { - void *p = bkey_inline_data_p(k); - unsigned bytes = bkey_inline_data_bytes(k.k); - - sub = min_t(u64, sub << 9, bytes); - - memmove(p, p + sub, bytes - sub); - - new_val_u64s -= sub >> 3; - break; - } - } - - val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; - BUG_ON(val_u64s_delta < 0); - - set_bkey_val_u64s(k.k, new_val_u64s); - memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); - return -val_u64s_delta; -} - -int bch2_cut_back_s(struct bpos where, struct bkey_s k) -{ - unsigned new_val_u64s = bkey_val_u64s(k.k); - int val_u64s_delta; - u64 len = 0; - - if (bkey_ge(where, k.k->p)) - return 0; - - EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); - - len = where.offset - bkey_start_offset(k.k); - - k.k->p.offset = where.offset; - k.k->size = len; - - if (!len) { - k.k->type = KEY_TYPE_deleted; - new_val_u64s = 0; - } - - switch (k.k->type) { - case KEY_TYPE_inline_data: - case KEY_TYPE_indirect_inline_data: - new_val_u64s = (bkey_inline_data_offset(k.k) + - min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; - break; - } - - val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; - BUG_ON(val_u64s_delta < 0); - - 
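-	/* shrink the value and zero out the tail that is no longer used */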
set_bkey_val_u64s(k.k, new_val_u64s); - memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); - return -val_u64s_delta; -} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h deleted file mode 100644 index b8590e51b76e..000000000000 --- a/fs/bcachefs/extents.h +++ /dev/null @@ -1,768 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EXTENTS_H -#define _BCACHEFS_EXTENTS_H - -#include "bcachefs.h" -#include "bkey.h" -#include "extents_types.h" - -struct bch_fs; -struct btree_trans; - -/* extent entries: */ - -#define extent_entry_last(_e) \ - ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) - -#define entry_to_ptr(_entry) \ -({ \ - EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ - \ - __builtin_choose_expr( \ - type_is_exact(_entry, const union bch_extent_entry *), \ - (const struct bch_extent_ptr *) (_entry), \ - (struct bch_extent_ptr *) (_entry)); \ -}) - -/* downcast, preserves const */ -#define to_entry(_entry) \ -({ \ - BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ - !type_is(_entry, struct bch_extent_ptr *) && \ - !type_is(_entry, struct bch_extent_stripe_ptr *)); \ - \ - __builtin_choose_expr( \ - (type_is_exact(_entry, const union bch_extent_crc *) || \ - type_is_exact(_entry, const struct bch_extent_ptr *) ||\ - type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ - (const union bch_extent_entry *) (_entry), \ - (union bch_extent_entry *) (_entry)); \ -}) - -#define extent_entry_next(_entry) \ - ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) - -#define extent_entry_next_safe(_entry, _end) \ - (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \ - ? extent_entry_next(_entry) \ - : _end) - -static inline unsigned -__extent_entry_type(const union bch_extent_entry *e) -{ - return e->type ? 
__ffs(e->type) : BCH_EXTENT_ENTRY_MAX; -} - -static inline enum bch_extent_entry_type -extent_entry_type(const union bch_extent_entry *e) -{ - int ret = __ffs(e->type); - - EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); - - return ret; -} - -static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) -{ - switch (extent_entry_type(entry)) { -#define x(f, n) \ - case BCH_EXTENT_ENTRY_##f: \ - return sizeof(struct bch_extent_##f); - BCH_EXTENT_ENTRY_TYPES() -#undef x - default: - BUG(); - } -} - -static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) -{ - return extent_entry_bytes(entry) / sizeof(u64); -} - -static inline void __extent_entry_insert(struct bkey_i *k, - union bch_extent_entry *dst, - union bch_extent_entry *new) -{ - union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - - memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); - k->k.u64s += extent_entry_u64s(new); - memcpy_u64s_small(dst, new, extent_entry_u64s(new)); -} - -static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) -{ - union bch_extent_entry *next = extent_entry_next(entry); - - /* stripes have ptrs, but their layout doesn't work with this code */ - BUG_ON(k.k->type == KEY_TYPE_stripe); - - memmove_u64s_down(entry, next, - (u64 *) bkey_val_end(k) - (u64 *) next); - k.k->u64s -= (u64 *) next - (u64 *) entry; -} - -static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) -{ - return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; -} - -static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) -{ - return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; -} - -static inline bool extent_entry_is_crc(const union bch_extent_entry *e) -{ - switch (__extent_entry_type(e)) { - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - return true; - default: - return false; - } -} - -union bch_extent_crc { - u8 type; - struct bch_extent_crc32 crc32; - struct bch_extent_crc64 crc64; - struct bch_extent_crc128 crc128; -}; - -#define __entry_to_crc(_entry) \ - __builtin_choose_expr( \ - type_is_exact(_entry, const union bch_extent_entry *), \ - (const union bch_extent_crc *) (_entry), \ - (union bch_extent_crc *) (_entry)) - -#define entry_to_crc(_entry) \ -({ \ - EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ - \ - __entry_to_crc(_entry); \ -}) - -static inline struct bch_extent_crc_unpacked -bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) -{ -#define common_fields(_crc) \ - .csum_type = _crc.csum_type, \ - .compression_type = _crc.compression_type, \ - .compressed_size = _crc._compressed_size + 1, \ - .uncompressed_size = _crc._uncompressed_size + 1, \ - .offset = _crc.offset, \ - .live_size = k->size - - if (!crc) - return (struct bch_extent_crc_unpacked) { - .compressed_size = k->size, - .uncompressed_size = k->size, - .live_size = k->size, - }; - - switch (extent_entry_type(to_entry(crc))) { - case BCH_EXTENT_ENTRY_crc32: { - struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { - common_fields(crc->crc32), - }; - - *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum; - return ret; - } - case BCH_EXTENT_ENTRY_crc64: { - struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { - common_fields(crc->crc64), - .nonce = crc->crc64.nonce, - .csum.lo = (__force __le64) crc->crc64.csum_lo, - }; - - *((__le16 *) &ret.csum.hi) = (__le16 __force) 
crc->crc64.csum_hi; - - return ret; - } - case BCH_EXTENT_ENTRY_crc128: { - struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { - common_fields(crc->crc128), - .nonce = crc->crc128.nonce, - .csum = crc->crc128.csum, - }; - - return ret; - } - default: - BUG(); - } -#undef common_fields -} - -static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) -{ - return (crc.compression_type != BCH_COMPRESSION_TYPE_none && - crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); -} - -static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc) -{ - return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc); -} - -void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *); - -/* bkey_ptrs: generically over any key type that has ptrs */ - -struct bkey_ptrs_c { - const union bch_extent_entry *start; - const union bch_extent_entry *end; -}; - -struct bkey_ptrs { - union bch_extent_entry *start; - union bch_extent_entry *end; -}; - -static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: { - struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); - - return (struct bkey_ptrs_c) { - to_entry(&e.v->start[0]), - to_entry(extent_entry_last(e)) - }; - } - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - - return (struct bkey_ptrs_c) { - e.v->start, - extent_entry_last(e) - }; - } - case KEY_TYPE_stripe: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - return (struct bkey_ptrs_c) { - to_entry(&s.v->ptrs[0]), - to_entry(&s.v->ptrs[s.v->nr_blocks]), - }; - } - case KEY_TYPE_reflink_v: { - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - return (struct bkey_ptrs_c) { - r.v->start, - bkey_val_end(r), - }; - } - case KEY_TYPE_btree_ptr_v2: { - struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); - - return (struct bkey_ptrs_c) { - to_entry(&e.v->start[0]), - to_entry(extent_entry_last(e)) - }; - } - default: - return (struct bkey_ptrs_c) { NULL, NULL }; - } -} - -static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) -{ - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); - - return (struct bkey_ptrs) { - (void *) p.start, - (void *) p.end - }; -} - -#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ - for ((_entry) = (_start); \ - (_entry) < (_end); \ - (_entry) = extent_entry_next_safe(_entry, _end)) - -#define __bkey_ptr_next(_ptr, _end) \ -({ \ - typeof(_end) _entry; \ - \ - __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ - if (extent_entry_is_ptr(_entry)) \ - break; \ - \ - _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ -}) - -#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ - __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) - -#define bkey_extent_entry_for_each(_p, _entry) \ - bkey_extent_entry_for_each_from(_p, _entry, _p.start) - -#define __bkey_for_each_ptr(_start, _end, _ptr) \ - for (typeof(_start) (_ptr) = (_start); \ - ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ - (_ptr)++) - -#define bkey_ptr_next(_p, _ptr) \ - __bkey_ptr_next(_ptr, (_p).end) - -#define bkey_for_each_ptr(_p, _ptr) \ - __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) - -#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ -({ \ - __label__ out; \ - \ - (_ptr).has_ec = false; \ - (_ptr).do_ec_reconstruct = false; \ - (_ptr).crc_retry_nr = 0; \ - \ - __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ - switch (__extent_entry_type(_entry)) { \ - case BCH_EXTENT_ENTRY_ptr: \ - (_ptr).ptr = _entry->ptr; \ - goto out; \ - case BCH_EXTENT_ENTRY_crc32: \ - case BCH_EXTENT_ENTRY_crc64: \ - case BCH_EXTENT_ENTRY_crc128: \ - (_ptr).crc = bch2_extent_crc_unpack(_k, \ - entry_to_crc(_entry)); \ - break; \ - case BCH_EXTENT_ENTRY_stripe_ptr: \ - (_ptr).ec = _entry->stripe_ptr; \ - (_ptr).has_ec = true; \ - break; \ - default: \ - /* nothing */ \ - break; \ - } \ -out: \ - _entry < (_end); \ -}) - -#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ - for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ - (_entry) = _start; \ - __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ - (_entry) = extent_entry_next_safe(_entry, _end)) - -#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ - __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ - _ptr, _entry) - -#define bkey_crc_next(_k, _end, _crc, _iter) \ -({ \ - __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ - if (extent_entry_is_crc(_iter)) { \ - (_crc) = bch2_extent_crc_unpack(_k, \ - entry_to_crc(_iter)); \ - break; \ - } \ - \ - (_iter) < (_end); \ -}) - -#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ - for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ - (_iter) = (_start); \ - bkey_crc_next(_k, _end, _crc, _iter); \ - (_iter) = extent_entry_next(_iter)) - -#define bkey_for_each_crc(_k, _p, _crc, _iter) \ - __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) - -/* Iterate over pointers in KEY_TYPE_extent: */ - -#define extent_ptr_next(_e, _ptr) \ - __bkey_ptr_next(_ptr, extent_entry_last(_e)) - -#define extent_for_each_ptr(_e, _ptr) \ - __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) - -#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ - __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ - extent_entry_last(_e), _ptr, _entry) - -/* utility code common to all keys with pointers: */ - -void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *, - struct bch_io_failures *); -struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, - unsigned); -void bch2_mark_io_failure(struct bch_io_failures *, - struct extent_ptr_decoded *, bool); -void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned); -int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, - struct bch_io_failures *, - struct extent_ptr_decoded *, int); - -/* KEY_TYPE_btree_ptr: */ - -int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); - -int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, - struct 
bkey_validate_context); -void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, - int, struct bkey_s); - -#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ - .key_validate = bch2_btree_ptr_validate, \ - .val_to_text = bch2_btree_ptr_to_text, \ - .swab = bch2_ptr_swab, \ - .trigger = bch2_trigger_extent, \ -}) - -#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ - .key_validate = bch2_btree_ptr_v2_validate, \ - .val_to_text = bch2_btree_ptr_v2_to_text, \ - .swab = bch2_ptr_swab, \ - .compat = bch2_btree_ptr_v2_compat, \ - .trigger = bch2_trigger_extent, \ - .min_val_size = 40, \ -}) - -/* KEY_TYPE_extent: */ - -bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); - -#define bch2_bkey_ops_extent ((struct bkey_ops) { \ - .key_validate = bch2_bkey_ptrs_validate, \ - .val_to_text = bch2_bkey_ptrs_to_text, \ - .swab = bch2_ptr_swab, \ - .key_normalize = bch2_extent_normalize, \ - .key_merge = bch2_extent_merge, \ - .trigger = bch2_trigger_extent, \ -}) - -/* KEY_TYPE_reservation: */ - -int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); - -#define bch2_bkey_ops_reservation ((struct bkey_ops) { \ - .key_validate = bch2_reservation_validate, \ - .val_to_text = bch2_reservation_to_text, \ - .key_merge = bch2_reservation_merge, \ - .trigger = bch2_trigger_reservation, \ - .min_val_size = 8, \ -}) - -/* Extent checksum entries: */ - -bool bch2_can_narrow_extent_crcs(struct bkey_s_c, - struct bch_extent_crc_unpacked); -bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); -void bch2_extent_crc_append(struct bkey_i *, - struct bch_extent_crc_unpacked); - -/* Generic code for keys with pointers: */ - -static inline bool bkey_is_btree_ptr(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - return true; - default: - return false; - } -} - -static inline bool bkey_extent_is_direct_data(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return true; - default: - return false; - } -} - -static inline bool bkey_extent_is_inline_data(const struct bkey *k) -{ - return k->type == KEY_TYPE_inline_data || - k->type == KEY_TYPE_indirect_inline_data; -} - -static inline unsigned bkey_inline_data_offset(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_inline_data: - return sizeof(struct bch_inline_data); - case KEY_TYPE_indirect_inline_data: - return sizeof(struct bch_indirect_inline_data); - default: - BUG(); - } -} - -static inline unsigned bkey_inline_data_bytes(const struct bkey *k) -{ - return bkey_val_bytes(k) - bkey_inline_data_offset(k); -} - -#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k)) - -static inline bool bkey_extent_is_data(const struct bkey *k) -{ - return bkey_extent_is_direct_data(k) || - bkey_extent_is_inline_data(k) || - k->type == KEY_TYPE_reflink_p; -} - -/* - * Should extent be counted under inode->i_sectors? 
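- * (Everything that occupies logical space in the file counts, including
- * reservations, reflink pointers and error extents.)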
- */ -static inline bool bkey_extent_is_allocation(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reservation: - case KEY_TYPE_reflink_p: - case KEY_TYPE_reflink_v: - case KEY_TYPE_inline_data: - case KEY_TYPE_indirect_inline_data: - case KEY_TYPE_error: - return true; - default: - return false; - } -} - -static inline bool bkey_extent_is_unwritten(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) - if (ptr->unwritten) - return true; - return false; -} - -static inline bool bkey_extent_is_reservation(struct bkey_s_c k) -{ - return k.k->type == KEY_TYPE_reservation || - bkey_extent_is_unwritten(k); -} - -static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(p, ptr) - ret.data[ret.nr++] = ptr->dev; - - return ret; -} - -static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(p, ptr) - if (!ptr->cached) - ret.data[ret.nr++] = ptr->dev; - - return ret; -} - -static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(p, ptr) - if (ptr->cached) - ret.data[ret.nr++] = ptr->dev; - - return ret; -} - -unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); -unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); -unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); -bool bch2_bkey_is_incompressible(struct bkey_s_c); -unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); - -unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); -unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *); -unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); -unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); - -const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned); - -static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev) -{ - return (void *) bch2_bkey_has_device_c(k.s_c, dev); -} - -bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); - -void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); - -static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) -{ - struct bch_extent_ptr *dest; - - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev)); - - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); - - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k)); - *dest = ptr; - k->k.u64s++; - break; - default: - BUG(); - } -} - -void bch2_extent_ptr_decoded_append(struct bkey_i *, - struct extent_ptr_decoded *); -void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *); -void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); - -void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); -void bch2_bkey_drop_device(struct bkey_s, unsigned); - -#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \ -do { \ - __label__ _again; \ - struct bkey_ptrs _ptrs; \ -_again: \ - _ptrs = bch2_bkey_ptrs(_k); \ - \ - 
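-	/* dropping a ptr shifts later entries down, so restart the scan */ \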
bkey_for_each_ptr(_ptrs, _ptr) \ - if (_cond) { \ - bch2_bkey_drop_ptr_noerror(_k, _ptr); \ - goto _again; \ - } \ -} while (0) - -#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ -do { \ - __label__ _again; \ - struct bkey_ptrs _ptrs; \ -_again: \ - _ptrs = bch2_bkey_ptrs(_k); \ - \ - bkey_for_each_ptr(_ptrs, _ptr) \ - if (_cond) { \ - bch2_bkey_drop_ptr(_k, _ptr); \ - goto _again; \ - } \ -} while (0) - -bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, - struct bch_extent_ptr, u64); -bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); -struct bch_extent_ptr * -bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); - -void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *, - struct bkey_s, struct bch_extent_ptr *); - -bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s); -bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); - -void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); -void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); -int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); - -static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, - struct bch_extent_ptr ptr2) -{ - return (ptr1.cached == ptr2.cached && - ptr1.unwritten == ptr2.unwritten && - ptr1.offset == ptr2.offset && - ptr1.dev == ptr2.dev && - ptr1.gen == ptr2.gen); -} - -void bch2_ptr_swab(struct bkey_s); - -/* Generic extent code: */ - -enum bch_extent_overlap { - BCH_EXTENT_OVERLAP_ALL = 0, - BCH_EXTENT_OVERLAP_BACK = 1, - BCH_EXTENT_OVERLAP_FRONT = 2, - BCH_EXTENT_OVERLAP_MIDDLE = 3, -}; - -/* Returns how k overlaps with m */ -static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, - const struct bkey *m) -{ - int cmp1 = bkey_lt(k->p, m->p); - int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m)); - - return (cmp1 << 1) + cmp2; -} - -int bch2_cut_front_s(struct bpos, struct bkey_s); -int bch2_cut_back_s(struct bpos, struct bkey_s); - -static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) -{ - bch2_cut_front_s(where, bkey_i_to_s(k)); -} - -static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) -{ - bch2_cut_back_s(where, bkey_i_to_s(k)); -} - -/** - * bch_key_resize - adjust size of @k - * - * bkey_start_offset(k) will be preserved, modifies where the extent ends - */ -static inline void bch2_key_resize(struct bkey *k, unsigned new_size) -{ - k->p.offset -= k->size; - k->p.offset += new_size; - k->size = new_size; -} - -static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs) -{ - if (ptrs.start != ptrs.end && - extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) - return ptrs.start->flags.flags; - return 0; -} - -static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k) -{ - return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k)); -} - -int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64); - -#endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h deleted file mode 100644 index 74c0252cbd98..000000000000 --- a/fs/bcachefs/extents_format.h +++ /dev/null @@ -1,304 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EXTENTS_FORMAT_H -#define _BCACHEFS_EXTENTS_FORMAT_H - -/* - * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally - * preceded by checksum/compression information (bch_extent_crc32 or - * 
bch_extent_crc64). - * - * One major determining factor in the format of extents is how we handle and - * represent extents that have been partially overwritten and thus trimmed: - * - * If an extent is not checksummed or compressed, when the extent is trimmed we - * don't have to remember the extent we originally allocated and wrote: we can - * merely adjust ptr->offset to point to the start of the data that is currently - * live. The size field in struct bkey records the current (live) size of the - * extent, and is also used to mean "size of region on disk that we point to" in - * this case. - * - * Thus an extent that is not checksummed or compressed will consist only of a - * list of bch_extent_ptrs, with none of the fields in - * bch_extent_crc32/bch_extent_crc64. - * - * When an extent is checksummed or compressed, it's not possible to read only - * the data that is currently live: we have to read the entire extent that was - * originally written, and then return only the part of the extent that is - * currently live. - * - * Thus, in addition to the current size of the extent in struct bkey, we need - * to store the size of the originally allocated space - this is the - * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, - * when the extent is trimmed, instead of modifying the offset field of the - * pointer, we keep a second smaller offset field - "offset into the original - * extent of the currently live region". - * - * The other major determining factor is replication and data migration: - * - * Each pointer may have its own bch_extent_crc32/64. When doing a replicated - * write, we will initially write all the replicas in the same format, with the - * same checksum type and compression format - however, when copygc runs later (or - * tiering/cache promotion, anything that moves data), it is not in general - * going to rewrite all the pointers at once - one of the replicas may be in a - * bucket on one device that has very little fragmentation while another lives - * in a bucket that has become heavily fragmented, and thus is being rewritten - * sooner than the rest. - * - * Thus it will only move a subset of the pointers (or in the case of - * tiering/cache promotion perhaps add a single pointer without dropping any - * current pointers), and if the extent has been partially overwritten it must - * write only the currently live portion (or copygc would not be able to reduce - * fragmentation!) - which necessitates a different bch_extent_crc format for - * the new pointer. - * - * But in the interests of space efficiency, we don't want to store one - * bch_extent_crc for each pointer if we don't have to. - * - * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and - * bch_extent_ptrs appended arbitrarily one after the other. We determine the - * type of a given entry with a scheme similar to utf8 (except we're encoding a - * type, not a size), encoding the type in the position of the first set bit: - * - * bch_extent_crc32 - 0b1 - * bch_extent_ptr - 0b10 - * bch_extent_crc64 - 0b100 - * - * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and - * bch_extent_crc64 is the least constrained). - * - * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, - * until the next bch_extent_crc32/64. - * - * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer - * is neither checksummed nor compressed. 
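- *
- * For example, the layout [crc32][ptr][ptr][crc64][ptr] describes three
- * replicas: the crc32 entry covers the first two pointers and the crc64
- * entry covers the third.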
- */ - -#define BCH_EXTENT_ENTRY_TYPES() \ - x(ptr, 0) \ - x(crc32, 1) \ - x(crc64, 2) \ - x(crc128, 3) \ - x(stripe_ptr, 4) \ - x(rebalance, 5) \ - x(flags, 6) -#define BCH_EXTENT_ENTRY_MAX 7 - -enum bch_extent_entry_type { -#define x(f, n) BCH_EXTENT_ENTRY_##f = n, - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -/* Compressed/uncompressed size are stored biased by 1: */ -struct bch_extent_crc32 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u32 type:2, - _compressed_size:7, - _uncompressed_size:7, - offset:7, - _unused:1, - csum_type:4, - compression_type:4; - __u32 csum; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u32 csum; - __u32 compression_type:4, - csum_type:4, - _unused:1, - offset:7, - _uncompressed_size:7, - _compressed_size:7, - type:2; -#endif -} __packed __aligned(8); - -#define CRC32_SIZE_MAX (1U << 7) -#define CRC32_NONCE_MAX 0 - -struct bch_extent_crc64 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:3, - _compressed_size:9, - _uncompressed_size:9, - offset:9, - nonce:10, - csum_type:4, - compression_type:4, - csum_hi:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 csum_hi:16, - compression_type:4, - csum_type:4, - nonce:10, - offset:9, - _uncompressed_size:9, - _compressed_size:9, - type:3; -#endif - __u64 csum_lo; -} __packed __aligned(8); - -#define CRC64_SIZE_MAX (1U << 9) -#define CRC64_NONCE_MAX ((1U << 10) - 1) - -struct bch_extent_crc128 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:4, - _compressed_size:13, - _uncompressed_size:13, - offset:13, - nonce:13, - csum_type:4, - compression_type:4; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 compression_type:4, - csum_type:4, - nonce:13, - offset:13, - _uncompressed_size:13, - _compressed_size:13, - type:4; -#endif - struct bch_csum csum; -} __packed __aligned(8); - -#define CRC128_SIZE_MAX (1U << 13) -#define CRC128_NONCE_MAX ((1U << 13) - 1) - -/* - * @reservation - pointer hasn't been written to, just reserved - */ -struct bch_extent_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:1, - cached:1, - unused:1, - unwritten:1, - offset:44, /* 8 petabytes */ - dev:8, - gen:8; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 gen:8, - dev:8, - offset:44, - unwritten:1, - unused:1, - cached:1, - type:1; -#endif -} __packed __aligned(8); - -struct bch_extent_stripe_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:5, - block:8, - redundancy:4, - idx:47; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 idx:47, - redundancy:4, - block:8, - type:5; -#endif -}; - -#define BCH_EXTENT_FLAGS() \ - x(poisoned, 0) - -enum bch_extent_flags_e { -#define x(n, v) BCH_EXTENT_FLAG_##n = v, - BCH_EXTENT_FLAGS() -#undef x -}; - -struct bch_extent_flags { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:7, - flags:57; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 flags:57, - type:7; -#endif -}; - -/* bch_extent_rebalance: */ -#include "rebalance_format.h" - -union bch_extent_entry { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 - unsigned long type; -#elif __BITS_PER_LONG == 32 - struct { - unsigned long pad; - unsigned long type; - }; -#else -#error edit for your odd byteorder. 
-#endif - -#define x(f, n) struct bch_extent_##f f; - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -struct bch_btree_ptr { - struct bch_val v; - - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -struct bch_btree_ptr_v2 { - struct bch_val v; - - __u64 mem_ptr; - __le64 seq; - __le16 sectors_written; - __le16 flags; - struct bpos min_key; - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); - -struct bch_extent { - struct bch_val v; - - __u64 _data[0]; - union bch_extent_entry start[]; -} __packed __aligned(8); - -/* Maximum size (in u64s) a single pointer could be: */ -#define BKEY_EXTENT_PTR_U64s_MAX\ - ((sizeof(struct bch_extent_crc128) + \ - sizeof(struct bch_extent_ptr)) / sizeof(__u64)) - -/* Maximum possible size of an entire extent value: */ -#define BKEY_EXTENT_VAL_U64s_MAX \ - (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) - -/* * Maximum possible size of an entire extent, key + value: */ -#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) - -/* Btree pointers don't carry around checksums: */ -#define BKEY_BTREE_PTR_VAL_U64s_MAX \ - ((sizeof(struct bch_btree_ptr_v2) + \ - sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) -#define BKEY_BTREE_PTR_U64s_MAX \ - (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) - -struct bch_reservation { - struct bch_val v; - - __le32 generation; - __u8 nr_replicas; - __u8 pad[3]; -} __packed __aligned(8); - -struct bch_inline_data { - struct bch_val v; - u8 data[]; -}; - -#endif /* _BCACHEFS_EXTENTS_FORMAT_H */ diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h deleted file mode 100644 index b23ce4a373c0..000000000000 --- a/fs/bcachefs/extents_types.h +++ /dev/null @@ -1,42 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EXTENTS_TYPES_H -#define _BCACHEFS_EXTENTS_TYPES_H - -#include "bcachefs_format.h" - -struct bch_extent_crc_unpacked { - u32 compressed_size; - u32 uncompressed_size; - u32 live_size; - - u8 csum_type; - u8 compression_type; - - u16 offset; - - u16 nonce; - - struct bch_csum csum; -}; - -struct extent_ptr_decoded { - bool has_ec; - bool do_ec_reconstruct; - u8 crc_retry_nr; - struct bch_extent_crc_unpacked crc; - struct bch_extent_ptr ptr; - struct bch_extent_stripe_ptr ec; -}; - -struct bch_io_failures { - u8 nr; - struct bch_dev_io_failures { - u8 dev; - unsigned failed_csum_nr:6, - failed_io:1, - failed_btree_validate:1, - failed_ec:1; - } devs[BCH_REPLICAS_MAX + 1]; -}; - -#endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c deleted file mode 100644 index 0e742555cb0a..000000000000 --- a/fs/bcachefs/eytzinger.c +++ /dev/null @@ -1,315 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "eytzinger.h" - -/** - * is_aligned - is this pointer & size okay for word-wide copying? - * @base: pointer to data - * @size: size of each element - * @align: required alignment (typically 4 or 8) - * - * Returns true if elements can be copied using word loads and stores. - * The size must be a multiple of the alignment, and the base address must - * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. - * - * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" - * to "if ((a | b) & mask)", so we do that by hand. 
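- * (lsbits accumulates the low bits of the size and, when unaligned access
- * is not efficient, of the base address, so a single mask test covers both.)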
- */ -__attribute_const__ __always_inline -static bool is_aligned(const void *base, size_t size, unsigned char align) -{ - unsigned char lsbits = (unsigned char)size; - - (void)base; -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - lsbits |= (unsigned char)(uintptr_t)base; -#endif - return (lsbits & (align - 1)) == 0; -} - -/** - * swap_words_32 - swap two elements in 32-bit chunks - * @a: pointer to the first element to swap - * @b: pointer to the second element to swap - * @n: element size (must be a multiple of 4) - * - * Exchange the two objects in memory. This exploits base+index addressing, - * which basically all CPUs have, to minimize loop overhead computations. - * - * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the - * bottom of the loop, even though the zero flag is still valid from the - * subtract (since the intervening mov instructions don't alter the flags). - * Gcc 8.1.0 doesn't have that problem. - */ -static void swap_words_32(void *a, void *b, size_t n) -{ - do { - u32 t = *(u32 *)(a + (n -= 4)); - *(u32 *)(a + n) = *(u32 *)(b + n); - *(u32 *)(b + n) = t; - } while (n); -} - -/** - * swap_words_64 - swap two elements in 64-bit chunks - * @a: pointer to the first element to swap - * @b: pointer to the second element to swap - * @n: element size (must be a multiple of 8) - * - * Exchange the two objects in memory. This exploits base+index - * addressing, which basically all CPUs have, to minimize loop overhead - * computations. - * - * We'd like to use 64-bit loads if possible. If they're not, emulating - * one requires base+index+4 addressing which x86 has but most other - * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, - * but it's possible to have 64-bit loads without 64-bit pointers (e.g. - * x32 ABI). Are there any cases the kernel needs to worry about? - */ -static void swap_words_64(void *a, void *b, size_t n) -{ - do { -#ifdef CONFIG_64BIT - u64 t = *(u64 *)(a + (n -= 8)); - *(u64 *)(a + n) = *(u64 *)(b + n); - *(u64 *)(b + n) = t; -#else - /* Use two 32-bit transfers to avoid base+index+4 addressing */ - u32 t = *(u32 *)(a + (n -= 4)); - *(u32 *)(a + n) = *(u32 *)(b + n); - *(u32 *)(b + n) = t; - - t = *(u32 *)(a + (n -= 4)); - *(u32 *)(a + n) = *(u32 *)(b + n); - *(u32 *)(b + n) = t; -#endif - } while (n); -} - -/** - * swap_bytes - swap two elements a byte at a time - * @a: pointer to the first element to swap - * @b: pointer to the second element to swap - * @n: element size - * - * This is the fallback if alignment doesn't allow using larger chunks. - */ -static void swap_bytes(void *a, void *b, size_t n) -{ - do { - char t = ((char *)a)[--n]; - ((char *)a)[n] = ((char *)b)[n]; - ((char *)b)[n] = t; - } while (n); -} - -/* - * The values are arbitrary as long as they can't be confused with - * a pointer, but small integers make for the smallest compare - * instructions. - */ -#define SWAP_WORDS_64 (swap_r_func_t)0 -#define SWAP_WORDS_32 (swap_r_func_t)1 -#define SWAP_BYTES (swap_r_func_t)2 -#define SWAP_WRAPPER (swap_r_func_t)3 - -struct wrapper { - cmp_func_t cmp; - swap_func_t swap_func; -}; - -/* - * The function pointer is last to make tail calls most efficient if the - * compiler decides not to inline this function. 
- */ -static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) -{ - if (swap_func == SWAP_WRAPPER) { - ((const struct wrapper *)priv)->swap_func(a, b, (int)size); - return; - } - - if (swap_func == SWAP_WORDS_64) - swap_words_64(a, b, size); - else if (swap_func == SWAP_WORDS_32) - swap_words_32(a, b, size); - else if (swap_func == SWAP_BYTES) - swap_bytes(a, b, size); - else - swap_func(a, b, (int)size, priv); -} - -#define _CMP_WRAPPER ((cmp_r_func_t)0L) - -static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) -{ - if (cmp == _CMP_WRAPPER) - return ((const struct wrapper *)priv)->cmp(a, b); - return cmp(a, b, priv); -} - -static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size, - cmp_r_func_t cmp_func, const void *priv, - size_t l, size_t r) -{ - return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size, - base1 + inorder_to_eytzinger1(r, n) * size, - cmp_func, priv); -} - -static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size, - swap_r_func_t swap_func, const void *priv, - size_t l, size_t r) -{ - do_swap(base1 + inorder_to_eytzinger1(l, n) * size, - base1 + inorder_to_eytzinger1(r, n) * size, - size, swap_func, priv); -} - -static void eytzinger1_sort_r(void *base1, size_t n, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) -{ - unsigned i, j, k; - - /* called from 'sort' without swap function, let's pick the default */ - if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) - swap_func = NULL; - - if (!swap_func) { - if (is_aligned(base1, size, 8)) - swap_func = SWAP_WORDS_64; - else if (is_aligned(base1, size, 4)) - swap_func = SWAP_WORDS_32; - else - swap_func = SWAP_BYTES; - } - - /* heapify */ - for (i = n / 2; i >= 1; --i) { - /* Find the sift-down path all the way to the leaves. */ - for (j = i; k = j * 2, k < n;) - j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - - /* Special case for the last leaf with no sibling. */ - if (j * 2 == n) - j *= 2; - - /* Backtrack to the correct location. */ - while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0) - j /= 2; - - /* Shift the element into its correct place. */ - for (k = j; j != i;) { - j /= 2; - eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); - } - } - - /* sort */ - for (i = n; i > 1; --i) { - eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i); - - /* Find the sift-down path all the way to the leaves. */ - for (j = 1; k = j * 2, k + 1 < i;) - j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - - /* Special case for the last leaf with no sibling. */ - if (j * 2 + 1 == i) - j *= 2; - - /* Backtrack to the correct location. */ - while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0) - j /= 2; - - /* Shift the element into its correct place. 
*/ - for (k = j; j > 1;) { - j /= 2; - eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); - } - } -} - -void eytzinger0_sort_r(void *base, size_t n, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) -{ - void *base1 = base - size; - - return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv); -} - -void eytzinger0_sort(void *base, size_t n, size_t size, - cmp_func_t cmp_func, - swap_func_t swap_func) -{ - struct wrapper w = { - .cmp = cmp_func, - .swap_func = swap_func, - }; - - return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); -} - -#if 0 -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/ktime.h> - -static u64 cmp_count; - -static int mycmp(const void *a, const void *b) -{ - u32 _a = *(u32 *)a; - u32 _b = *(u32 *)b; - - cmp_count++; - if (_a < _b) - return -1; - else if (_a > _b) - return 1; - else - return 0; -} - -static int test(void) -{ - size_t N, i; - ktime_t start, end; - s64 delta; - u32 *arr; - - for (N = 10000; N <= 100000; N += 10000) { - arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL); - cmp_count = 0; - - for (i = 0; i < N; i++) - arr[i] = get_random_u32(); - - start = ktime_get(); - eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL); - end = ktime_get(); - - delta = ktime_us_delta(end, start); - printk(KERN_INFO "time: %lld\n", delta); - printk(KERN_INFO "comparisons: %lld\n", cmp_count); - - u32 prev = 0; - - eytzinger0_for_each(i, N) { - if (prev > arr[i]) - goto err; - prev = arr[i]; - } - - kfree(arr); - } - return 0; - -err: - kfree(arr); - return -1; -} -#endif diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h deleted file mode 100644 index 643c1f716061..000000000000 --- a/fs/bcachefs/eytzinger.h +++ /dev/null @@ -1,300 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _EYTZINGER_H -#define _EYTZINGER_H - -#include <linux/bitops.h> -#include <linux/log2.h> - -#ifdef EYTZINGER_DEBUG -#include <linux/bug.h> -#define EYTZINGER_BUG_ON(cond) BUG_ON(cond) -#else -#define EYTZINGER_BUG_ON(cond) -#endif - -/* - * Traversal for trees in eytzinger layout - a full binary tree layed out in an - * array. - * - * Consider using an eytzinger tree any time you would otherwise be doing binary - * search over an array. Binary search is a worst case scenario for branch - * prediction and prefetching, but in an eytzinger tree every node's children - * are adjacent in memory, thus we can prefetch children before knowing the - * result of the comparison, assuming multiple nodes fit on a cacheline. - * - * Two variants are provided, for one based indexing and zero based indexing. - * - * Zero based indexing is more convenient, but one based indexing has better - * alignment and thus better performance because each new level of the tree - * starts at a power of two, and thus if element 0 was cacheline aligned, each - * new level will be as well. - */ - -static inline unsigned eytzinger1_child(unsigned i, unsigned child) -{ - EYTZINGER_BUG_ON(child > 1); - - return (i << 1) + child; -} - -static inline unsigned eytzinger1_left_child(unsigned i) -{ - return eytzinger1_child(i, 0); -} - -static inline unsigned eytzinger1_right_child(unsigned i) -{ - return eytzinger1_child(i, 1); -} - -static inline unsigned eytzinger1_first(unsigned size) -{ - return size ? 
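To make the one-based helpers being defined here concrete, a size-7 example of the layout they traverse (the values are illustrative):

    /*
     * index:   1   2   3   4   5   6   7
     * value:  40  20  60  10  30  50  70
     *
     * Node i has children at 2i and 2i+1, so index 1 holds the median and
     * the leftmost leaf holds the minimum: eytzinger1_first(7) == 4,
     * eytzinger1_last(7) == 7, and repeated eytzinger1_next() visits
     * indices 4, 2, 5, 1, 6, 3, 7 -- values 10 through 70 in order.
     */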
rounddown_pow_of_two(size) : 0; -} - -static inline unsigned eytzinger1_last(unsigned size) -{ - return rounddown_pow_of_two(size + 1) - 1; -} - -static inline unsigned eytzinger1_next(unsigned i, unsigned size) -{ - EYTZINGER_BUG_ON(i == 0 || i > size); - - if (eytzinger1_right_child(i) <= size) { - i = eytzinger1_right_child(i); - - i <<= __fls(size) - __fls(i); - i >>= i > size; - } else { - i >>= ffz(i) + 1; - } - - return i; -} - -static inline unsigned eytzinger1_prev(unsigned i, unsigned size) -{ - EYTZINGER_BUG_ON(i == 0 || i > size); - - if (eytzinger1_left_child(i) <= size) { - i = eytzinger1_left_child(i) + 1; - - i <<= __fls(size) - __fls(i); - i -= 1; - i >>= i > size; - } else { - i >>= __ffs(i) + 1; - } - - return i; -} - -static inline unsigned eytzinger1_extra(unsigned size) -{ - return size - ? (size + 1 - rounddown_pow_of_two(size)) << 1 - : 0; -} - -static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, - unsigned extra) -{ - unsigned b = __fls(i); - unsigned shift = __fls(size) - b; - int s; - - EYTZINGER_BUG_ON(!i || i > size); - - i ^= 1U << b; - i <<= 1; - i |= 1; - i <<= shift; - - /* - * sign bit trick: - * - * if (i > extra) - * i -= (i - extra) >> 1; - */ - s = extra - i; - i += (s >> 1) & (s >> 31); - - return i; -} - -static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, - unsigned extra) -{ - unsigned shift; - int s; - - EYTZINGER_BUG_ON(!i || i > size); - - /* - * sign bit trick: - * - * if (i > extra) - * i += i - extra; - */ - s = extra - i; - i -= s & (s >> 31); - - shift = __ffs(i); - - i >>= shift + 1; - i |= 1U << (__fls(size) - shift); - - return i; -} - -static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) -{ - return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); -} - -static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) -{ - return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); -} - -#define eytzinger1_for_each(_i, _size) \ - for (unsigned (_i) = eytzinger1_first((_size)); \ - (_i) != 0; \ - (_i) = eytzinger1_next((_i), (_size))) - -/* Zero based indexing version: */ - -static inline unsigned eytzinger0_child(unsigned i, unsigned child) -{ - EYTZINGER_BUG_ON(child > 1); - - return (i << 1) + 1 + child; -} - -static inline unsigned eytzinger0_left_child(unsigned i) -{ - return eytzinger0_child(i, 0); -} - -static inline unsigned eytzinger0_right_child(unsigned i) -{ - return eytzinger0_child(i, 1); -} - -static inline unsigned eytzinger0_first(unsigned size) -{ - return eytzinger1_first(size) - 1; -} - -static inline unsigned eytzinger0_last(unsigned size) -{ - return eytzinger1_last(size) - 1; -} - -static inline unsigned eytzinger0_next(unsigned i, unsigned size) -{ - return eytzinger1_next(i + 1, size) - 1; -} - -static inline unsigned eytzinger0_prev(unsigned i, unsigned size) -{ - return eytzinger1_prev(i + 1, size) - 1; -} - -static inline unsigned eytzinger0_extra(unsigned size) -{ - return eytzinger1_extra(size); -} - -static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, - unsigned extra) -{ - return __eytzinger1_to_inorder(i + 1, size, extra) - 1; -} - -static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, - unsigned extra) -{ - return __inorder_to_eytzinger1(i + 1, size, extra) - 1; -} - -static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) -{ - return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); -} - -static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned 
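A usage sketch of the zero-based API, together with eytzinger0_sort() from eytzinger.c (cmp_u32() and eytzinger_demo() are made-up names for illustration):

    static int cmp_u32(const void *a, const void *b)
    {
            u32 l = *(const u32 *)a, r = *(const u32 *)b;

            return l < r ? -1 : l > r ? 1 : 0;
    }

    static void eytzinger_demo(void)
    {
            u32 nums[] = { 3, 1, 4, 1, 5, 9, 2, 6 };

            /* rearrange into eytzinger order (inorder == ascending) */
            eytzinger0_sort(nums, ARRAY_SIZE(nums), sizeof(nums[0]),
                            cmp_u32, NULL);

            /* visits elements in ascending order; the macro declares i */
            eytzinger0_for_each(i, ARRAY_SIZE(nums))
                    pr_info("%u\n", nums[i]);
    }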
size) -{ - return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); -} - -#define eytzinger0_for_each(_i, _size) \ - for (unsigned (_i) = eytzinger0_first((_size)); \ - (_i) != -1; \ - (_i) = eytzinger0_next((_i), (_size))) - -#define eytzinger0_for_each_prev(_i, _size) \ - for (unsigned (_i) = eytzinger0_last((_size)); \ - (_i) != -1; \ - (_i) = eytzinger0_prev((_i), (_size))) - -/* return greatest node <= @search, or -1 if not found */ -static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) -{ - void *base1 = base - size; - unsigned n = 1; - - while (n <= nr) - n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); - n >>= __ffs(n) + 1; - return n - 1; -} - -/* return smallest node > @search, or -1 if not found */ -static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) -{ - void *base1 = base - size; - unsigned n = 1; - - while (n <= nr) - n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); - n >>= __ffs(n + 1) + 1; - return n - 1; -} - -/* return smallest node >= @search, or -1 if not found */ -static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) -{ - void *base1 = base - size; - unsigned n = 1; - - while (n <= nr) - n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0); - n >>= __ffs(n + 1) + 1; - return n - 1; -} - -#define eytzinger0_find(base, nr, size, _cmp, search) \ -({ \ - size_t _size = (size); \ - void *_base1 = (void *)(base) - _size; \ - const void *_search = (search); \ - size_t _nr = (nr); \ - size_t _i = 1; \ - int _res; \ - \ - while (_i <= _nr && \ - (_res = _cmp(_search, _base1 + _i * _size))) \ - _i = eytzinger1_child(_i, _res > 0); \ - _i - 1; \ -}) - -void eytzinger0_sort_r(void *, size_t, size_t, - cmp_r_func_t, swap_r_func_t, const void *); -void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t); - -#endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c deleted file mode 100644 index 2faec143eb31..000000000000 --- a/fs/bcachefs/fast_list.c +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* - * Fast, unordered lists - * - * Supports add, remove, and iterate - * - * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot - * allocation and freeing. - * - * This means that adding, removing, and iterating over items is lockless, - * except when refilling/emptying the percpu slot buffers. - */ - -#include "fast_list.h" - -struct fast_list_pcpu { - u32 nr; - u32 entries[31]; -}; - -static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp) -{ - int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp); - if (unlikely(idx < 0)) - return 0; - - if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) { - ida_free(&l->slots_allocated, idx); - return 0; - } - - return idx; -} - -/** - * fast_list_get_idx - get a slot in a fast_list - * @l: list to get slot in - * - * This allocates a slot in the radix tree without storing to it, so that we can - * take the potential memory allocation failure early and do the list add later - * when we can't take an allocation failure. 
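A caller-side sketch of the pattern this split API enables (track_object() is illustrative, not a real caller):

    static int track_object(struct fast_list *l, void *obj)
    {
            int idx = fast_list_get_idx(l); /* may fail: take the error here */
            if (idx < 0)
                    return idx;

            /* ... past the point where failure can still be handled ... */

            *genradix_ptr_inlined(&l->items, idx) = obj;    /* cannot fail */
            return idx;
    }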
- * - * Returns: positive integer on success, -ENOMEM on failure - */ -int fast_list_get_idx(struct fast_list *l) -{ - unsigned long flags; - int idx; -retry: - local_irq_save(flags); - struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); - - if (unlikely(!lp->nr)) { - u32 entries[16], nr = 0; - - local_irq_restore(flags); - while (nr < ARRAY_SIZE(entries) && - (idx = fast_list_alloc_idx(l, GFP_KERNEL))) - entries[nr++] = idx; - local_irq_save(flags); - - lp = this_cpu_ptr(l->buffer); - - while (nr && lp->nr < ARRAY_SIZE(lp->entries)) - lp->entries[lp->nr++] = entries[--nr]; - - if (unlikely(nr)) { - local_irq_restore(flags); - while (nr) - ida_free(&l->slots_allocated, entries[--nr]); - goto retry; - } - - if (unlikely(!lp->nr)) { - local_irq_restore(flags); - return -ENOMEM; - } - } - - idx = lp->entries[--lp->nr]; - local_irq_restore(flags); - - return idx; -} - -/** - * fast_list_add - add an item to a fast_list - * @l: list - * @item: item to add - * - * Allocates a slot in the radix tree and stores to it and then returns the - * slot index, which must be passed to fast_list_remove(). - * - * Returns: positive integer on success, -ENOMEM on failure - */ -int fast_list_add(struct fast_list *l, void *item) -{ - int idx = fast_list_get_idx(l); - if (idx < 0) - return idx; - - *genradix_ptr_inlined(&l->items, idx) = item; - return idx; -} - -/** - * fast_list_remove - remove an item from a fast_list - * @l: list - * @idx: item's slot index - * - * Zeroes out the slot in the radix tree and frees the slot for future - * fast_list_add() operations. - */ -void fast_list_remove(struct fast_list *l, unsigned idx) -{ - u32 entries[16], nr = 0; - unsigned long flags; - - if (!idx) - return; - - *genradix_ptr_inlined(&l->items, idx) = NULL; - - local_irq_save(flags); - struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); - - if (unlikely(lp->nr == ARRAY_SIZE(lp->entries))) - while (nr < ARRAY_SIZE(entries)) - entries[nr++] = lp->entries[--lp->nr]; - - lp->entries[lp->nr++] = idx; - local_irq_restore(flags); - - if (unlikely(nr)) - while (nr) - ida_free(&l->slots_allocated, entries[--nr]); -} - -void fast_list_exit(struct fast_list *l) -{ - /* XXX: warn if list isn't empty */ - free_percpu(l->buffer); - ida_destroy(&l->slots_allocated); - genradix_free(&l->items); -} - -int fast_list_init(struct fast_list *l) -{ - genradix_init(&l->items); - ida_init(&l->slots_allocated); - l->buffer = alloc_percpu(*l->buffer); - if (!l->buffer) - return -ENOMEM; - return 0; -} diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h deleted file mode 100644 index 73c9bf591fd6..000000000000 --- a/fs/bcachefs/fast_list.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef _LINUX_FAST_LIST_H -#define _LINUX_FAST_LIST_H - -#include <linux/generic-radix-tree.h> -#include <linux/idr.h> -#include <linux/percpu.h> - -struct fast_list_pcpu; - -struct fast_list { - GENRADIX(void *) items; - struct ida slots_allocated;; - struct fast_list_pcpu __percpu - *buffer; -}; - -static inline void *fast_list_iter_peek(struct genradix_iter *iter, - struct fast_list *list) -{ - void **p; - while ((p = genradix_iter_peek(iter, &list->items)) && !*p) - genradix_iter_advance(iter, &list->items); - - return p ? 
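For the consumer side, a sketch using the iteration macros defined just below (print_all() is illustrative):

    static void print_all(struct fast_list *l)
    {
            struct genradix_iter iter;
            void *obj;

            fast_list_for_each(l, iter, obj)
                    pr_info("slot %zu: %p\n", iter.pos, obj);
    }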
*p : NULL; -} - -#define fast_list_for_each_from(_list, _iter, _i, _start) \ - for (_iter = genradix_iter_init(&(_list)->items, _start); \ - (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \ - genradix_iter_advance(&(_iter), &(_list)->items)) - -#define fast_list_for_each(_list, _iter, _i) \ - fast_list_for_each_from(_list, _iter, _i, 0) - -int fast_list_get_idx(struct fast_list *l); -int fast_list_add(struct fast_list *l, void *item); -void fast_list_remove(struct fast_list *l, unsigned idx); -void fast_list_exit(struct fast_list *l); -int fast_list_init(struct fast_list *l); - -#endif /* _LINUX_FAST_LIST_H */ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h deleted file mode 100644 index d8153fe27037..000000000000 --- a/fs/bcachefs/fifo.h +++ /dev/null @@ -1,127 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FIFO_H -#define _BCACHEFS_FIFO_H - -#include "util.h" - -#define FIFO(type) \ -struct { \ - size_t front, back, size, mask; \ - type *data; \ -} - -#define DECLARE_FIFO(type, name) FIFO(type) name - -#define fifo_buf_size(fifo) \ - ((fifo)->size \ - ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ - : 0) - -#define init_fifo(fifo, _size, _gfp) \ -({ \ - (fifo)->front = (fifo)->back = 0; \ - (fifo)->size = (_size); \ - (fifo)->mask = (fifo)->size \ - ? roundup_pow_of_two((fifo)->size) - 1 \ - : 0; \ - (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \ -}) - -#define free_fifo(fifo) \ -do { \ - kvfree((fifo)->data); \ - (fifo)->data = NULL; \ -} while (0) - -#define fifo_swap(l, r) \ -do { \ - swap((l)->front, (r)->front); \ - swap((l)->back, (r)->back); \ - swap((l)->size, (r)->size); \ - swap((l)->mask, (r)->mask); \ - swap((l)->data, (r)->data); \ -} while (0) - -#define fifo_move(dest, src) \ -do { \ - typeof(*((dest)->data)) _t; \ - while (!fifo_full(dest) && \ - fifo_pop(src, _t)) \ - fifo_push(dest, _t); \ -} while (0) - -#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) -#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) - -#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) -#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) - -#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) -#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) - -#define fifo_entry_idx_abs(fifo, p) \ - ((((p) >= &fifo_peek_front(fifo) \ - ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ - (((p) - (fifo)->data))) - -#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) -#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask]) - -#define fifo_push_back_ref(f) \ - (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) - -#define fifo_push_front_ref(f) \ - (fifo_full((f)) ? 
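These macros implement a power-of-two ring buffer whose front and back are free-running counters, masked only on access. A self-contained usage sketch (demo_item and fifo_demo are illustrative names; fifo_push()/fifo_pop() are the aliases defined below):

    struct demo_item { u64 seq; };

    static int fifo_demo(void)
    {
            DECLARE_FIFO(struct demo_item, pending);
            struct demo_item it;

            if (!init_fifo(&pending, 128, GFP_KERNEL))
                    return -ENOMEM;

            fifo_push(&pending, (struct demo_item) { .seq = 1 });

            while (fifo_pop(&pending, it))
                    pr_info("seq %llu\n", it.seq);

            free_fifo(&pending);
            return 0;
    }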
NULL : &(f)->data[--(f)->front & (f)->mask]) - -#define fifo_push_back(fifo, new) \ -({ \ - typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ - if (_r) \ - *_r = (new); \ - _r != NULL; \ -}) - -#define fifo_push_front(fifo, new) \ -({ \ - typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ - if (_r) \ - *_r = (new); \ - _r != NULL; \ -}) - -#define fifo_pop_front(fifo, i) \ -({ \ - bool _r = !fifo_empty((fifo)); \ - if (_r) \ - (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ - _r; \ -}) - -#define fifo_pop_back(fifo, i) \ -({ \ - bool _r = !fifo_empty((fifo)); \ - if (_r) \ - (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ - _r; \ -}) - -#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) -#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) -#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) -#define fifo_peek(fifo) fifo_peek_front(fifo) - -#define fifo_for_each_entry(_entry, _fifo, _iter) \ - for (typecheck(typeof((_fifo)->front), _iter), \ - (_iter) = (_fifo)->front; \ - ((_iter != (_fifo)->back) && \ - (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - (_iter)++) - -#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ - for (typecheck(typeof((_fifo)->front), _iter), \ - (_iter) = (_fifo)->front; \ - ((_iter != (_fifo)->back) && \ - (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - (_iter)++) - -#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c deleted file mode 100644 index 1c54b9b5bd69..000000000000 --- a/fs/bcachefs/fs-io-buffered.c +++ /dev/null @@ -1,1109 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "fs-io.h" -#include "fs-io-buffered.h" -#include "fs-io-direct.h" -#include "fs-io-pagecache.h" -#include "io_read.h" -#include "io_write.h" - -#include <linux/backing-dev.h> -#include <linux/pagemap.h> -#include <linux/writeback.h> - -static inline bool bio_full(struct bio *bio, unsigned len) -{ - if (bio->bi_vcnt >= bio->bi_max_vecs) - return true; - if (bio->bi_iter.bi_size > UINT_MAX - len) - return true; - return false; -} - -/* readpage(s): */ - -static void bch2_readpages_end_io(struct bio *bio) -{ - struct folio_iter fi; - - bio_for_each_folio_all(fi, bio) - folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK); - - bio_put(bio); -} - -struct readpages_iter { - struct address_space *mapping; - unsigned idx; - folios folios; -}; - -static int readpages_iter_init(struct readpages_iter *iter, - struct readahead_control *ractl) -{ - struct folio *folio; - - *iter = (struct readpages_iter) { ractl->mapping }; - - while ((folio = __readahead_folio(ractl))) { - if (!bch2_folio_create(folio, GFP_KERNEL) || - darray_push(&iter->folios, folio)) { - bch2_folio_release(folio); - ractl->_nr_pages += folio_nr_pages(folio); - ractl->_index -= folio_nr_pages(folio); - return iter->folios.nr ? 
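    /*
     * (the folio we failed on was handed back to the readahead core just
     * above, which unlocks and releases leftover folios itself; succeed
     * with whatever was gathered, and only error out if that is nothing)
     */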
0 : -ENOMEM; - } - - folio_put(folio); - } - - return 0; -} - -static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) -{ - if (iter->idx >= iter->folios.nr) - return NULL; - return iter->folios.data[iter->idx]; -} - -static inline void readpage_iter_advance(struct readpages_iter *iter) -{ - iter->idx++; -} - -static bool extent_partial_reads_expensive(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; - - bkey_for_each_crc(k.k, ptrs, crc, i) - if (crc.csum_type || crc.compression_type) - return true; - return false; -} - -static int readpage_bio_extend(struct btree_trans *trans, - struct readpages_iter *iter, - struct bio *bio, - unsigned sectors_this_extent, - bool get_more) -{ - /* Don't hold btree locks while allocating memory: */ - bch2_trans_unlock(trans); - - while (bio_sectors(bio) < sectors_this_extent && - bio->bi_vcnt < bio->bi_max_vecs) { - struct folio *folio = readpage_iter_peek(iter); - int ret; - - if (folio) { - readpage_iter_advance(iter); - } else { - pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; - - if (!get_more) - break; - - unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio); - - if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping)) - break; - - unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS); - - /* ensure proper alignment */ - order = min(order, __ffs(folio_offset|BIT(31))); - - folio = xa_load(&iter->mapping->i_pages, folio_offset); - if (folio && !xa_is_value(folio)) - break; - - folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order); - if (!folio) - break; - - if (!__bch2_folio_create(folio, GFP_KERNEL)) { - folio_put(folio); - break; - } - - ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); - if (ret) { - __bch2_folio_release(folio); - folio_put(folio); - break; - } - - folio_put(folio); - } - - BUG_ON(folio_sector(folio) != bio_end_sector(bio)); - - BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); - } - - return bch2_trans_relock(trans); -} - -static void bchfs_read(struct btree_trans *trans, - struct bch_read_bio *rbio, - subvol_inum inum, - struct readpages_iter *readpages_iter) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_buf sk; - int flags = BCH_READ_retry_if_stale| - BCH_READ_may_promote; - int ret = 0; - - rbio->subvol = inum.subvol; - - bch2_bkey_buf_init(&sk); - bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, rbio->bio.bi_iter.bi_sector), - BTREE_ITER_slots); - while (1) { - struct bkey_s_c k; - unsigned bytes, sectors; - s64 offset_into_extent; - enum btree_id data_btree = BTREE_ID_extents; - - bch2_trans_begin(trans); - - u32 snapshot; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - - bch2_btree_iter_set_pos(trans, &iter, - POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - ret = bch2_read_indirect_extent(trans, &data_btree, - &offset_into_extent, &sk); - if (ret) - goto err; - - k = bkey_i_to_s_c(sk.k); - - sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - - if 
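    /*
     * In the readahead path, try to extend the bio over the rest of the
     * extent: checksummed or compressed extents must be read (and
     * verified) in full, so a partial read would be wasted work.
     */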
(readpages_iter) { - ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, - extent_partial_reads_expensive(k)); - if (ret) - goto err; - } - - bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; - swap(rbio->bio.bi_iter.bi_size, bytes); - - if (rbio->bio.bi_iter.bi_size == bytes) - flags |= BCH_READ_last_fragment; - - bch2_bio_page_state_set(&rbio->bio, k); - - bch2_read_extent(trans, rbio, iter.pos, - data_btree, k, offset_into_extent, flags); - /* - * Careful there's a landmine here if bch2_read_extent() ever - * starts returning transaction restarts here. - * - * We've changed rbio->bi_iter.bi_size to be "bytes we can read - * from this extent" with the swap call, and we restore it - * below. That restore needs to come before checking for - * errors. - * - * But unlike __bch2_read(), we use the rbio bvec iter, not one - * on the stack, so we can't do the restore right after the - * bch2_read_extent() call: we don't own that iterator anymore - * if BCH_READ_last_fragment is set, since we may have submitted - * that rbio instead of cloning it. - */ - - if (flags & BCH_READ_last_fragment) - break; - - swap(rbio->bio.bi_iter.bi_size, bytes); - bio_advance(&rbio->bio, bytes); -err: - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) { - struct printbuf buf = PRINTBUF; - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); - prt_printf(&buf, "read error %i from btree lookup", ret); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - - rbio->bio.bi_status = BLK_STS_IOERR; - bio_endio(&rbio->bio); - } - - bch2_bkey_buf_exit(&sk, c); -} - -void bch2_readahead(struct readahead_control *ractl) -{ - struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; - struct folio *folio; - struct readpages_iter readpages_iter; - struct blk_plug plug; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - int ret = readpages_iter_init(&readpages_iter, ractl); - if (ret) - return; - - /* - * Besides being a general performance optimization, plugging helps with - * avoiding btree transaction srcu warnings - submitting a bio can - * block, and we don't want todo that with the transaction locked. - * - * However, plugged bios are submitted when we schedule; we ideally - * would have our own scheduler hook to call unlock_long() before - * scheduling. 
- */ - blk_start_plug(&plug); - bch2_pagecache_add_get(inode); - - struct btree_trans *trans = bch2_trans_get(c); - while ((folio = readpage_iter_peek(&readpages_iter))) { - unsigned n = min_t(unsigned, - readpages_iter.folios.nr - - readpages_iter.idx, - BIO_MAX_VECS); - struct bch_read_bio *rbio = - rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, - GFP_KERNEL, &c->bio_read), - c, - opts, - bch2_readpages_end_io); - - readpage_iter_advance(&readpages_iter); - - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - bchfs_read(trans, rbio, inode_inum(inode), - &readpages_iter); - bch2_trans_unlock(trans); - } - bch2_trans_put(trans); - - bch2_pagecache_add_put(inode); - blk_finish_plug(&plug); - darray_exit(&readpages_iter.folios); -} - -static void bch2_read_single_folio_end_io(struct bio *bio) -{ - complete(bio->bi_private); -} - -int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_read_bio *rbio; - struct bch_io_opts opts; - struct blk_plug plug; - int ret; - DECLARE_COMPLETION_ONSTACK(done); - - BUG_ON(folio_test_uptodate(folio)); - BUG_ON(folio_test_dirty(folio)); - - if (!bch2_folio_create(folio, GFP_KERNEL)) - return -ENOMEM; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - c, - opts, - bch2_read_single_folio_end_io); - rbio->bio.bi_private = &done; - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - blk_start_plug(&plug); - bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); - blk_finish_plug(&plug); - wait_for_completion(&done); - - ret = blk_status_to_errno(rbio->bio.bi_status); - bio_put(&rbio->bio); - - if (ret < 0) - return ret; - - folio_mark_uptodate(folio); - return 0; -} - -int bch2_read_folio(struct file *file, struct folio *folio) -{ - int ret; - - ret = bch2_read_single_folio(folio, folio->mapping); - folio_unlock(folio); - return bch2_err_class(ret); -} - -/* writepages: */ - -struct bch_writepage_io { - struct bch_inode_info *inode; - - /* must be last: */ - struct bch_write_op op; -}; - -struct bch_writepage_state { - struct bch_writepage_io *io; - struct bch_io_opts opts; - struct bch_folio_sector *tmp; - unsigned tmp_sectors; - struct blk_plug plug; -}; - -/* - * Determine when a writepage io is full. We have to limit writepage bios to a - * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to - * what the bounce path in bch2_write_extent() can handle. In theory we could - * loosen this restriction for non-bounce I/O, but we don't have that context - * here. Ideally, we can up this limit and make it configurable in the future - * when the bounce path can be enhanced to accommodate larger source bios. 
- */ -static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len) -{ - struct bio *bio = &io->op.wbio.bio; - return bio_full(bio, len) || - (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE); -} - -static void bch2_writepage_io_done(struct bch_write_op *op) -{ - struct bch_writepage_io *io = - container_of(op, struct bch_writepage_io, op); - struct bch_fs *c = io->op.c; - struct bio *bio = &io->op.wbio.bio; - struct folio_iter fi; - unsigned i; - - if (io->op.error) { - set_bit(EI_INODE_ERROR, &io->inode->ei_flags); - - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - - mapping_set_error(fi.folio->mapping, -EIO); - - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); - for (i = 0; i < folio_sectors(fi.folio); i++) - s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); - } - } - - if (io->op.flags & BCH_WRITE_wrote_data_inline) { - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); - for (i = 0; i < folio_sectors(fi.folio); i++) - s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); - } - } - - /* - * racing with fallocate can cause us to add fewer sectors than - * expected - but we shouldn't add more sectors than expected: - */ - WARN_ON_ONCE(io->op.i_sectors_delta > 0); - - /* - * (error (due to going RO) halfway through a page can screw that up - * slightly) - * XXX wtf? - BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); - */ - - /* - * The writeback flag is effectively our ref on the inode - - * fixup i_blocks before calling folio_end_writeback: - */ - bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s = __bch2_folio(fi.folio); - - if (atomic_dec_and_test(&s->write_count)) - folio_end_writeback(fi.folio); - } - - bio_put(&io->op.wbio.bio); -} - -static void bch2_writepage_do_io(struct bch_writepage_state *w) -{ - struct bch_writepage_io *io = w->io; - - w->io = NULL; - closure_call(&io->op.cl, bch2_write, NULL, NULL); -} - -/* - * Get a bch_writepage_io and add @page to it - appending to an existing one if - * possible, else allocating a new one: - */ -static void bch2_writepage_io_alloc(struct bch_fs *c, - struct writeback_control *wbc, - struct bch_writepage_state *w, - struct bch_inode_info *inode, - u64 sector, - unsigned nr_replicas) -{ - struct bch_write_op *op; - - w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, - REQ_OP_WRITE, - GFP_KERNEL, - &c->writepage_bioset), - struct bch_writepage_io, op.wbio.bio); - - w->io->inode = inode; - op = &w->io->op; - bch2_write_op_init(op, c, w->opts); - op->target = w->opts.foreground_target; - op->nr_replicas = nr_replicas; - op->res.nr_replicas = nr_replicas; - op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->subvol = inode->ei_inum.subvol; - op->pos = POS(inode->v.i_ino, sector); - op->end_io = bch2_writepage_io_done; - op->devs_need_flush = &inode->ei_devs_need_flush; - op->wbio.bio.bi_iter.bi_sector = sector; - op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); -} - -static int __bch2_writepage(struct folio *folio, - struct writeback_control *wbc, - void *data) -{ - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_writepage_state *w = data; - struct bch_folio *s; - unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; - loff_t i_size = i_size_read(&inode->v); - int ret; - - EBUG_ON(!folio_test_uptodate(folio)); - - /* Is the folio fully inside i_size? 
*/ - if (folio_end_pos(folio) <= i_size) - goto do_io; - - /* Is the folio fully outside i_size? (truncate in progress) */ - if (folio_pos(folio) >= i_size) { - folio_unlock(folio); - return 0; - } - - /* - * The folio straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the folio size. For a file that is not a multiple of - * the folio size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." - */ - folio_zero_segment(folio, - i_size - folio_pos(folio), - folio_size(folio)); -do_io: - f_sectors = folio_sectors(folio); - s = bch2_folio(folio); - - if (f_sectors > w->tmp_sectors) { - kfree(w->tmp); - w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL); - w->tmp_sectors = f_sectors; - } - - /* - * Things get really hairy with errors during writeback: - */ - ret = bch2_get_folio_disk_reservation(c, inode, folio, false); - BUG_ON(ret); - - /* Before unlocking the page, get copy of reservations: */ - spin_lock(&s->lock); - memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); - - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; - - nr_replicas_this_write = - min_t(unsigned, nr_replicas_this_write, - s->s[i].nr_replicas + - s->s[i].replicas_reserved); - } - - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; - - s->s[i].nr_replicas = w->opts.compression - ? 0 : nr_replicas_this_write; - - s->s[i].replicas_reserved = 0; - bch2_folio_sector_set(folio, s, i, SECTOR_allocated); - } - spin_unlock(&s->lock); - - BUG_ON(atomic_read(&s->write_count)); - atomic_set(&s->write_count, 1); - - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - - folio_unlock(folio); - - offset = 0; - while (1) { - unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; - u64 sector; - - while (offset < f_sectors && - w->tmp[offset].state < SECTOR_dirty) - offset++; - - if (offset == f_sectors) - break; - - while (offset + sectors < f_sectors && - w->tmp[offset + sectors].state >= SECTOR_dirty) { - reserved_sectors += w->tmp[offset + sectors].replicas_reserved; - dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; - sectors++; - } - BUG_ON(!sectors); - - sector = folio_sector(folio) + offset; - - if (w->io && - (w->io->op.res.nr_replicas != nr_replicas_this_write || - bch_io_full(w->io, sectors << 9) || - bio_end_sector(&w->io->op.wbio.bio) != sector)) - bch2_writepage_do_io(w); - - if (!w->io) - bch2_writepage_io_alloc(c, wbc, w, inode, sector, - nr_replicas_this_write); - - atomic_inc(&s->write_count); - - BUG_ON(inode != w->io->inode); - BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, - sectors << 9, offset << 9)); - - w->io->op.res.sectors += reserved_sectors; - w->io->op.i_sectors_delta -= dirty_sectors; - w->io->op.new_i_size = i_size; - - offset += sectors; - } - - if (atomic_dec_and_test(&s->write_count)) - folio_end_writeback(folio); - - return 0; -} - -int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) -{ - struct bch_fs *c = mapping->host->i_sb->s_fs_info; - struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL); - - bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode); - - blk_start_plug(&w->plug); - int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w); - if (w->io) - bch2_writepage_do_io(w); - blk_finish_plug(&w->plug); - kfree(w->tmp); - 
kfree(w); - return bch2_err_class(ret); -} - -/* buffered writes: */ - -int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation *res; - struct folio *folio; - unsigned offset; - int ret = -ENOMEM; - - res = kmalloc(sizeof(*res), GFP_KERNEL); - if (!res) - return -ENOMEM; - - bch2_folio_reservation_init(c, inode, res); - *fsdata = res; - - bch2_pagecache_add_get(inode); - - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, - FGP_WRITEBEGIN | fgf_set_order(len), - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) - goto err_unlock; - - offset = pos - folio_pos(folio); - len = min_t(size_t, len, folio_end_pos(folio) - pos); - - if (folio_test_uptodate(folio)) - goto out; - - /* If we're writing entire folio, don't need to read it in first: */ - if (!offset && len == folio_size(folio)) - goto out; - - if (!offset && pos + len >= inode->v.i_size) { - folio_zero_segment(folio, len, folio_size(folio)); - flush_dcache_folio(folio); - goto out; - } - - if (folio_pos(folio) >= inode->v.i_size) { - folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); - flush_dcache_folio(folio); - goto out; - } -readpage: - ret = bch2_read_single_folio(folio, mapping); - if (ret) - goto err; -out: - ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); - if (ret) - goto err; - - ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); - if (ret) { - if (!folio_test_uptodate(folio)) { - /* - * If the folio hasn't been read in, we won't know if we - * actually need a reservation - we don't actually need - * to read here, we just need to check if the folio is - * fully backed by uncompressed data: - */ - goto readpage; - } - - goto err; - } - - *foliop = folio; - return 0; -err: - folio_unlock(folio); - folio_put(folio); -err_unlock: - bch2_pagecache_add_put(inode); - kfree(res); - *fsdata = NULL; - return bch2_err_class(ret); -} - -int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation *res = fsdata; - unsigned offset = pos - folio_pos(folio); - - lockdep_assert_held(&inode->v.i_rwsem); - BUG_ON(offset + copied > folio_size(folio)); - - if (unlikely(copied < len && !folio_test_uptodate(folio))) { - /* - * The folio needs to be read in, but that would destroy - * our partial write - simplest thing is to just force - * userspace to redo the write: - */ - folio_zero_range(folio, 0, folio_size(folio)); - flush_dcache_folio(folio); - copied = 0; - } - - spin_lock(&inode->v.i_lock); - if (pos + copied > inode->v.i_size) - i_size_write(&inode->v, pos + copied); - spin_unlock(&inode->v.i_lock); - - if (copied) { - if (!folio_test_uptodate(folio)) - folio_mark_uptodate(folio); - - bch2_set_folio_dirty(c, inode, folio, res, offset, copied); - - inode->ei_last_dirtied = (unsigned long) current; - } - - folio_unlock(folio); - folio_put(folio); - bch2_pagecache_add_put(inode); - - bch2_folio_reservation_put(c, inode, res); - kfree(res); - - return copied; -} - -static noinline void folios_trunc(folios *fs, struct folio **fi) -{ - while (fs->data + fs->nr > fi) { - struct folio *f = darray_pop(fs); - - folio_unlock(f); - 
folio_put(f); - } -} - -static int __bch2_buffered_write(struct bch_inode_info *inode, - struct address_space *mapping, - struct iov_iter *iter, - loff_t pos, unsigned len) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation res; - folios fs; - struct folio *f; - unsigned copied = 0, f_offset, f_copied; - u64 end = pos + len, f_pos, f_len; - loff_t last_folio_pos = inode->v.i_size; - int ret = 0; - - BUG_ON(!len); - - bch2_folio_reservation_init(c, inode, &res); - darray_init(&fs); - - ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, - FGP_WRITEBEGIN | fgf_set_order(len), - mapping_gfp_mask(mapping), &fs); - if (ret) - goto out; - - BUG_ON(!fs.nr); - - f = darray_first(fs); - if (pos != folio_pos(f) && !folio_test_uptodate(f)) { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - - f = darray_last(fs); - end = min(end, folio_end_pos(f)); - last_folio_pos = folio_pos(f); - if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { - if (end >= inode->v.i_size) { - folio_zero_range(f, 0, folio_size(f)); - } else { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - } - - ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr); - if (ret) - goto out; - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(fs)); - darray_for_each(fs, fi) { - ssize_t f_reserved; - - f = *fi; - f_len = min(end, folio_end_pos(f)) - f_pos; - f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len); - - if (unlikely(f_reserved != f_len)) { - if (f_reserved < 0) { - if (f == darray_first(fs)) { - ret = f_reserved; - goto out; - } - - folios_trunc(&fs, fi); - end = min(end, folio_end_pos(darray_last(fs))); - } else { - if (!folio_test_uptodate(f)) { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - - folios_trunc(&fs, fi + 1); - end = f_pos + f_reserved; - } - - break; - } - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - if (mapping_writably_mapped(mapping)) - darray_for_each(fs, fi) - flush_dcache_folio(*fi); - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(fs)); - darray_for_each(fs, fi) { - f = *fi; - f_len = min(end, folio_end_pos(f)) - f_pos; - f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter); - if (!f_copied) { - folios_trunc(&fs, fi); - break; - } - - if (!folio_test_uptodate(f) && - f_copied != folio_size(f) && - pos + copied + f_copied < inode->v.i_size) { - iov_iter_revert(iter, f_copied); - folio_zero_range(f, 0, folio_size(f)); - folios_trunc(&fs, fi); - break; - } - - flush_dcache_folio(f); - copied += f_copied; - - if (f_copied != f_len) { - folios_trunc(&fs, fi + 1); - break; - } - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - if (!copied) - goto out; - - end = pos + copied; - - spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) - i_size_write(&inode->v, end); - spin_unlock(&inode->v.i_lock); - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(fs)); - darray_for_each(fs, fi) { - f = *fi; - f_len = min(end, folio_end_pos(f)) - f_pos; - - if (!folio_test_uptodate(f)) - folio_mark_uptodate(f); - - bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - inode->ei_last_dirtied = (unsigned long) current; -out: - darray_for_each(fs, fi) { - folio_unlock(*fi); - folio_put(*fi); - } - - /* - * If the last folio added to the mapping starts beyond current EOF, we - * performed a short write but left around at least one post-EOF folio. 
- * Clean up the mapping before we return. - */ - if (last_folio_pos >= inode->v.i_size) - truncate_pagecache(&inode->v, inode->v.i_size); - - darray_exit(&fs); - bch2_folio_reservation_put(c, inode, &res); - - return copied ?: ret; -} - -static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos = iocb->ki_pos; - ssize_t written = 0; - int ret = 0; - - bch2_pagecache_add_get(inode); - - do { - unsigned offset = pos & (PAGE_SIZE - 1); - unsigned bytes = iov_iter_count(iter); -again: - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. - */ - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { - bytes = min_t(unsigned long, iov_iter_count(iter), - PAGE_SIZE - offset); - - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { - ret = -EFAULT; - break; - } - } - - if (unlikely(fatal_signal_pending(current))) { - ret = -EINTR; - break; - } - - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); - if (unlikely(ret < 0)) - break; - - cond_resched(); - - if (unlikely(ret == 0)) { - /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. - */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(iter)); - goto again; - } - pos += ret; - written += ret; - ret = 0; - - balance_dirty_pages_ratelimited(mapping); - } while (iov_iter_count(iter)); - - bch2_pagecache_add_put(inode); - - return written ? 
written : ret; -} - -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - ssize_t ret; - - if (iocb->ki_flags & IOCB_DIRECT) { - ret = bch2_direct_write(iocb, from); - goto out; - } - - inode_lock(&inode->v); - - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs(file); - if (ret) - goto unlock; - - ret = file_update_time(file); - if (ret) - goto unlock; - - ret = bch2_buffered_write(iocb, from); - if (likely(ret > 0)) - iocb->ki_pos += ret; -unlock: - inode_unlock(&inode->v); - - if (ret > 0) - ret = generic_write_sync(iocb, ret); -out: - return bch2_err_class(ret); -} - -void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) -{ - bioset_exit(&c->writepage_bioset); -} - -int bch2_fs_fs_io_buffered_init(struct bch_fs *c) -{ - if (bioset_init(&c->writepage_bioset, - 4, offsetof(struct bch_writepage_io, op.wbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_writepage_bioset_init; - - return 0; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h deleted file mode 100644 index 14de91c27656..000000000000 --- a/fs/bcachefs/fs-io-buffered.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IO_BUFFERED_H -#define _BCACHEFS_FS_IO_BUFFERED_H - -#ifndef NO_BCACHEFS_FS - -int bch2_read_single_folio(struct folio *, struct address_space *); -int bch2_read_folio(struct file *, struct folio *); - -int bch2_writepages(struct address_space *, struct writeback_control *); -void bch2_readahead(struct readahead_control *); - -int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos, - unsigned len, struct folio **, void **); -int bch2_write_end(const struct kiocb *, struct address_space *, loff_t, - unsigned len, unsigned copied, struct folio *, void *); - -ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); - -void bch2_fs_fs_io_buffered_exit(struct bch_fs *); -int bch2_fs_fs_io_buffered_init(struct bch_fs *); -#else -static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {} -static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; } -#endif - -#endif /* _BCACHEFS_FS_IO_BUFFERED_H */ diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c deleted file mode 100644 index 1f5154d9676b..000000000000 --- a/fs/bcachefs/fs-io-direct.c +++ /dev/null @@ -1,704 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "enumerated_ref.h" -#include "fs.h" -#include "fs-io.h" -#include "fs-io-direct.h" -#include "fs-io-pagecache.h" -#include "io_read.h" -#include "io_write.h" - -#include <linux/kthread.h> -#include <linux/pagemap.h> -#include <linux/prefetch.h> -#include <linux/task_io_accounting_ops.h> - -/* O_DIRECT reads */ - -struct dio_read { - struct closure cl; - struct kiocb *req; - long ret; - bool should_dirty; - struct bch_read_bio rbio; -}; - -static void bio_check_or_release(struct bio *bio, bool check_dirty) -{ - if (check_dirty) { - bio_check_pages_dirty(bio); - } else { - bio_release_pages(bio, false); - bio_put(bio); - } -} - -static CLOSURE_CALLBACK(bch2_dio_read_complete) -{ - closure_type(dio, struct dio_read, cl); - - dio->req->ki_complete(dio->req, dio->ret); - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); -} - -static void bch2_direct_IO_read_endio(struct bio *bio) -{ - struct dio_read 
*dio = bio->bi_private; - - if (bio->bi_status) - dio->ret = blk_status_to_errno(bio->bi_status); - - closure_put(&dio->cl); -} - -static void bch2_direct_IO_read_split_endio(struct bio *bio) -{ - struct dio_read *dio = bio->bi_private; - bool should_dirty = dio->should_dirty; - - bch2_direct_IO_read_endio(bio); - bio_check_or_release(bio, should_dirty); -} - -static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) -{ - struct file *file = req->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; - struct dio_read *dio; - struct bio *bio; - struct blk_plug plug; - loff_t offset = req->ki_pos; - bool sync = is_sync_kiocb(req); - bool split = false; - size_t shorten; - ssize_t ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - /* bios must be 512 byte aligned: */ - if ((offset|iter->count) & (SECTOR_SIZE - 1)) - return -EINVAL; - - ret = min_t(loff_t, iter->count, - max_t(loff_t, 0, i_size_read(&inode->v) - offset)); - - if (!ret) - return ret; - - shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); - if (shorten >= iter->count) - shorten = 0; - iter->count -= shorten; - - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->dio_read_bioset); - - dio = container_of(bio, struct dio_read, rbio.bio); - closure_init(&dio->cl, NULL); - - /* - * this is a _really_ horrible hack just to avoid an atomic sub at the - * end: - */ - if (!sync) { - set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER - - CLOSURE_RUNNING + - CLOSURE_DESTRUCTOR); - } else { - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER + 1); - dio->cl.closure_get_happened = true; - } - - dio->req = req; - dio->ret = ret; - /* - * This is one of the sketchier things I've encountered: we have to skip - * the dirtying of requests that are internal from the kernel (i.e. from - * loopback), because we'll deadlock on page_lock. - */ - dio->should_dirty = iter_is_iovec(iter); - - blk_start_plug(&plug); - - goto start; - while (iter->count) { - split = true; - - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->bio_read); -start: - bio->bi_opf = REQ_OP_READ|REQ_SYNC; - bio->bi_iter.bi_sector = offset >> 9; - bio->bi_private = dio; - - ret = bio_iov_iter_get_pages(bio, iter); - if (ret < 0) { - /* XXX: fault inject this path */ - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - break; - } - - offset += bio->bi_iter.bi_size; - - if (dio->should_dirty) - bio_set_pages_dirty(bio); - - if (iter->count) - closure_get(&dio->cl); - - struct bch_read_bio *rbio = - rbio_init(bio, - c, - opts, - split - ? 
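    /*
     * Split bios drop their page refs in their own endio; the first bio
     * is embedded in the dio and is only released once the whole request
     * completes (see bch2_dio_read_complete() and the sync path below).
     */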
bch2_direct_IO_read_split_endio - : bch2_direct_IO_read_endio); - - bch2_read(c, rbio, inode_inum(inode)); - } - - blk_finish_plug(&plug); - - iter->count += shorten; - - if (sync) { - closure_sync(&dio->cl); - closure_debug_destroy(&dio->cl); - ret = dio->ret; - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); - return ret; - } else { - return -EIOCBQUEUED; - } -} - -ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct address_space *mapping = file->f_mapping; - size_t count = iov_iter_count(iter); - ssize_t ret = 0; - - if (!count) - return 0; /* skip atime */ - - if (iocb->ki_flags & IOCB_DIRECT) { - struct blk_plug plug; - - if (unlikely(mapping->nrpages)) { - ret = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret < 0) - goto out; - } - - file_accessed(file); - - blk_start_plug(&plug); - ret = bch2_direct_IO_read(iocb, iter); - blk_finish_plug(&plug); - - if (ret >= 0) - iocb->ki_pos += ret; - } else { - bch2_pagecache_add_get(inode); - ret = filemap_read(iocb, iter, ret); - bch2_pagecache_add_put(inode); - } -out: - return bch2_err_class(ret); -} - -/* O_DIRECT writes */ - -struct dio_write { - struct kiocb *req; - struct address_space *mapping; - struct bch_inode_info *inode; - struct mm_struct *mm; - const struct iovec *iov; - unsigned loop:1, - extending:1, - sync:1, - flush:1; - struct quota_res quota_res; - u64 written; - - struct iov_iter iter; - struct iovec inline_vecs[2]; - - /* must be last: */ - struct bch_write_op op; -}; - -static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, - u64 offset, u64 size, - unsigned nr_replicas, bool compressed) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - u64 end = offset + size; - u32 snapshot; - bool ret = true; - int err; -retry: - bch2_trans_begin(trans); - - err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (err) - goto err; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_slots, k, err) { - if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) - break; - - if (k.k->p.snapshot != snapshot || - nr_replicas > bch2_bkey_replicas(c, k) || - (!compressed && bch2_bkey_sectors_compressed(k))) { - ret = false; - break; - } - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(err, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_put(trans); - - return err ? 
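/*
 * Note the shape of the retry loop above: it is the canonical btree
 * transaction pattern -- everything between bch2_trans_begin() and the
 * end of the scan must be safe to replay, since a transaction restart
 * invalidates iterator state.  Roughly:
 *
 *	retry:
 *		bch2_trans_begin(trans);
 *		...lookups, accumulating only into locals...
 *		if (bch2_err_matches(err, BCH_ERR_transaction_restart))
 *			goto retry;
 */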
false : ret; -} - -static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct bch_inode_info *inode = dio->inode; - struct bio *bio = &dio->op.wbio.bio; - - return bch2_check_range_allocated(c, inode_inum(inode), - dio->op.pos.offset, bio_sectors(bio), - dio->op.opts.data_replicas, - dio->op.opts.compression != 0); -} - -static void bch2_dio_write_loop_async(struct bch_write_op *); -static __always_inline long bch2_dio_write_done(struct dio_write *dio); - -/* - * We're going to return -EIOCBQUEUED, but we haven't finished consuming the - * iov_iter yet, so we need to stash a copy of the iovec: it might be on the - * caller's stack, we're not guaranteed that it will live for the duration of - * the IO: - */ -static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) -{ - struct iovec *iov = dio->inline_vecs; - - /* - * iov_iter has a single embedded iovec - nothing to do: - */ - if (iter_is_ubuf(&dio->iter)) - return 0; - - /* - * We don't currently handle non-iovec iov_iters here - return an error, - * and we'll fall back to doing the IO synchronously: - */ - if (!iter_is_iovec(&dio->iter)) - return -1; - - if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), - GFP_KERNEL); - if (unlikely(!iov)) - return -ENOMEM; - } - - memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); - dio->iter.__iov = iov; - return 0; -} - -static CLOSURE_CALLBACK(bch2_dio_write_flush_done) -{ - closure_type(dio, struct dio_write, op.cl); - struct bch_fs *c = dio->op.c; - - closure_debug_destroy(cl); - - dio->op.error = bch2_journal_error(&c->journal); - - bch2_dio_write_done(dio); -} - -static noinline void bch2_dio_write_flush(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct bch_inode_unpacked inode; - int ret; - - dio->flush = 0; - - closure_init(&dio->op.cl, NULL); - - if (!dio->op.error) { - ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); - if (ret) { - dio->op.error = ret; - } else { - bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, - &dio->op.cl); - bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); - } - } - - if (dio->sync) { - closure_sync(&dio->op.cl); - closure_debug_destroy(&dio->op.cl); - } else { - continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); - } -} - -static __always_inline long bch2_dio_write_done(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct bch_inode_info *inode = dio->inode; - bool sync = dio->sync; - long ret; - - if (unlikely(dio->flush)) { - bch2_dio_write_flush(dio); - if (!sync) - return -EIOCBQUEUED; - } - - bch2_pagecache_block_put(inode); - - kfree(dio->iov); - - ret = dio->op.error ?: ((long) dio->written << 9); - bio_put(&dio->op.wbio.bio); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); - - /* inode->i_dio_count is our ref on inode and thus bch_fs */ - inode_dio_end(&inode->v); - - if (ret < 0) - ret = bch2_err_class(ret); - - if (!sync) { - req->ki_complete(req, ret); - ret = -EIOCBQUEUED; - } - return ret; -} - -static __always_inline void bch2_dio_write_end(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct bch_inode_info *inode = dio->inode; - struct bio *bio = &dio->op.wbio.bio; - - req->ki_pos += (u64) dio->op.written << 9; - dio->written += dio->op.written; - - if (dio->extending) { - spin_lock(&inode->v.i_lock); - if (req->ki_pos > 
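/*
 * i_size_write() must be serialized by the writer, hence i_lock: on
 * 32-bit SMP it bumps a seqcount so i_size_read() never observes a torn
 * 64-bit value.  The compare before writing also prevents a racing,
 * further-extending write from moving i_size backwards.
 */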
inode->v.i_size) - i_size_write(&inode->v, req->ki_pos); - spin_unlock(&inode->v.i_lock); - } - - if (dio->op.i_sectors_delta || dio->quota_res.sectors) { - mutex_lock(&inode->ei_quota_lock); - __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); - __bch2_quota_reservation_put(c, inode, &dio->quota_res); - mutex_unlock(&inode->ei_quota_lock); - } - - bio_release_pages(bio, false); - - if (unlikely(dio->op.error)) - set_bit(EI_INODE_ERROR, &inode->ei_flags); -} - -static __always_inline long bch2_dio_write_loop(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct address_space *mapping = dio->mapping; - struct bch_inode_info *inode = dio->inode; - struct bch_io_opts opts; - struct bio *bio = &dio->op.wbio.bio; - unsigned unaligned, iter_count; - bool sync = dio->sync, dropped_locks; - long ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - while (1) { - iter_count = dio->iter.count; - - EBUG_ON(current->faults_disabled_mapping); - current->faults_disabled_mapping = mapping; - - ret = bio_iov_iter_get_pages(bio, &dio->iter); - - dropped_locks = fdm_dropped_locks(); - - current->faults_disabled_mapping = NULL; - - /* - * If the fault handler returned an error but also signalled - * that it dropped & retook ei_pagecache_lock, we just need to - * re-shoot down the page cache and retry: - */ - if (dropped_locks && ret) - ret = 0; - - if (unlikely(ret < 0)) - goto err; - - if (unlikely(dropped_locks)) { - ret = bch2_write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter_count - 1); - if (unlikely(ret)) - goto err; - - if (!bio->bi_iter.bi_size) - continue; - } - - unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); - bio->bi_iter.bi_size -= unaligned; - iov_iter_revert(&dio->iter, unaligned); - - if (!bio->bi_iter.bi_size) { - /* - * bio_iov_iter_get_pages was only able to get < - * blocksize worth of pages: - */ - ret = -EFAULT; - goto err; - } - - bch2_write_op_init(&dio->op, c, opts); - dio->op.end_io = sync - ? 
NULL - : bch2_dio_write_loop_async; - dio->op.target = dio->op.opts.foreground_target; - dio->op.write_point = writepoint_hashed((unsigned long) current); - dio->op.nr_replicas = dio->op.opts.data_replicas; - dio->op.subvol = inode->ei_inum.subvol; - dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); - dio->op.devs_need_flush = &inode->ei_devs_need_flush; - - if (sync) - dio->op.flags |= BCH_WRITE_sync; - dio->op.flags |= BCH_WRITE_check_enospc; - - ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, - bio_sectors(bio), true); - if (unlikely(ret)) - goto err; - - ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), - dio->op.opts.data_replicas, 0); - if (unlikely(ret) && - !bch2_dio_write_check_allocated(dio)) - goto err; - - task_io_account_write(bio->bi_iter.bi_size); - - if (unlikely(dio->iter.count) && - !dio->sync && - !dio->loop && - bch2_dio_write_copy_iov(dio)) - dio->sync = sync = true; - - dio->loop = true; - closure_call(&dio->op.cl, bch2_write, NULL, NULL); - - if (!sync) - return -EIOCBQUEUED; - - bch2_dio_write_end(dio); - - if (likely(!dio->iter.count) || dio->op.error) - break; - - bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); - } -out: - return bch2_dio_write_done(dio); -err: - dio->op.error = ret; - - bio_release_pages(bio, false); - - bch2_quota_reservation_put(c, inode, &dio->quota_res); - goto out; -} - -static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) -{ - struct mm_struct *mm = dio->mm; - - bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); - - if (mm) - kthread_use_mm(mm); - bch2_dio_write_loop(dio); - if (mm) - kthread_unuse_mm(mm); -} - -static void bch2_dio_write_loop_async(struct bch_write_op *op) -{ - struct dio_write *dio = container_of(op, struct dio_write, op); - - bch2_dio_write_end(dio); - - if (likely(!dio->iter.count) || dio->op.error) - bch2_dio_write_done(dio); - else - bch2_dio_write_continue(dio); -} - -ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) -{ - struct file *file = req->ki_filp; - struct address_space *mapping = file->f_mapping; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct dio_write *dio; - struct bio *bio; - bool locked = true, extending; - ssize_t ret; - - prefetch(&c->opts); - prefetch((void *) &c->opts + 64); - prefetch(&inode->ei_inode); - prefetch((void *) &inode->ei_inode + 64); - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write)) - return -EROFS; - - inode_lock(&inode->v); - - ret = generic_write_checks(req, iter); - if (unlikely(ret <= 0)) - goto err_put_write_ref; - - ret = file_remove_privs(file); - if (unlikely(ret)) - goto err_put_write_ref; - - ret = file_update_time(file); - if (unlikely(ret)) - goto err_put_write_ref; - - if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) { - ret = -EINVAL; - goto err_put_write_ref; - } - - inode_dio_begin(&inode->v); - bch2_pagecache_block_get(inode); - - extending = req->ki_pos + iter->count > inode->v.i_size; - if (!extending) { - inode_unlock(&inode->v); - locked = false; - } - - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_WRITE | REQ_SYNC | REQ_IDLE, - GFP_KERNEL, - &c->dio_write_bioset); - dio = container_of(bio, struct dio_write, op.wbio.bio); - dio->req = req; - dio->mapping = mapping; - dio->inode = inode; - dio->mm = current->mm; - dio->iov = NULL; - dio->loop = false; - dio->extending = extending; - dio->sync = is_sync_kiocb(req) || extending; - dio->flush = 
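/*
 * O_DSYNC handling is deferred: the flag is recorded here and
 * bch2_dio_write_flush() later flushes only the journal sequence the
 * write landed in, instead of paying for a full journal flush per write.
 * For reference, iocb_is_dsync() is approximately (an assumption about
 * the generic helper, abbreviated here):
 *
 *	(iocb->ki_flags & IOCB_DSYNC) || IS_SYNC(file_inode(iocb->ki_filp))
 */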
iocb_is_dsync(req) && !c->opts.journal_flush_disabled; - dio->quota_res.sectors = 0; - dio->written = 0; - dio->iter = *iter; - dio->op.c = c; - - if (unlikely(mapping->nrpages)) { - ret = bch2_write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter->count - 1); - if (unlikely(ret)) - goto err_put_bio; - } - - ret = bch2_dio_write_loop(dio); -out: - if (locked) - inode_unlock(&inode->v); - return ret; -err_put_bio: - bch2_pagecache_block_put(inode); - bio_put(bio); - inode_dio_end(&inode->v); -err_put_write_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); - goto out; -} - -void bch2_fs_fs_io_direct_exit(struct bch_fs *c) -{ - bioset_exit(&c->dio_write_bioset); - bioset_exit(&c->dio_read_bioset); -} - -int bch2_fs_fs_io_direct_init(struct bch_fs *c) -{ - if (bioset_init(&c->dio_read_bioset, - 4, offsetof(struct dio_read, rbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_dio_read_bioset_init; - - if (bioset_init(&c->dio_write_bioset, - 4, offsetof(struct dio_write, op.wbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_dio_write_bioset_init; - - return 0; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h deleted file mode 100644 index 814621ec7f81..000000000000 --- a/fs/bcachefs/fs-io-direct.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IO_DIRECT_H -#define _BCACHEFS_FS_IO_DIRECT_H - -#ifndef NO_BCACHEFS_FS -ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *); -ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); - -void bch2_fs_fs_io_direct_exit(struct bch_fs *); -int bch2_fs_fs_io_direct_init(struct bch_fs *); -#else -static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {} -static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; } -#endif - -#endif /* _BCACHEFS_FS_IO_DIRECT_H */ diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c deleted file mode 100644 index c2cc405822f2..000000000000 --- a/fs/bcachefs/fs-io-pagecache.c +++ /dev/null @@ -1,827 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "btree_iter.h" -#include "extents.h" -#include "fs-io.h" -#include "fs-io-pagecache.h" -#include "subvolume.h" - -#include <linux/pagevec.h> -#include <linux/writeback.h> - -int bch2_filemap_get_contig_folios_d(struct address_space *mapping, - loff_t start, u64 end, - fgf_t fgp_flags, gfp_t gfp, - folios *fs) -{ - struct folio *f; - u64 pos = start; - int ret = 0; - - while (pos < end) { - if ((u64) pos >= (u64) start + (1ULL << 20)) - fgp_flags &= ~FGP_CREAT; - - ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL); - if (ret) - break; - - f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); - if (IS_ERR(f)) - break; - - BUG_ON(fs->nr && folio_pos(f) != pos); - - pos = folio_end_pos(f); - darray_push(fs, f); - } - - if (!fs->nr && !ret && (fgp_flags & FGP_CREAT)) - ret = -ENOMEM; - - return fs->nr ? 
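/*
 * Returns 0 as long as at least one folio was obtained, even when the
 * batch is short; callers loop until the range is covered.  A
 * hypothetical usage sketch (local names illustrative only):
 *
 *	folios fs = {};
 *	int ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
 *				FGP_LOCK|FGP_CREAT, GFP_KERNEL, &fs);
 *	if (!ret)
 *		darray_for_each(fs, fi)
 *			...operate on the contiguous, locked folios...
 */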
0 : ret; -} - -/* pagecache_block must be held */ -int bch2_write_invalidate_inode_pages_range(struct address_space *mapping, - loff_t start, loff_t end) -{ - int ret; - - /* - * XXX: the way this is currently implemented, we can spin if a process - * is continually redirtying a specific page - */ - do { - if (!mapping->nrpages) - return 0; - - ret = filemap_write_and_wait_range(mapping, start, end); - if (ret) - break; - - if (!mapping->nrpages) - return 0; - - ret = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, - end >> PAGE_SHIFT); - } while (ret == -EBUSY); - - return ret; -} - -#if 0 -/* Useful for debug tracing: */ -static const char * const bch2_folio_sector_states[] = { -#define x(n) #n, - BCH_FOLIO_SECTOR_STATE() -#undef x - NULL -}; -#endif - -static inline enum bch_folio_sector_state -folio_sector_dirty(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_unallocated: - return SECTOR_dirty; - case SECTOR_reserved: - return SECTOR_dirty_reserved; - default: - return state; - } -} - -static inline enum bch_folio_sector_state -folio_sector_undirty(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_dirty: - return SECTOR_unallocated; - case SECTOR_dirty_reserved: - return SECTOR_reserved; - default: - return state; - } -} - -static inline enum bch_folio_sector_state -folio_sector_reserve(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_unallocated: - return SECTOR_reserved; - case SECTOR_dirty: - return SECTOR_dirty_reserved; - default: - return state; - } -} - -/* for newly allocated folios: */ -struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) -{ - struct bch_folio *s; - - s = kzalloc(sizeof(*s) + - sizeof(struct bch_folio_sector) * - folio_sectors(folio), gfp); - if (!s) - return NULL; - - spin_lock_init(&s->lock); - folio_attach_private(folio, s); - return s; -} - -struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) -{ - return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); -} - -static unsigned bkey_to_sector_state(struct bkey_s_c k) -{ - if (bkey_extent_is_reservation(k)) - return SECTOR_reserved; - if (bkey_extent_is_allocation(k.k)) - return SECTOR_allocated; - return SECTOR_unallocated; -} - -static void __bch2_folio_set(struct folio *folio, - unsigned pg_offset, unsigned pg_len, - unsigned nr_ptrs, unsigned state) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, sectors = folio_sectors(folio); - - BUG_ON(pg_offset >= sectors); - BUG_ON(pg_offset + pg_len > sectors); - - spin_lock(&s->lock); - - for (i = pg_offset; i < pg_offset + pg_len; i++) { - s->s[i].nr_replicas = nr_ptrs; - bch2_folio_sector_set(folio, s, i, state); - } - - if (i == sectors) - s->uptodate = true; - - spin_unlock(&s->lock); -} - -/* - * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the - * extents btree: - */ -int bch2_folio_set(struct bch_fs *c, subvol_inum inum, - struct folio **fs, unsigned nr_folios) -{ - u64 offset = folio_sector(fs[0]); - bool need_set = false; - - for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) { - struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); - if (!s) - return -ENOMEM; - - need_set |= !s->uptodate; - } - - if (!need_set) - return 0; - - unsigned folio_idx = 0; - - return bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, - POS(inum.inum, offset), - POS(inum.inum, U64_MAX), - inum.subvol, BTREE_ITER_slots, k, ({ - unsigned nr_ptrs = 
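/*
 * Per-sector state follows a small lattice (see the
 * folio_sector_{dirty,undirty,reserve} helpers above):
 *
 *	unallocated -> dirty		(buffered write into a hole)
 *	reserved    -> dirty_reserved	(write into fallocated space)
 *	allocated   -> allocated	(overwrite, no accounting change)
 *
 * Here each extent key is mapped to the base state -- unallocated,
 * reserved or allocated -- for the folio sectors it covers.
 */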
bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); - - while (folio_idx < nr_folios) { - struct folio *folio = fs[folio_idx]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - - folio_start; - unsigned folio_len = min(k.k->p.offset, folio_end) - - folio_offset - folio_start; - - BUG_ON(k.k->p.offset < folio_start); - BUG_ON(bkey_start_offset(k.k) > folio_end); - - if (!bch2_folio(folio)->uptodate) - __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); - - if (k.k->p.offset < folio_end) - break; - folio_idx++; - } - - if (folio_idx == nr_folios) - break; - 0; - }))); -} - -void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) -{ - struct bvec_iter iter; - struct folio_vec fv; - unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v - ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); - - bio_for_each_folio(fv, bio, iter) - __bch2_folio_set(fv.fv_folio, - fv.fv_offset >> 9, - fv.fv_len >> 9, - nr_ptrs, state); -} - -void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, - u64 start, u64 end) -{ - pgoff_t index = start >> PAGE_SECTORS_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct folio_batch fbatch; - unsigned i, j; - - if (end <= start) - return; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(inode->v.i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; - - BUG_ON(end <= folio_start); - - folio_lock(folio); - s = bch2_folio(folio); - - if (s) { - spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) - s->s[j].nr_replicas = 0; - spin_unlock(&s->lock); - } - - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } -} - -int bch2_mark_pagecache_reserved(struct bch_inode_info *inode, - u64 *start, u64 end, - bool nonblocking) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - pgoff_t index = *start >> PAGE_SECTORS_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct folio_batch fbatch; - s64 i_sectors_delta = 0; - int ret = 0; - - if (end <= *start) - return 0; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(inode->v.i_mapping, - &index, end_index, &fbatch)) { - for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - - if (!nonblocking) - folio_lock(folio); - else if (!folio_trylock(folio)) { - folio_batch_release(&fbatch); - ret = -EAGAIN; - break; - } - - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - - BUG_ON(end <= folio_start); - - *start = min(end, folio_end); - - struct bch_folio *s = bch2_folio(folio); - if (s) { - unsigned folio_offset = max(*start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - - spin_lock(&s->lock); - for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) { - i_sectors_delta -= s->s[j].state == SECTOR_dirty; - bch2_folio_sector_set(folio, s, j, - folio_sector_reserve(s->s[j].state)); - } - spin_unlock(&s->lock); - } - - folio_unlock(folio); - } - 
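/*
 * The enclosing loop is the standard batched pagecache walk:
 * filemap_get_folios() takes references on up to a batch of folios
 * starting at *index, the batch is processed and released, and
 * cond_resched() keeps long ranges preemptible:
 *
 *	folio_batch_init(&fbatch);
 *	while (filemap_get_folios(mapping, &index, end_index, &fbatch)) {
 *		for (i = 0; i < folio_batch_count(&fbatch); i++)
 *			...per-folio work...
 *		folio_batch_release(&fbatch);
 *		cond_resched();
 *	}
 */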
folio_batch_release(&fbatch); - cond_resched(); - } - - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - return ret; -} - -static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, - unsigned nr_replicas) -{ - return max(0, (int) nr_replicas - - s->nr_replicas - - s->replicas_reserved); -} - -int bch2_get_folio_disk_reservation(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, bool check_enospc) -{ - struct bch_folio *s = bch2_folio_create(folio, 0); - unsigned nr_replicas = inode_nr_replicas(c, inode); - struct disk_reservation disk_res = { 0 }; - unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; - int ret; - - if (!s) - return -ENOMEM; - - for (i = 0; i < sectors; i++) - disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); - - if (!disk_res_sectors) - return 0; - - ret = bch2_disk_reservation_get(c, &disk_res, - disk_res_sectors, 1, - !check_enospc - ? BCH_DISK_RESERVATION_NOFAIL - : 0); - if (unlikely(ret)) - return ret; - - for (i = 0; i < sectors; i++) - s->s[i].replicas_reserved += - sectors_to_reserve(&s->s[i], nr_replicas); - - return 0; -} - -void bch2_folio_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch2_folio_reservation *res) -{ - bch2_disk_reservation_put(c, &res->disk); - bch2_quota_reservation_put(c, inode, &res->quota); -} - -static int __bch2_folio_reservation_get(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - size_t offset, size_t len, - bool partial) -{ - struct bch_folio *s = bch2_folio_create(folio, 0); - unsigned i, disk_sectors = 0, quota_sectors = 0; - struct disk_reservation disk_res = {}; - size_t reserved = len; - int ret; - - if (!s) - return -ENOMEM; - - BUG_ON(!s->uptodate); - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); - quota_sectors += s->s[i].state == SECTOR_unallocated; - } - - if (disk_sectors) { - ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors, - partial ? BCH_DISK_RESERVATION_PARTIAL : 0); - if (unlikely(ret)) - return ret; - - if (unlikely(disk_res.sectors != disk_sectors)) { - disk_sectors = quota_sectors = 0; - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); - if (disk_sectors > disk_res.sectors) { - /* - * Make sure to get a reservation that's - * aligned to the filesystem blocksize: - */ - unsigned reserved_offset = round_down(i << 9, block_bytes(c)); - reserved = clamp(reserved_offset, offset, offset + len) - offset; - - if (!reserved) { - bch2_disk_reservation_put(c, &disk_res); - return bch_err_throw(c, ENOSPC_disk_reservation); - } - break; - } - quota_sectors += s->s[i].state == SECTOR_unallocated; - } - } - } - - if (quota_sectors) { - ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true); - if (unlikely(ret)) { - bch2_disk_reservation_put(c, &disk_res); - return ret; - } - } - - res->disk.sectors += disk_res.sectors; - return partial ? 
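/*
 * In partial mode a short disk reservation is not an error: the number
 * of bytes actually covered is returned, clamped down to a filesystem
 * block boundary so the caller never dirties a block it holds no
 * reservation for.  E.g. with 4k blocks, a request for bytes 0..16383
 * that could only reserve two blocks returns 8192.
 */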
reserved : 0; -} - -int bch2_folio_reservation_get(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - size_t offset, size_t len) -{ - return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false); -} - -ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - size_t offset, size_t len) -{ - return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true); -} - -static void bch2_clear_folio_bits(struct folio *folio) -{ - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_folio *s = bch2_folio(folio); - struct disk_reservation disk_res = { 0 }; - int i, sectors = folio_sectors(folio), dirty_sectors = 0; - - if (!s) - return; - - EBUG_ON(!folio_test_locked(folio)); - EBUG_ON(folio_test_writeback(folio)); - - for (i = 0; i < sectors; i++) { - disk_res.sectors += s->s[i].replicas_reserved; - s->s[i].replicas_reserved = 0; - - dirty_sectors -= s->s[i].state == SECTOR_dirty; - bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); - } - - bch2_disk_reservation_put(c, &disk_res); - - bch2_i_sectors_acct(c, inode, NULL, dirty_sectors); - - bch2_folio_release(folio); -} - -void bch2_set_folio_dirty(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - unsigned offset, unsigned len) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, dirty_sectors = 0; - - WARN_ON((u64) folio_pos(folio) + offset + len > - round_up((u64) i_size_read(&inode->v), block_bytes(c))); - - BUG_ON(!s->uptodate); - - spin_lock(&s->lock); - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - unsigned sectors = sectors_to_reserve(&s->s[i], - res->disk.nr_replicas); - - /* - * This can happen if we race with the error path in - * bch2_writepage_io_done(): - */ - sectors = min_t(unsigned, sectors, res->disk.sectors); - - s->s[i].replicas_reserved += sectors; - res->disk.sectors -= sectors; - - dirty_sectors += s->s[i].state == SECTOR_unallocated; - - bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); - } - - spin_unlock(&s->lock); - - bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors); - - if (!folio_test_dirty(folio)) - filemap_dirty_folio(inode->v.i_mapping, folio); -} - -vm_fault_t bch2_page_fault(struct vm_fault *vmf) -{ - struct file *file = vmf->vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct address_space *fdm = faults_disabled_mapping(); - struct bch_inode_info *inode = file_bch_inode(file); - vm_fault_t ret; - - if (fdm == mapping) - return VM_FAULT_SIGBUS; - - /* Lock ordering: */ - if (fdm > mapping) { - struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); - - if (bch2_pagecache_add_tryget(inode)) - goto got_lock; - - bch2_pagecache_block_put(fdm_host); - - bch2_pagecache_add_get(inode); - bch2_pagecache_add_put(inode); - - bch2_pagecache_block_get(fdm_host); - - /* Signal that lock has been dropped: */ - set_fdm_dropped_locks(); - return VM_FAULT_SIGBUS; - } - - bch2_pagecache_add_get(inode); -got_lock: - ret = filemap_fault(vmf); - bch2_pagecache_add_put(inode); - - return ret; -} - -vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) -{ - struct folio *folio = page_folio(vmf->page); - struct file *file = vmf->vma->vm_file; - struct bch_inode_info *inode = 
file_bch_inode(file); - struct address_space *mapping = file->f_mapping; - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation res; - vm_fault_t ret; - - loff_t file_offset = round_down(vmf->pgoff << PAGE_SHIFT, block_bytes(c)); - unsigned offset = file_offset - folio_pos(folio); - unsigned len = max(PAGE_SIZE, block_bytes(c)); - - BUG_ON(offset + len > folio_size(folio)); - - bch2_folio_reservation_init(c, inode, &res); - - sb_start_pagefault(inode->v.i_sb); - file_update_time(file); - - /* - * Not strictly necessary, but helps avoid dio writes livelocking in - * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get - * a bch2_write_invalidate_inode_pages_range() that works without dropping - * page lock before invalidating page - */ - bch2_pagecache_add_get(inode); - - folio_lock(folio); - u64 isize = i_size_read(&inode->v); - - if (folio->mapping != mapping || file_offset >= isize) { - folio_unlock(folio); - ret = VM_FAULT_NOPAGE; - goto out; - } - - len = min_t(unsigned, len, isize - file_offset); - - if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: - bch2_folio_reservation_get(c, inode, folio, &res, offset, len)) { - folio_unlock(folio); - ret = VM_FAULT_SIGBUS; - goto out; - } - - bch2_set_folio_dirty(c, inode, folio, &res, offset, len); - bch2_folio_reservation_put(c, inode, &res); - - folio_wait_stable(folio); - ret = VM_FAULT_LOCKED; -out: - bch2_pagecache_add_put(inode); - sb_end_pagefault(inode->v.i_sb); - - return ret; -} - -void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) -{ - if (offset || length < folio_size(folio)) - return; - - bch2_clear_folio_bits(folio); -} - -bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) -{ - if (folio_test_dirty(folio) || folio_test_writeback(folio)) - return false; - - bch2_clear_folio_bits(folio); - return true; -} - -/* fseek: */ - -static int folio_data_offset(struct folio *folio, loff_t pos, - unsigned min_replicas) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, sectors = folio_sectors(folio); - - if (s) - for (i = folio_pos_to_s(folio, pos); i < sectors; i++) - if (s->s[i].state >= SECTOR_dirty && - s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) - return i << SECTOR_SHIFT; - - return -1; -} - -loff_t bch2_seek_pagecache_data(struct inode *vinode, - loff_t start_offset, - loff_t end_offset, - unsigned min_replicas, - bool nonblock) -{ - struct folio_batch fbatch; - pgoff_t start_index = start_offset >> PAGE_SHIFT; - pgoff_t end_index = end_offset >> PAGE_SHIFT; - pgoff_t index = start_index; - unsigned i; - loff_t ret; - int offset; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(vinode->i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - - if (!nonblock) { - folio_lock(folio); - } else if (!folio_trylock(folio)) { - folio_batch_release(&fbatch); - return -EAGAIN; - } - - offset = folio_data_offset(folio, - max(folio_pos(folio), start_offset), - min_replicas); - if (offset >= 0) { - ret = clamp(folio_pos(folio) + offset, - start_offset, end_offset); - folio_unlock(folio); - folio_batch_release(&fbatch); - return ret; - } - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } - - return end_offset; -} - -/* - * Search for a hole in a folio. - * - * The filemap layer returns -ENOENT if no folio exists, so reuse the same error - * code to indicate a pagecache hole exists at the returned offset. 
Otherwise - * return 0 if the folio is filled with data, or an error code. This function - * can return -EAGAIN if nonblock is specified. - */ -static int folio_hole_offset(struct address_space *mapping, loff_t *offset, - unsigned min_replicas, bool nonblock) -{ - struct folio *folio; - struct bch_folio *s; - unsigned i, sectors; - int ret = -ENOENT; - - folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, - FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0); - if (IS_ERR(folio)) - return PTR_ERR(folio); - - s = bch2_folio(folio); - if (!s) - goto unlock; - - sectors = folio_sectors(folio); - for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) - if (s->s[i].state < SECTOR_dirty || - s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { - *offset = max(*offset, - folio_pos(folio) + (i << SECTOR_SHIFT)); - goto unlock; - } - - *offset = folio_end_pos(folio); - ret = 0; -unlock: - folio_unlock(folio); - folio_put(folio); - return ret; -} - -loff_t bch2_seek_pagecache_hole(struct inode *vinode, - loff_t start_offset, - loff_t end_offset, - unsigned min_replicas, - bool nonblock) -{ - struct address_space *mapping = vinode->i_mapping; - loff_t offset = start_offset; - loff_t ret = 0; - - while (!ret && offset < end_offset) - ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock); - - if (ret && ret != -ENOENT) - return ret; - return min(offset, end_offset); -} - -int bch2_clamp_data_hole(struct inode *inode, - u64 *hole_start, - u64 *hole_end, - unsigned min_replicas, - bool nonblock) -{ - loff_t ret; - - ret = bch2_seek_pagecache_hole(inode, - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; - if (ret < 0) - return ret; - - *hole_start = ret; - - if (*hole_start == *hole_end) - return 0; - - ret = bch2_seek_pagecache_data(inode, - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; - if (ret < 0) - return ret; - - *hole_end = ret; - return 0; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h deleted file mode 100644 index fad911cf5068..000000000000 --- a/fs/bcachefs/fs-io-pagecache.h +++ /dev/null @@ -1,176 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IO_PAGECACHE_H -#define _BCACHEFS_FS_IO_PAGECACHE_H - -#include <linux/pagemap.h> - -typedef DARRAY(struct folio *) folios; - -int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, - u64, fgf_t, gfp_t, folios *); -int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); - -/* - * Use u64 for the end pos and sector helpers because if the folio covers the - * max supported range of the mapping, the start offset of the next folio - * overflows loff_t. This breaks much of the range based processing in the - * buffered write path. 
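 * For a folio at the very top of the mapping's supported range,
 * folio_pos() + folio_size() is one past the largest representable
 * offset and would wrap negative as loff_t; doing the arithmetic in u64
 * keeps the end-position comparisons well defined.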
- */ -static inline u64 folio_end_pos(struct folio *folio) -{ - return folio_pos(folio) + folio_size(folio); -} - -static inline size_t folio_sectors(struct folio *folio) -{ - return PAGE_SECTORS << folio_order(folio); -} - -static inline loff_t folio_sector(struct folio *folio) -{ - return folio_pos(folio) >> 9; -} - -static inline u64 folio_end_sector(struct folio *folio) -{ - return folio_end_pos(folio) >> 9; -} - -#define BCH_FOLIO_SECTOR_STATE() \ - x(unallocated) \ - x(reserved) \ - x(dirty) \ - x(dirty_reserved) \ - x(allocated) - -enum bch_folio_sector_state { -#define x(n) SECTOR_##n, - BCH_FOLIO_SECTOR_STATE() -#undef x -}; - -struct bch_folio_sector { - /* Uncompressed, fully allocated replicas (or on disk reservation): */ - u8 nr_replicas:4, - /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ - replicas_reserved:4; - u8 state; -}; - -struct bch_folio { - spinlock_t lock; - atomic_t write_count; - /* - * Is the sector state up to date with the btree? - * (Not the data itself) - */ - bool uptodate; - struct bch_folio_sector s[]; -}; - -/* Helper for when we need to add debug instrumentation: */ -static inline void bch2_folio_sector_set(struct folio *folio, - struct bch_folio *s, - unsigned i, unsigned n) -{ - s->s[i].state = n; -} - -/* file offset (to folio offset) to bch_folio_sector index */ -static inline int folio_pos_to_s(struct folio *folio, loff_t pos) -{ - u64 f_offset = pos - folio_pos(folio); - - BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); - return f_offset >> SECTOR_SHIFT; -} - -/* for newly allocated folios: */ -static inline void __bch2_folio_release(struct folio *folio) -{ - kfree(folio_detach_private(folio)); -} - -static inline void bch2_folio_release(struct folio *folio) -{ - EBUG_ON(!folio_test_locked(folio)); - __bch2_folio_release(folio); -} - -static inline struct bch_folio *__bch2_folio(struct folio *folio) -{ - return folio_get_private(folio); -} - -static inline struct bch_folio *bch2_folio(struct folio *folio) -{ - EBUG_ON(!folio_test_locked(folio)); - - return __bch2_folio(folio); -} - -struct bch_folio *__bch2_folio_create(struct folio *, gfp_t); -struct bch_folio *bch2_folio_create(struct folio *, gfp_t); - -struct bch2_folio_reservation { - struct disk_reservation disk; - struct quota_res quota; -}; - -static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) -{ - /* XXX: this should not be open coded */ - return inode->ei_inode.bi_data_replicas - ? 
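/*
 * Per-inode options are stored with a +1 bias so that zero can mean
 * "unset, fall back to the filesystem-wide option":
 *
 *	bi_data_replicas == 0  ->  c->opts.data_replicas
 *	bi_data_replicas == n  ->  n - 1 replicas
 */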
inode->ei_inode.bi_data_replicas - 1 - : c->opts.data_replicas; -} - -static inline void bch2_folio_reservation_init(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch2_folio_reservation *res) -{ - memset(res, 0, sizeof(*res)); - - res->disk.nr_replicas = inode_nr_replicas(c, inode); -} - -int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); -void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); - -void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); -int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool); - -int bch2_get_folio_disk_reservation(struct bch_fs *, - struct bch_inode_info *, - struct folio *, bool); - -void bch2_folio_reservation_put(struct bch_fs *, - struct bch_inode_info *, - struct bch2_folio_reservation *); -int bch2_folio_reservation_get(struct bch_fs *, - struct bch_inode_info *, - struct folio *, - struct bch2_folio_reservation *, - size_t, size_t); -ssize_t bch2_folio_reservation_get_partial(struct bch_fs *, - struct bch_inode_info *, - struct folio *, - struct bch2_folio_reservation *, - size_t, size_t); - -void bch2_set_folio_dirty(struct bch_fs *, - struct bch_inode_info *, - struct folio *, - struct bch2_folio_reservation *, - unsigned, unsigned); - -vm_fault_t bch2_page_fault(struct vm_fault *); -vm_fault_t bch2_page_mkwrite(struct vm_fault *); -void bch2_invalidate_folio(struct folio *, size_t, size_t); -bool bch2_release_folio(struct folio *, gfp_t); - -loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool); -loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool); -int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); - -#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c deleted file mode 100644 index a233f45875e9..000000000000 --- a/fs/bcachefs/fs-io.c +++ /dev/null @@ -1,1102 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "clock.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "extent_update.h" -#include "fs.h" -#include "fs-io.h" -#include "fs-io-buffered.h" -#include "fs-io-pagecache.h" -#include "fsck.h" -#include "inode.h" -#include "journal.h" -#include "io_misc.h" -#include "keylist.h" -#include "quota.h" -#include "reflink.h" -#include "trace.h" - -#include <linux/aio.h> -#include <linux/backing-dev.h> -#include <linux/falloc.h> -#include <linux/migrate.h> -#include <linux/mmu_context.h> -#include <linux/pagevec.h> -#include <linux/rmap.h> -#include <linux/sched/signal.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/uio.h> - -#include <trace/events/writeback.h> - -struct nocow_flush { - struct closure *cl; - struct bch_dev *ca; - struct bio bio; -}; - -static void nocow_flush_endio(struct bio *_bio) -{ - - struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); - - closure_put(bio->cl); - enumerated_ref_put(&bio->ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_nocow_flush); - bio_put(&bio->bio); -} - -void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, - struct bch_inode_info *inode, - struct closure *cl) -{ - struct nocow_flush *bio; - struct bch_dev *ca; - struct bch_devs_mask devs; - unsigned dev; - - dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); - if (dev == BCH_SB_MEMBERS_MAX) - return; - - devs = 
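/*
 * The device mask is copied and cleared up front; every device that saw
 * a nocow write since the last flush then gets an empty REQ_PREFLUSH
 * bio -- a zero-payload write whose only effect is flushing that
 * device's volatile write cache:
 *
 *	bio_alloc_bioset(ca->disk_sb.bdev, 0, REQ_OP_WRITE|REQ_PREFLUSH,
 *			 GFP_KERNEL, &c->nocow_flush_bioset);
 */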
inode->ei_devs_need_flush; - memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - - for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { - scoped_guard(rcu) { - ca = rcu_dereference(c->devs[dev]); - if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_nocow_flush)) - ca = NULL; - } - - if (!ca) - continue; - - bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, - REQ_OP_WRITE|REQ_PREFLUSH, - GFP_KERNEL, - &c->nocow_flush_bioset), - struct nocow_flush, bio); - bio->cl = cl; - bio->ca = ca; - bio->bio.bi_end_io = nocow_flush_endio; - closure_bio_submit(&bio->bio, cl); - } -} - -static int bch2_inode_flush_nocow_writes(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct closure cl; - - closure_init_stack(&cl); - bch2_inode_flush_nocow_writes_async(c, inode, &cl); - closure_sync(&cl); - - return 0; -} - -/* i_size updates: */ - -struct inode_new_size { - loff_t new_size; - u64 now; - unsigned fields; -}; - -static int inode_set_size(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct inode_new_size *s = p; - - bi->bi_size = s->new_size; - if (s->fields & ATTR_ATIME) - bi->bi_atime = s->now; - if (s->fields & ATTR_MTIME) - bi->bi_mtime = s->now; - if (s->fields & ATTR_CTIME) - bi->bi_ctime = s->now; - - return 0; -} - -int __must_check bch2_write_inode_size(struct bch_fs *c, - struct bch_inode_info *inode, - loff_t new_size, unsigned fields) -{ - struct inode_new_size s = { - .new_size = new_size, - .now = bch2_current_time(c), - .fields = fields, - }; - - return bch2_write_inode(c, inode, inode_set_size, &s, fields); -} - -void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, s64 sectors) -{ - if (unlikely((s64) inode->v.i_blocks + sectors < 0)) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, sectors, - inode->ei_inode.bi_sectors); - - bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf); - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - if (sectors < 0) - sectors = -inode->v.i_blocks; - else - sectors = 0; - } - - inode->v.i_blocks += sectors; - -#ifdef CONFIG_BCACHEFS_QUOTA - if (quota_res && - !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && - sectors > 0) { - BUG_ON(sectors > quota_res->sectors); - BUG_ON(sectors > inode->ei_quota_reserved); - - quota_res->sectors -= sectors; - inode->ei_quota_reserved -= sectors; - } else { - bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); - } -#endif -} - -/* fsync: */ - -static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum, - u64 *seq) -{ - struct printbuf buf = PRINTBUF; - struct bch_inode_unpacked u; - struct btree_iter iter; - int ret = bch2_inode_peek(trans, &iter, &u, inum, 0); - if (ret) - return ret; - - u64 cur_seq = journal_cur_seq(&trans->c->journal); - *seq = min(cur_seq, u.bi_journal_seq); - - if (fsck_err_on(u.bi_journal_seq > cur_seq, - trans, inode_journal_seq_in_future, - "inode journal seq in future (currently at %llu)\n%s", - cur_seq, - (bch2_inode_unpacked_to_text(&buf, &u), - buf.buf))) { - u.bi_journal_seq = cur_seq; - ret = bch2_inode_write(trans, &iter, &u); - } -fsck_err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -} - -/* - * inode->ei_inode.bi_journal_seq won't be up to date 
since it's set in an - * insert trigger: look up the btree inode instead - */ -static int bch2_flush_inode(struct bch_fs *c, - struct bch_inode_info *inode) -{ - if (c->opts.journal_flush_disabled) - return 0; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync)) - return -EROFS; - - u64 seq; - int ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: - bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: - bch2_inode_flush_nocow_writes(c, inode); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync); - return ret; -} - -int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret, err; - - trace_bch2_fsync(file, datasync); - - ret = file_write_and_wait_range(file, start, end); - if (ret) - goto out; - ret = sync_inode_metadata(&inode->v, 1); - if (ret) - goto out; - ret = bch2_flush_inode(c, inode); -out: - ret = bch2_err_class(ret); - if (ret == -EROFS) - ret = -EIO; - - err = file_check_and_advance_wb_err(file); - if (!ret) - ret = err; - - return ret; -} - -/* truncate: */ - -static inline int range_has_data(struct bch_fs *c, u32 subvol, - struct bpos start, - struct bpos end) -{ - return bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, - subvol, 0, k, ({ - bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); - }))); -} - -static int __bch2_truncate_folio(struct bch_inode_info *inode, - pgoff_t index, loff_t start, loff_t end) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct address_space *mapping = inode->v.i_mapping; - struct bch_folio *s; - unsigned start_offset; - unsigned end_offset; - unsigned i; - struct folio *folio; - s64 i_sectors_delta = 0; - int ret = 0; - u64 end_pos; - - folio = filemap_lock_folio(mapping, index); - if (IS_ERR_OR_NULL(folio)) { - /* - * XXX: we're doing two index lookups when we end up reading the - * folio - */ - ret = range_has_data(c, inode->ei_inum.subvol, - POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), - POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); - if (ret <= 0) - return ret; - - folio = __filemap_get_folio(mapping, index, - FGP_LOCK|FGP_CREAT, GFP_KERNEL); - if (IS_ERR(folio)) { - ret = -ENOMEM; - goto out; - } - } - - BUG_ON(start >= folio_end_pos(folio)); - BUG_ON(end <= folio_pos(folio)); - - start_offset = max(start, folio_pos(folio)) - folio_pos(folio); - end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio); - - /* Folio boundary? Nothing to do */ - if (start_offset == 0 && - end_offset == folio_size(folio)) { - ret = 0; - goto unlock; - } - - s = bch2_folio_create(folio, 0); - if (!s) { - ret = -ENOMEM; - goto unlock; - } - - if (!folio_test_uptodate(folio)) { - ret = bch2_read_single_folio(folio, mapping); - if (ret) - goto unlock; - } - - ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); - if (ret) - goto unlock; - - for (i = round_up(start_offset, block_bytes(c)) >> 9; - i < round_down(end_offset, block_bytes(c)) >> 9; - i++) { - s->s[i].nr_replicas = 0; - - i_sectors_delta -= s->s[i].state == SECTOR_dirty; - bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); - } - - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - - /* - * Caller needs to know whether this folio will be written out by - * writeback - doing an i_size update if necessary - or whether it will - * be responsible for the i_size update. 
- * - * Note that we shouldn't ever see a folio beyond EOF, but check and - * warn if so. This has been observed with failures to clean up folios - * after a short write and there's still a chance reclaim will fix - * things up. - */ - WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); - end_pos = folio_end_pos(folio); - if (inode->v.i_size > folio_pos(folio)) - end_pos = min_t(u64, inode->v.i_size, end_pos); - ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty; - - folio_zero_segment(folio, start_offset, end_offset); - - /* - * Bit of a hack - we don't want truncate to fail due to -ENOSPC. - * - * XXX: because we aren't currently tracking whether the folio has actual - * data in it (vs. just 0s, or only partially written) this is wrong. ick. - */ - BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); - - /* - * This removes any writeable userspace mappings; we need to force - * .page_mkwrite to be called again before any mmapped writes, to - * redirty the full page: - */ - folio_mkclean(folio); - filemap_dirty_folio(mapping, folio); -unlock: - folio_unlock(folio); - folio_put(folio); -out: - return ret; -} - -static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from) -{ - return __bch2_truncate_folio(inode, from >> PAGE_SHIFT, - from, ANYSINT_MAX(loff_t)); -} - -static int bch2_truncate_folios(struct bch_inode_info *inode, - loff_t start, loff_t end) -{ - int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT, - start, end); - - if (ret >= 0 && - start >> PAGE_SHIFT != end >> PAGE_SHIFT) - ret = __bch2_truncate_folio(inode, - (end - 1) >> PAGE_SHIFT, - start, end); - return ret; -} - -static int bch2_extend(struct mnt_idmap *idmap, - struct bch_inode_info *inode, - struct bch_inode_unpacked *inode_u, - struct iattr *iattr) -{ - struct address_space *mapping = inode->v.i_mapping; - int ret; - - /* - * sync appends: - * - * this has to be done _before_ extending i_size: - */ - ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); - if (ret) - return ret; - - truncate_setsize(&inode->v, iattr->ia_size); - - return bch2_setattr_nonsize(idmap, inode, iattr); -} - -int bchfs_truncate(struct mnt_idmap *idmap, - struct bch_inode_info *inode, struct iattr *iattr) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct address_space *mapping = inode->v.i_mapping; - struct bch_inode_unpacked inode_u; - s64 i_sectors_delta = 0; - int ret = 0; - - /* - * If the truncate call will change the size of the file, the - * cmtimes should be updated. If the size will not change, we - * do not need to update the cmtimes. - */ - if (iattr->ia_size != inode->v.i_size) { - if (!(iattr->ia_valid & ATTR_MTIME)) - ktime_get_coarse_real_ts64(&iattr->ia_mtime); - if (!(iattr->ia_valid & ATTR_CTIME)) - ktime_get_coarse_real_ts64(&iattr->ia_ctime); - iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; - } - - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(inode); - - ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); - if (ret) - goto err; - - /* - * check this before the next assertion; on filesystem error our normal - * invariants are a bit broken (truncate has to truncate the page cache - * before the inode). 
- */ - ret = bch2_journal_error(&c->journal); - if (ret) - goto err; - - WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && - inode->v.i_size < inode_u.bi_size, - "truncate spotted in mem i_size < btree i_size: %llu < %llu\n", - (u64) inode->v.i_size, inode_u.bi_size); - - if (iattr->ia_size > inode->v.i_size) { - ret = bch2_extend(idmap, inode, &inode_u, iattr); - goto err; - } - - iattr->ia_valid &= ~ATTR_SIZE; - - ret = bch2_truncate_folio(inode, iattr->ia_size); - if (unlikely(ret < 0)) - goto err; - ret = 0; - - truncate_setsize(&inode->v, iattr->ia_size); - - /* - * When extending, we're going to write the new i_size to disk - * immediately so we need to flush anything above the current on disk - * i_size first: - * - * Also, when extending we need to flush the page that i_size currently - * straddles - if it's mapped to userspace, we need to ensure that - * userspace has to redirty it and call .mkwrite -> set_page_dirty - * again to allocate the part of the page that was extended. - */ - if (iattr->ia_size > inode_u.bi_size) - ret = filemap_write_and_wait_range(mapping, - inode_u.bi_size, - iattr->ia_size - 1); - else if (iattr->ia_size & (PAGE_SIZE - 1)) - ret = filemap_write_and_wait_range(mapping, - round_down(iattr->ia_size, PAGE_SIZE), - iattr->ia_size - 1); - if (ret) - goto err; - - ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta); - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - - if (unlikely(ret)) { - /* - * If we error here, VFS caches are now inconsistent with btree - */ - set_bit(EI_INODE_ERROR, &inode->ei_flags); - goto err; - } - - if (unlikely(!inode->v.i_size && inode->v.i_blocks && - !bch2_journal_error(&c->journal))) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, - inode->ei_inode.bi_sectors); - - bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf); - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - - ret = bch2_setattr_nonsize(idmap, inode, iattr); -err: - bch2_pagecache_block_put(inode); - return bch2_err_class(ret); -} - -/* fallocate: */ - -static int inode_update_times_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); - return 0; -} - -static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - u64 end = offset + len; - u64 block_start = round_up(offset, block_bytes(c)); - u64 block_end = round_down(end, block_bytes(c)); - bool truncated_last_page; - int ret = 0; - - ret = bch2_truncate_folios(inode, offset, end); - if (unlikely(ret < 0)) - goto err; - - truncated_last_page = ret; - - truncate_pagecache_range(&inode->v, offset, end - 1); - - if (block_start < block_end) { - s64 i_sectors_delta = 0; - - ret = bch2_fpunch(c, inode_inum(inode), - block_start >> 9, block_end >> 9, - &i_sectors_delta); - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - } - - mutex_lock(&inode->ei_update_lock); - if (end >= inode->v.i_size && !truncated_last_page) { - ret = bch2_write_inode_size(c, inode, inode->v.i_size, - ATTR_MTIME|ATTR_CTIME); - } else { - ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_MTIME|ATTR_CTIME); - } - mutex_unlock(&inode->ei_update_lock); 
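/*
 * Worked example of the alignment above, assuming 4096-byte blocks:
 * punching offset=1000, len=9000 gives block_start=4096 and
 * block_end=8192, so bytes 1000..4095 and 8192..9999 are zeroed in the
 * page cache by bch2_truncate_folios() while only 4096..8191 is punched
 * out of the extents btree.
 */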
-err: - return ret; -} - -static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode, - loff_t offset, loff_t len, - bool insert) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct address_space *mapping = inode->v.i_mapping; - s64 i_sectors_delta = 0; - int ret = 0; - - if ((offset | len) & (block_bytes(c) - 1)) - return -EINVAL; - - if (insert) { - if (offset >= inode->v.i_size) - return -EINVAL; - } else { - if (offset + len >= inode->v.i_size) - return -EINVAL; - } - - ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); - if (ret) - return ret; - - if (insert) - i_size_write(&inode->v, inode->v.i_size + len); - - ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9, - insert, &i_sectors_delta); - if (!ret && !insert) - i_size_write(&inode->v, inode->v.i_size - len); - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - - return ret; -} - -static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, - u64 start_sector, u64 end_sector) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bpos end_pos = POS(inode->v.i_ino, end_sector); - struct bch_io_opts opts; - int ret = 0; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inode->v.i_ino, start_sector), - BTREE_ITER_slots|BTREE_ITER_intent); - - while (!ret) { - s64 i_sectors_delta = 0; - struct quota_res quota_res = { 0 }; - struct bkey_s_c k; - unsigned sectors; - bool is_allocation; - u64 hole_start, hole_end; - u32 snapshot; - - bch2_trans_begin(trans); - - if (bkey_ge(iter.pos, end_pos)) - break; - - ret = bch2_subvolume_get_snapshot(trans, - inode->ei_inum.subvol, &snapshot); - if (ret) - goto bkey_err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - - k = bch2_btree_iter_peek_slot(trans, &iter); - if ((ret = bkey_err(k))) - goto bkey_err; - - hole_start = iter.pos.offset; - hole_end = bpos_min(k.k->p, end_pos).offset; - is_allocation = bkey_extent_is_allocation(k.k); - - /* already reserved */ - if (bkey_extent_is_reservation(k) && - bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { - bch2_btree_iter_advance(trans, &iter); - continue; - } - - if (bkey_extent_is_data(k.k) && - !(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_advance(trans, &iter); - continue; - } - - if (!(mode & FALLOC_FL_ZERO_RANGE)) { - /* - * Lock ordering - can't be holding btree locks while - * blocking on a folio lock: - */ - if (bch2_clamp_data_hole(&inode->v, - &hole_start, - &hole_end, - opts.data_replicas, true)) { - ret = drop_locks_do(trans, - (bch2_clamp_data_hole(&inode->v, - &hole_start, - &hole_end, - opts.data_replicas, false), 0)); - if (ret) - goto bkey_err; - } - bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start)); - - if (ret) - goto bkey_err; - - if (hole_start == hole_end) - continue; - } - - sectors = hole_end - hole_start; - - if (!is_allocation) { - ret = bch2_quota_reservation_add(c, inode, - "a_res, sectors, true); - if (unlikely(ret)) - goto bkey_err; - } - - ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter, - sectors, opts, &i_sectors_delta, - writepoint_hashed((unsigned long) current)); - if (ret) - goto bkey_err; - - bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); - - if (bch2_mark_pagecache_reserved(inode, &hole_start, - iter.pos.offset, true)) { - ret = drop_locks_do(trans, - bch2_mark_pagecache_reserved(inode, &hole_start, - 
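/*
 * Lock ordering again: btree node locks must not be held while blocking
 * on a folio lock, so the nonblocking attempt runs under the transaction
 * and the blocking retry goes through drop_locks_do(), which behaves
 * roughly like (an assumption about the macro, stated for clarity):
 *
 *	bch2_trans_unlock(trans);
 *	(expr);				-- blocking version is now safe
 *	ret = bch2_trans_relock(trans);	-- may report a restart
 */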
iter.pos.offset, false)); - if (ret) - goto bkey_err; - } -bkey_err: - bch2_quota_reservation_put(c, inode, &quota_res); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - } - - if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { - struct quota_res quota_res = { 0 }; - s64 i_sectors_delta = 0; - - bch2_fpunch_at(trans, &iter, inode_inum(inode), - end_sector, &i_sectors_delta); - bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta); - bch2_quota_reservation_put(c, inode, &quota_res); - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode, - loff_t offset, loff_t len) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - u64 end = offset + len; - u64 block_start = round_down(offset, block_bytes(c)); - u64 block_end = round_up(end, block_bytes(c)); - bool truncated_last_page = false; - int ret, ret2 = 0; - - if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { - ret = inode_newsize_ok(&inode->v, end); - if (ret) - return ret; - } - - if (mode & FALLOC_FL_ZERO_RANGE) { - ret = bch2_truncate_folios(inode, offset, end); - if (unlikely(ret < 0)) - return ret; - - truncated_last_page = ret; - - truncate_pagecache_range(&inode->v, offset, end - 1); - - block_start = round_up(offset, block_bytes(c)); - block_end = round_down(end, block_bytes(c)); - } - - ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); - - /* - * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, - * so that the VFS cache i_size is consistent with the btree i_size: - */ - if (ret && - !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) - return ret; - - if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) - end = inode->v.i_size; - - if (end >= inode->v.i_size && - (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || - !(mode & FALLOC_FL_KEEP_SIZE))) { - spin_lock(&inode->v.i_lock); - i_size_write(&inode->v, end); - spin_unlock(&inode->v.i_lock); - - mutex_lock(&inode->ei_update_lock); - ret2 = bch2_write_inode_size(c, inode, end, 0); - mutex_unlock(&inode->ei_update_lock); - } - - return ret ?: ret2; -} - -long bch2_fallocate_dispatch(struct file *file, int mode, - loff_t offset, loff_t len) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - long ret; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate)) - return -EROFS; - - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(inode); - - ret = file_modified(file); - if (ret) - goto err; - - if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) - ret = bchfs_fallocate(inode, mode, offset, len); - else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) - ret = bchfs_fpunch(inode, offset, len); - else if (mode == FALLOC_FL_INSERT_RANGE) - ret = bchfs_fcollapse_finsert(inode, offset, len, true); - else if (mode == FALLOC_FL_COLLAPSE_RANGE) - ret = bchfs_fcollapse_finsert(inode, offset, len, false); - else - ret = -EOPNOTSUPP; -err: - bch2_pagecache_block_put(inode); - inode_unlock(&inode->v); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate); - - return bch2_err_class(ret); -} - -/* - * Take a quota reservation for unallocated blocks in a given file range - * Does not check pagecache - */ -static int quota_reserve_range(struct bch_inode_info *inode, - struct quota_res *res, - u64 start, u64 end) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - u64 sectors =
end - start; - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, - BTREE_ID_extents, - POS(inode->v.i_ino, start), - POS(inode->v.i_ino, end - 1), - inode->ei_inum.subvol, 0, k, ({ - if (bkey_extent_is_allocation(k.k)) { - u64 s = min(end, k.k->p.offset) - - max(start, bkey_start_offset(k.k)); - BUG_ON(s > sectors); - sectors -= s; - } - - 0; - }))); - - return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); -} - -loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, - struct file *file_dst, loff_t pos_dst, - loff_t len, unsigned remap_flags) -{ - struct bch_inode_info *src = file_bch_inode(file_src); - struct bch_inode_info *dst = file_bch_inode(file_dst); - struct bch_fs *c = src->v.i_sb->s_fs_info; - struct quota_res quota_res = { 0 }; - s64 i_sectors_delta = 0; - u64 aligned_len; - loff_t ret = 0; - - if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) - return -EINVAL; - - if ((pos_src & (block_bytes(c) - 1)) || - (pos_dst & (block_bytes(c) - 1))) - return -EINVAL; - - if (src == dst && - abs(pos_src - pos_dst) < len) - return -EINVAL; - - lock_two_nondirectories(&src->v, &dst->v); - bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst); - - inode_dio_wait(&src->v); - inode_dio_wait(&dst->v); - - ret = generic_remap_file_range_prep(file_src, pos_src, - file_dst, pos_dst, - &len, remap_flags); - if (ret < 0 || len == 0) - goto err; - - aligned_len = round_up((u64) len, block_bytes(c)); - - ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, - pos_dst, pos_dst + len - 1); - if (ret) - goto err; - - ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9, - (pos_dst + aligned_len) >> 9); - if (ret) - goto err; - - if (!(remap_flags & REMAP_FILE_DEDUP)) - file_update_time(file_dst); - - bch2_mark_pagecache_unallocated(src, pos_src >> 9, - (pos_src + aligned_len) >> 9); - - /* - * XXX: we'd like to be telling bch2_remap_range() if we have - * permission to write to the source file, and thus if io path option - * changes should be propagated through the copy, but we need mnt_idmap - * from the pathwalk, awkward - */ - ret = bch2_remap_range(c, - inode_inum(dst), pos_dst >> 9, - inode_inum(src), pos_src >> 9, - aligned_len >> 9, - pos_dst + len, &i_sectors_delta, - false); - if (ret < 0) - goto err; - - /* - * due to alignment, we might have remapped slightly more than requested - */ - ret = min((u64) ret << 9, (u64) len); - - bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta); - - spin_lock(&dst->v.i_lock); - if (pos_dst + ret > dst->v.i_size) - i_size_write(&dst->v, pos_dst + ret); - spin_unlock(&dst->v.i_lock); - - if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || - IS_SYNC(file_inode(file_dst))) - ret = bch2_flush_inode(c, dst); -err: - bch2_quota_reservation_put(c, dst, &quota_res); - bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst); - unlock_two_nondirectories(&src->v, &dst->v); - - return bch2_err_class(ret); -} - -/* fseek: */ - -static loff_t bch2_seek_data(struct file *file, u64 offset) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - subvol_inum inum = inode_inum(inode); - u64 isize, next_data = MAX_LFS_FILESIZE; - - isize = i_size_read(&inode->v); - if (offset >= isize) - return -ENXIO; - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), - POS(inode->v.i_ino, U64_MAX), - inum.subvol, 0, k, ({ - if (bkey_extent_is_data(k.k)) { - next_data = max(offset,
bkey_start_offset(k.k) << 9); - break; - } else if (k.k->p.offset >> 9 > isize) - break; - 0; - }))); - if (ret) - return ret; - - if (next_data > offset) - next_data = bch2_seek_pagecache_data(&inode->v, - offset, next_data, 0, false); - - if (next_data >= isize) - return -ENXIO; - - return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); -} - -static loff_t bch2_seek_hole(struct file *file, u64 offset) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - subvol_inum inum = inode_inum(inode); - u64 isize, next_hole = MAX_LFS_FILESIZE; - - isize = i_size_read(&inode->v); - if (offset >= isize) - return -ENXIO; - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), - POS(inode->v.i_ino, U64_MAX), - inum.subvol, BTREE_ITER_slots, k, ({ - if (k.k->p.inode != inode->v.i_ino || - !bkey_extent_is_data(k.k)) { - loff_t start_offset = k.k->p.inode == inode->v.i_ino - ? max(offset, bkey_start_offset(k.k) << 9) - : offset; - loff_t end_offset = k.k->p.inode == inode->v.i_ino - ? MAX_LFS_FILESIZE - : k.k->p.offset << 9; - - /* - * Found a hole in the btree, now make sure it's - * a hole in the pagecache. We might have to - * keep searching if this hole is entirely dirty - * in the page cache: - */ - bch2_trans_unlock(trans); - loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v, - start_offset, end_offset, 0, false); - if (pagecache_hole < end_offset) { - next_hole = pagecache_hole; - break; - } - } else { - offset = max(offset, bkey_start_offset(k.k) << 9); - } - 0; - }))); - if (ret) - return ret; - - if (next_hole > isize) - next_hole = isize; - - return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); -} - -loff_t bch2_llseek(struct file *file, loff_t offset, int whence) -{ - loff_t ret; - - switch (whence) { - case SEEK_SET: - case SEEK_CUR: - case SEEK_END: - ret = generic_file_llseek(file, offset, whence); - break; - case SEEK_DATA: - ret = bch2_seek_data(file, offset); - break; - case SEEK_HOLE: - ret = bch2_seek_hole(file, offset); - break; - default: - ret = -EINVAL; - break; - } - - return bch2_err_class(ret); -} - -void bch2_fs_fsio_exit(struct bch_fs *c) -{ - bioset_exit(&c->nocow_flush_bioset); -} - -int bch2_fs_fsio_init(struct bch_fs *c) -{ - if (bioset_init(&c->nocow_flush_bioset, - 1, offsetof(struct nocow_flush, bio), 0)) - return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; - - return 0; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h deleted file mode 100644 index ca70346e68dc..000000000000 --- a/fs/bcachefs/fs-io.h +++ /dev/null @@ -1,184 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IO_H -#define _BCACHEFS_FS_IO_H - -#ifndef NO_BCACHEFS_FS - -#include "buckets.h" -#include "fs.h" -#include "io_write_types.h" -#include "quota.h" - -#include <linux/uio.h> - -struct folio_vec { - struct folio *fv_folio; - size_t fv_offset; - size_t fv_len; -}; - -static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) -{ - - struct folio *folio = page_folio(bv.bv_page); - size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + - bv.bv_offset; - size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); - - return (struct folio_vec) { - .fv_folio = folio, - .fv_offset = offset, - .fv_len = len, - }; -} - -static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, - struct bvec_iter iter) -{ - return biovec_to_foliovec(bio_iter_iovec(bio, iter)); -} - -#define 
__bio_for_each_folio(bvl, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ - bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) - -/** - * bio_for_each_folio - iterate over folios within a bio - * - * Like other non-_all versions, this iterates over what bio->bi_iter currently - * points to. This version is for drivers, where the bio may have previously - * been split or cloned. - */ -#define bio_for_each_folio(bvl, bio, iter) \ - __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) - -struct quota_res { - u64 sectors; -}; - -#ifdef CONFIG_BCACHEFS_QUOTA - -static inline void __bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) -{ - BUG_ON(res->sectors > inode->ei_quota_reserved); - - bch2_quota_acct(c, inode->ei_qid, Q_SPC, - -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); - inode->ei_quota_reserved -= res->sectors; - res->sectors = 0; -} - -static inline void bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) -{ - if (res->sectors) { - mutex_lock(&inode->ei_quota_lock); - __bch2_quota_reservation_put(c, inode, res); - mutex_unlock(&inode->ei_quota_lock); - } -} - -static inline int bch2_quota_reservation_add(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res, - u64 sectors, - bool check_enospc) -{ - int ret; - - if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) - return 0; - - mutex_lock(&inode->ei_quota_lock); - ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, - check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); - if (likely(!ret)) { - inode->ei_quota_reserved += sectors; - res->sectors += sectors; - } - mutex_unlock(&inode->ei_quota_lock); - - return ret; -} - -#else - -static inline void __bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) {} - -static inline void bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) {} - -static inline int bch2_quota_reservation_add(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res, - unsigned sectors, - bool check_enospc) -{ - return 0; -} - -#endif - -void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *, - struct quota_res *, s64); - -static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, s64 sectors) -{ - if (sectors) { - mutex_lock(&inode->ei_quota_lock); - __bch2_i_sectors_acct(c, inode, quota_res, sectors); - mutex_unlock(&inode->ei_quota_lock); - } -} - -static inline struct address_space *faults_disabled_mapping(void) -{ - return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); -} - -static inline void set_fdm_dropped_locks(void) -{ - current->faults_disabled_mapping = - (void *) (((unsigned long) current->faults_disabled_mapping)|1); -} - -static inline bool fdm_dropped_locks(void) -{ - return ((unsigned long) current->faults_disabled_mapping) & 1; -} - -void bch2_inode_flush_nocow_writes_async(struct bch_fs *, - struct bch_inode_info *, struct closure *); - -int __must_check bch2_write_inode_size(struct bch_fs *, - struct bch_inode_info *, - loff_t, unsigned); - -int bch2_fsync(struct file *, loff_t, loff_t, int); - -int bchfs_truncate(struct mnt_idmap *, - struct bch_inode_info *, struct iattr *); -long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); - -loff_t bch2_remap_file_range(struct 
file *, loff_t, struct file *, - loff_t, loff_t, unsigned); - -loff_t bch2_llseek(struct file *, loff_t, int); - -void bch2_fs_fsio_exit(struct bch_fs *); -int bch2_fs_fsio_init(struct bch_fs *); -#else -static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} -static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } -#endif - -#endif /* _BCACHEFS_FS_IO_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c deleted file mode 100644 index 4e72e654da96..000000000000 --- a/fs/bcachefs/fs-ioctl.c +++ /dev/null @@ -1,442 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "chardev.h" -#include "dirent.h" -#include "fs.h" -#include "fs-ioctl.h" -#include "namei.h" -#include "quota.h" - -#include <linux/compat.h> -#include <linux/fsnotify.h> -#include <linux/mount.h> -#include <linux/namei.h> -#include <linux/security.h> -#include <linux/writeback.h> - -#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) -#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ -#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ -#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ - -static int bch2_reinherit_attrs_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_inode_info *dir = p; - - return !bch2_reinherit_attrs(bi, &dir->ei_inode); -} - -static int bch2_ioc_reinherit_attrs(struct bch_fs *c, - struct file *file, - struct bch_inode_info *src, - const char __user *name) -{ - struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode); - struct bch_inode_info *dst; - struct inode *vinode = NULL; - char *kname = NULL; - struct qstr qstr; - int ret = 0; - subvol_inum inum; - - kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL); - if (!kname) - return -ENOMEM; - - ret = strncpy_from_user(kname, name, BCH_NAME_MAX); - if (unlikely(ret < 0)) - goto err1; - - qstr.len = ret; - qstr.name = kname; - - ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); - if (ret) - goto err1; - - vinode = bch2_vfs_inode_get(c, inum); - ret = PTR_ERR_OR_ZERO(vinode); - if (ret) - goto err1; - - dst = to_bch_ei(vinode); - - ret = mnt_want_write_file(file); - if (ret) - goto err2; - - bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); - - if (inode_attr_changing(src, dst, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, dst, - src->ei_qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - goto err3; - } - - ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); -err3: - bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); - - /* return true if we did work */ - if (ret >= 0) - ret = !ret; - - mnt_drop_write_file(file); -err2: - iput(vinode); -err1: - kfree(kname); - - return ret; -} - -static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg) -{ - return put_user(inode->v.i_generation, arg); -} - -static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label) -{ - int ret; - size_t len; - char label[BCH_SB_LABEL_SIZE]; - - BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX); - - mutex_lock(&c->sb_lock); - memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE); - mutex_unlock(&c->sb_lock); - - len = strnlen(label, BCH_SB_LABEL_SIZE); - if (len == BCH_SB_LABEL_SIZE) { - bch_warn(c, - "label is too long, return the first %zu bytes", - --len); - } - - ret = copy_to_user(user_label, label, len); - - return ret ? 
-EFAULT : 0; -} - -static int bch2_ioc_setlabel(struct bch_fs *c, - struct file *file, - struct bch_inode_info *inode, - const char __user *user_label) -{ - int ret; - char label[BCH_SB_LABEL_SIZE]; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(label, user_label, sizeof(label))) - return -EFAULT; - - if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) { - bch_err(c, - "unable to set label with more than %d bytes", - BCH_SB_LABEL_SIZE - 1); - return -EINVAL; - } - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - mutex_lock(&c->sb_lock); - strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE); - ret = bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - mnt_drop_write_file(file); - return ret; -} - -static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) -{ - u32 flags; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (get_user(flags, arg)) - return -EFAULT; - - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "shutdown by ioctl type %u", flags); - - switch (flags) { - case FSOP_GOING_FLAGS_DEFAULT: - ret = bdev_freeze(c->vfs_sb->s_bdev); - if (ret) - break; - bch2_journal_flush(&c->journal); - bch2_fs_emergency_read_only2(c, &buf); - bdev_thaw(c->vfs_sb->s_bdev); - break; - case FSOP_GOING_FLAGS_LOGFLUSH: - bch2_journal_flush(&c->journal); - fallthrough; - case FSOP_GOING_FLAGS_NOLOGFLUSH: - bch2_fs_emergency_read_only2(c, &buf); - break; - default: - ret = -EINVAL; - goto noprint; - } - - bch2_print_str(c, KERN_ERR, buf.buf); -noprint: - printbuf_exit(&buf); - return ret; -} - -static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) -{ - struct inode *dir; - struct bch_inode_info *inode; - struct user_namespace *s_user_ns; - struct dentry *dst_dentry; - struct path src_path, dst_path; - int how = LOOKUP_FOLLOW; - int error; - subvol_inum snapshot_src = { 0 }; - unsigned lookup_flags = 0; - unsigned create_flags = BCH_CREATE_SUBVOL; - - if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| - BCH_SUBVOL_SNAPSHOT_RO)) - return -EINVAL; - - if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && - (arg.src_ptr || - (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) - return -EINVAL; - - if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) - create_flags |= BCH_CREATE_SNAPSHOT; - - if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) - create_flags |= BCH_CREATE_SNAPSHOT_RO; - - if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) { - /* sync_inodes_sb enforce s_umount is locked */ - down_read(&c->vfs_sb->s_umount); - sync_inodes_sb(c->vfs_sb); - up_read(&c->vfs_sb->s_umount); - } - - if (arg.src_ptr) { - error = user_path_at(arg.dirfd, - (const char __user *)(unsigned long)arg.src_ptr, - how, &src_path); - if (error) - goto err1; - - if (src_path.dentry->d_sb->s_fs_info != c) { - path_put(&src_path); - error = -EXDEV; - goto err1; - } - - snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); - } - - dst_dentry = user_path_create(arg.dirfd, - (const char __user *)(unsigned long)arg.dst_ptr, - &dst_path, lookup_flags); - error = PTR_ERR_OR_ZERO(dst_dentry); - if (error) - goto err2; - - if (dst_dentry->d_sb->s_fs_info != c) { - error = -EXDEV; - goto err3; - } - - if (dst_dentry->d_inode) { - error = bch_err_throw(c, EEXIST_subvolume_create); - goto err3; - } - - dir = dst_path.dentry->d_inode; - if (IS_DEADDIR(dir)) { - error = bch_err_throw(c, ENOENT_directory_dead); - goto err3; - } - - s_user_ns = dir->i_sb->s_user_ns; - if (!kuid_has_mapping(s_user_ns, 
current_fsuid()) || - !kgid_has_mapping(s_user_ns, current_fsgid())) { - error = -EOVERFLOW; - goto err3; - } - - error = inode_permission(file_mnt_idmap(filp), - dir, MAY_WRITE | MAY_EXEC); - if (error) - goto err3; - - if (!IS_POSIXACL(dir)) - arg.mode &= ~current_umask(); - - error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); - if (error) - goto err3; - - if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && - !arg.src_ptr) - snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol; - - down_write(&c->snapshot_create_lock); - inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), - dst_dentry, arg.mode|S_IFDIR, - 0, snapshot_src, create_flags); - up_write(&c->snapshot_create_lock); - - error = PTR_ERR_OR_ZERO(inode); - if (error) - goto err3; - - d_instantiate(dst_dentry, &inode->v); - fsnotify_mkdir(dir, dst_dentry); -err3: - done_path_create(&dst_path, dst_dentry); -err2: - if (arg.src_ptr) - path_put(&src_path); -err1: - return error; -} - -static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) -{ - const char __user *name = (void __user *)(unsigned long)arg.dst_ptr; - struct path path; - struct inode *dir; - struct dentry *victim; - int ret = 0; - - if (arg.flags) - return -EINVAL; - - victim = user_path_locked_at(arg.dirfd, name, &path); - if (IS_ERR(victim)) - return PTR_ERR(victim); - - dir = d_inode(path.dentry); - if (victim->d_sb->s_fs_info != c) { - ret = -EXDEV; - goto err; - } - - ret = inode_permission(file_mnt_idmap(filp), d_inode(victim), MAY_WRITE) ?: - __bch2_unlink(dir, victim, true); - if (!ret) { - fsnotify_rmdir(dir, victim); - d_invalidate(victim); - } -err: - inode_unlock(dir); - dput(victim); - path_put(&path); - return ret; -} - -long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - long ret; - - switch (cmd) { - case BCHFS_IOC_REINHERIT_ATTRS: - ret = bch2_ioc_reinherit_attrs(c, file, inode, - (void __user *) arg); - break; - - case FS_IOC_GETVERSION: - ret = bch2_ioc_getversion(inode, (u32 __user *) arg); - break; - - case FS_IOC_SETVERSION: - ret = -ENOTTY; - break; - - case FS_IOC_GETFSLABEL: - ret = bch2_ioc_getlabel(c, (void __user *) arg); - break; - - case FS_IOC_SETFSLABEL: - ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg); - break; - - case FS_IOC_GOINGDOWN: - ret = bch2_ioc_goingdown(c, (u32 __user *) arg); - break; - - case BCH_IOCTL_SUBVOLUME_CREATE: { - struct bch_ioctl_subvolume i; - - ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) - ? -EFAULT - : bch2_ioctl_subvolume_create(c, file, i); - break; - } - - case BCH_IOCTL_SUBVOLUME_DESTROY: { - struct bch_ioctl_subvolume i; - - ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) - ? 
-EFAULT - : bch2_ioctl_subvolume_destroy(c, file, i); - break; - } - - default: - ret = bch2_fs_ioctl(c, cmd, (void __user *) arg); - break; - } - - return bch2_err_class(ret); -} - -#ifdef CONFIG_COMPAT -long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - /* These are just misnamed, they actually get/put from/to user an int */ - switch (cmd) { - case FS_IOC32_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; - case FS_IOC32_GETVERSION: - cmd = FS_IOC_GETVERSION; - break; - case FS_IOC_GETFSLABEL: - case FS_IOC_SETFSLABEL: - break; - default: - return -ENOIOCTLCMD; - } - return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); -} -#endif - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h deleted file mode 100644 index a657e4994b71..000000000000 --- a/fs/bcachefs/fs-ioctl.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IOCTL_H -#define _BCACHEFS_FS_IOCTL_H - -long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); -long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); - -#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c deleted file mode 100644 index 687af0eea0c2..000000000000 --- a/fs/bcachefs/fs.c +++ /dev/null @@ -1,2768 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "acl.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "chardev.h" -#include "dirent.h" -#include "errcode.h" -#include "extents.h" -#include "fs.h" -#include "fs-io.h" -#include "fs-ioctl.h" -#include "fs-io-buffered.h" -#include "fs-io-direct.h" -#include "fs-io-pagecache.h" -#include "fsck.h" -#include "inode.h" -#include "io_read.h" -#include "journal.h" -#include "keylist.h" -#include "namei.h" -#include "quota.h" -#include "rebalance.h" -#include "snapshot.h" -#include "super.h" -#include "xattr.h" -#include "trace.h" - -#include <linux/aio.h> -#include <linux/backing-dev.h> -#include <linux/exportfs.h> -#include <linux/fiemap.h> -#include <linux/fileattr.h> -#include <linux/fs_context.h> -#include <linux/module.h> -#include <linux/pagemap.h> -#include <linux/posix_acl.h> -#include <linux/random.h> -#include <linux/seq_file.h> -#include <linux/siphash.h> -#include <linux/statfs.h> -#include <linux/string.h> -#include <linux/xattr.h> - -static struct kmem_cache *bch2_inode_cache; - -static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, - struct bch_inode_info *, - struct bch_inode_unpacked *, - struct bch_subvolume *); - -/* Set VFS inode flags from bcachefs inode: */ -static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode) -{ - static const __maybe_unused unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_sync] = S_SYNC, - [__BCH_INODE_immutable] = S_IMMUTABLE, - [__BCH_INODE_append] = S_APPEND, - [__BCH_INODE_noatime] = S_NOATIME, - }; - - set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); - - if (bch2_inode_casefold(c, &inode->ei_inode)) - inode->v.i_flags |= S_CASEFOLD; - else - inode->v.i_flags &= ~S_CASEFOLD; -} - -void bch2_inode_update_after_write(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - unsigned fields) -{ - struct bch_fs *c = trans->c; - - BUG_ON(bi->bi_inum != inode->v.i_ino); - - bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum)); - - 
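The ioctl switch above routes FS_IOC_GETFSLABEL and FS_IOC_SETFSLABEL, the generic label interface from <linux/fs.h>, so filesystem-agnostic label tooling works on bcachefs unchanged. A minimal sketch of the read side (the mount point path is a placeholder):

/* getlabel.c - read a filesystem label via the generic ioctl */
#include <fcntl.h>
#include <linux/fs.h>		/* FS_IOC_GETFSLABEL, FSLABEL_MAX */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char label[FSLABEL_MAX] = "";
	int fd = open(argc > 1 ? argv[1] : "/mnt", O_RDONLY);

	if (fd < 0 || ioctl(fd, FS_IOC_GETFSLABEL, label) < 0) {
		perror("FS_IOC_GETFSLABEL");
		return 1;
	}
	printf("label: %s\n", label);
	close(fd);
	return 0;
}

The BUILD_BUG_ON in bch2_ioc_getlabel() above relies on BCH_SB_LABEL_SIZE being smaller than FSLABEL_MAX, so the copy into a FSLABEL_MAX-sized buffer like this one cannot overflow.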
set_nlink(&inode->v, bch2_inode_nlink_get(bi)); - i_uid_write(&inode->v, bi->bi_uid); - i_gid_write(&inode->v, bi->bi_gid); - inode->v.i_mode = bi->bi_mode; - - if (fields & ATTR_SIZE) - i_size_write(&inode->v, bi->bi_size); - - if (fields & ATTR_ATIME) - inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); - if (fields & ATTR_MTIME) - inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime)); - if (fields & ATTR_CTIME) - inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); - - inode->ei_inode = *bi; - - bch2_inode_flags_to_vfs(c, inode); -} - -int __must_check bch2_write_inode(struct bch_fs *c, - struct bch_inode_info *inode, - inode_set_fn set, - void *p, unsigned fields) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = {}; - struct bch_inode_unpacked inode_u; - int ret; -retry: - bch2_trans_begin(trans); - - ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); - if (ret) - goto err; - - struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); - - ret = (set ? set(trans, inode, &inode_u, p) : 0); - if (ret) - goto err; - - struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); - bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r)); - - if (rebalance_changed) { - ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); - if (ret) - goto err; - } - - ret = bch2_inode_write(trans, &iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - - /* - * the btree node lock protects inode->ei_inode, not ei_update_lock; - * this is important for inode updates via bchfs_write_index_update - */ - if (!ret) - bch2_inode_update_after_write(trans, inode, &inode_u, fields); -err: - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (rebalance_changed) - bch2_rebalance_wakeup(c); - - bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, - "%s: inode %llu:%llu not found when updating", - bch2_err_str(ret), - inode_inum(inode).subvol, - inode_inum(inode).inum); - - bch2_trans_put(trans); - return ret < 0 ? 
ret : 0; -} - -int bch2_fs_quota_transfer(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch_qid new_qid, - unsigned qtypes, - enum quota_acct_mode mode) -{ - unsigned i; - int ret; - - qtypes &= enabled_qtypes(c); - - for (i = 0; i < QTYP_NR; i++) - if (new_qid.q[i] == inode->ei_qid.q[i]) - qtypes &= ~(1U << i); - - if (!qtypes) - return 0; - - mutex_lock(&inode->ei_quota_lock); - - ret = bch2_quota_transfer(c, qtypes, new_qid, - inode->ei_qid, - inode->v.i_blocks + - inode->ei_quota_reserved, - mode); - if (!ret) - for (i = 0; i < QTYP_NR; i++) - if (qtypes & (1 << i)) - inode->ei_qid.q[i] = new_qid.q[i]; - - mutex_unlock(&inode->ei_quota_lock); - - return ret; -} - -static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) -{ - const subvol_inum *inum = data; - siphash_key_t k = { .key[0] = seed }; - - return siphash_2u64(inum->subvol, inum->inum, &k); -} - -static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) -{ - const struct bch_inode_info *inode = data; - - return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); -} - -static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct bch_inode_info *inode = obj; - const subvol_inum *v = arg->key; - - return !subvol_inum_eq(inode->ei_inum, *v); -} - -static const struct rhashtable_params bch2_vfs_inodes_params = { - .head_offset = offsetof(struct bch_inode_info, hash), - .key_offset = offsetof(struct bch_inode_info, ei_inum), - .key_len = sizeof(subvol_inum), - .hashfn = bch2_vfs_inode_hash_fn, - .obj_hashfn = bch2_vfs_inode_obj_hash_fn, - .obj_cmpfn = bch2_vfs_inode_cmp_fn, - .automatic_shrinking = true, -}; - -static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { - .head_offset = offsetof(struct bch_inode_info, by_inum_hash), - .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), - .key_len = sizeof(u64), - .automatic_shrinking = true, -}; - -int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) -{ - struct bch_fs *c = trans->c; - struct rhltable *ht = &c->vfs_inodes_by_inum_table; - u64 inum = p.offset; - DARRAY(u32) subvols; - int ret = 0; - - if (!test_bit(BCH_FS_started, &c->flags)) - return false; - - darray_init(&subvols); -restart_from_top: - - /* - * Tweaked version of __rhashtable_lookup(); we need to get a list of - * subvolumes in which the given inode number is open. - * - * For this to work, we don't include the subvolume ID in the key that - * we hash - all inodes with the same inode number regardless of - * subvolume will hash to the same slot. - * - * This will be less than ideal if the same file is ever open - * simultaneously in many different snapshots: - */ - rcu_read_lock(); - struct rhash_lock_head __rcu *const *bkt; - struct rhash_head *he; - unsigned int hash; - struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); -restart: - hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); - bkt = rht_bucket(tbl, hash); - do { - struct bch_inode_info *inode; - - rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { - if (inode->ei_inum.inum == inum) { - ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, - GFP_NOWAIT|__GFP_NOWARN); - if (ret) { - rcu_read_unlock(); - ret = darray_make_room(&subvols, 1); - if (ret) - goto err; - subvols.nr = 0; - goto restart_from_top; - } - } - } - /* An object might have been moved to a different hash chain, - * while we walk along it - better check and retry. 
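As the comment above notes, the subvolume ID is deliberately left out of the hashed key, so every (subvolume, inum) pair for one inode number collides into the same bucket and a single bucket walk enumerates all subvolumes holding that inode open. A toy illustration of the partial-key trick (fixed-size table, no RCU, no resizing):

#include <stdio.h>

struct open_inode { unsigned subvol, inum; };

/* Hash only the inum so that entries differing just in subvolume
 * collide into the same bucket on purpose. */
static unsigned bucket_of(unsigned inum)
{
	return (inum * 2654435761u) % 64;
}

int main(void)
{
	struct open_inode tbl[] = {
		{ .subvol = 1, .inum = 4096 },
		{ .subvol = 7, .inum = 4096 },	/* same inum, a snapshot */
		{ .subvol = 1, .inum = 4097 },
	};
	unsigned target = 4096, b = bucket_of(target);

	/* One bucket scan finds every subvolume with inum 4096 open. */
	for (unsigned i = 0; i < sizeof(tbl) / sizeof(tbl[0]); i++)
		if (bucket_of(tbl[i].inum) == b && tbl[i].inum == target)
			printf("inum %u open in subvol %u\n",
			       target, tbl[i].subvol);
	return 0;
}

The trade-off, as the code's own comment admits, is extra collisions when the same file is open simultaneously in many snapshots.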
- */ - } while (he != RHT_NULLS_MARKER(bkt)); - - /* Ensure we see any new tables. */ - smp_rmb(); - - tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); - if (unlikely(tbl)) - goto restart; - rcu_read_unlock(); - - darray_for_each(subvols, i) { - u32 snap; - ret = bch2_subvolume_get_snapshot(trans, *i, &snap); - if (ret) - goto err; - - ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); - if (ret) - break; - } -err: - darray_exit(&subvols); - return ret; -} - -static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) -{ - return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); -} - -static void __wait_on_freeing_inode(struct bch_fs *c, - struct bch_inode_info *inode, - subvol_inum inum) -{ - wait_queue_head_t *wq; - struct wait_bit_queue_entry wait; - - wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->v.i_lock); - - if (__bch2_inode_hash_find(c, inum) == inode) - schedule_timeout(HZ * 10); - finish_wait(wq, &wait.wq_entry); -} - -static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans, - subvol_inum inum) -{ - struct bch_inode_info *inode; -repeat: - inode = __bch2_inode_hash_find(c, inum); - if (inode) { - spin_lock(&inode->v.i_lock); - if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) { - spin_unlock(&inode->v.i_lock); - return NULL; - } - if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) { - if (!trans) { - __wait_on_freeing_inode(c, inode, inum); - } else { - int ret = drop_locks_do(trans, - (__wait_on_freeing_inode(c, inode, inum), 0)); - if (ret) - return ERR_PTR(ret); - } - goto repeat; - } - __iget(&inode->v); - spin_unlock(&inode->v.i_lock); - } - - return inode; -} - -static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode) -{ - spin_lock(&inode->v.i_lock); - bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags); - spin_unlock(&inode->v.i_lock); - - if (remove) { - int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, - &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); - BUG_ON(ret); - - ret = rhashtable_remove_fast(&c->vfs_inodes_table, - &inode->hash, bch2_vfs_inodes_params); - BUG_ON(ret); - inode->v.i_hash.pprev = NULL; - /* - * This pairs with the bch2_inode_hash_find() -> - * __wait_on_freeing_inode() path - */ - inode_wake_up_bit(&inode->v, __I_NEW); - } -} - -static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, - struct btree_trans *trans, - struct bch_inode_info *inode) -{ - struct bch_inode_info *old = inode; - - set_bit(EI_INODE_HASHED, &inode->ei_flags); -retry: - if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table, - &inode->ei_inum, - &inode->hash, - bch2_vfs_inodes_params))) { - old = bch2_inode_hash_find(c, trans, inode->ei_inum); - if (!old) - goto retry; - - clear_bit(EI_INODE_HASHED, &inode->ei_flags); - - /* - * bcachefs doesn't use I_NEW; we have no use for it since we - * only insert fully created inodes in the inode hash table. But - * discard_new_inode() expects it to be set... - */ - inode->v.i_state |= I_NEW; - /* - * We don't want bch2_evict_inode() to delete the inode on disk, - * we just raced and had another inode in cache. Normally new - * inodes don't have nlink == 0 - except tmpfiles do... 
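The insert path here is an optimistic lookup-or-insert: attempt the rhashtable insert first, and if another thread won the race, take a reference on the winner and tear down the locally built inode via discard_new_inode(). A simplified single-lock analogue of that control flow (a toy list stands in for the rhashtable, an integer for the refcount):

#include <pthread.h>
#include <stdlib.h>

struct obj { int key, refs; struct obj *next; };

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *table;	/* toy stand-in for the hash table */

/* Insert 'fresh' unless the key is already present; on a lost race,
 * return the existing object with a reference taken (the __iget()
 * analogue) and free the loser (the discard_new_inode() analogue). */
static struct obj *insert_or_get(struct obj *fresh)
{
	pthread_mutex_lock(&table_lock);
	for (struct obj *o = table; o; o = o->next)
		if (o->key == fresh->key) {
			o->refs++;
			pthread_mutex_unlock(&table_lock);
			free(fresh);
			return o;
		}
	fresh->next = table;
	table = fresh;
	pthread_mutex_unlock(&table_lock);
	return fresh;
}

The real code does this locklessly, which is why the loser must be fully constructed before the insert is attempted and why it is disposed of through the regular inode-eviction machinery rather than a plain free.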
- */ - set_nlink(&inode->v, 1); - discard_new_inode(&inode->v); - return old; - } else { - int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, - &inode->by_inum_hash, - bch2_vfs_inodes_by_inum_params); - BUG_ON(ret); - - inode_fake_hash(&inode->v); - - inode_sb_list_add(&inode->v); - - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); - return inode; - } -} - -#define memalloc_flags_do(_flags, _do) \ -({ \ - unsigned _saved_flags = memalloc_flags_save(_flags); \ - typeof(_do) _ret = _do; \ - memalloc_noreclaim_restore(_saved_flags); \ - _ret; \ -}) - -static struct inode *bch2_alloc_inode(struct super_block *sb) -{ - BUG(); -} - -static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp) -{ - struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb, - bch2_inode_cache, gfp); - if (!inode) - return NULL; - - inode_init_once(&inode->v); - mutex_init(&inode->ei_update_lock); - two_state_lock_init(&inode->ei_pagecache_lock); - INIT_LIST_HEAD(&inode->ei_vfs_inode_list); - inode->ei_flags = 0; - mutex_init(&inode->ei_quota_lock); - memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - - if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) { - kmem_cache_free(bch2_inode_cache, inode); - return NULL; - } - - return inode; -} - -/* - * Allocate a new inode, dropping/retaking btree locks if necessary: - */ -static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) -{ - struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT); - - if (unlikely(!inode)) { - int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM); - if (ret && inode) { - __destroy_inode(&inode->v); - kmem_cache_free(bch2_inode_cache, inode); - } - if (ret) - return ERR_PTR(ret); - } - - return inode; -} - -static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *bi, - struct bch_subvolume *subvol) -{ - struct bch_inode_info *inode = bch2_new_inode(trans); - if (IS_ERR(inode)) - return inode; - - bch2_vfs_inode_init(trans, inum, inode, bi, subvol); - - return bch2_inode_hash_insert(trans->c, trans, inode); - -} - -struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) -{ - struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum); - if (inode) - return &inode->v; - - struct btree_trans *trans = bch2_trans_get(c); - - struct bch_inode_unpacked inode_u; - struct bch_subvolume subvol; - int ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: - PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); - bch2_trans_put(trans); - - return ret ? 
ERR_PTR(ret) : &inode->v; -} - -struct bch_inode_info * -__bch2_create(struct mnt_idmap *idmap, - struct bch_inode_info *dir, struct dentry *dentry, - umode_t mode, dev_t rdev, subvol_inum snapshot_src, - unsigned flags) -{ - struct bch_fs *c = dir->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct bch_inode_unpacked dir_u; - struct bch_inode_info *inode; - struct bch_inode_unpacked inode_u; - struct posix_acl *default_acl = NULL, *acl = NULL; - subvol_inum inum; - struct bch_subvolume subvol; - u64 journal_seq = 0; - kuid_t kuid; - kgid_t kgid; - int ret; - - /* - * preallocate acls + vfs inode before btree transaction, so that - * nothing can fail after the transaction succeeds: - */ -#ifdef CONFIG_BCACHEFS_POSIX_ACL - ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); - if (ret) - return ERR_PTR(ret); -#endif - inode = __bch2_new_inode(c, GFP_NOFS); - if (unlikely(!inode)) { - inode = ERR_PTR(-ENOMEM); - goto err; - } - - bch2_inode_init_early(c, &inode_u); - - if (!(flags & BCH_CREATE_TMPFILE)) - mutex_lock(&dir->ei_update_lock); - - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - kuid = mapped_fsuid(idmap, i_user_ns(&dir->v)); - kgid = mapped_fsgid(idmap, i_user_ns(&dir->v)); - ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: - bch2_create_trans(trans, - inode_inum(dir), &dir_u, &inode_u, - !(flags & BCH_CREATE_TMPFILE) - ? &dentry->d_name : NULL, - from_kuid(i_user_ns(&dir->v), kuid), - from_kgid(i_user_ns(&dir->v), kgid), - mode, rdev, - default_acl, acl, snapshot_src, flags) ?: - bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, - KEY_TYPE_QUOTA_PREALLOC); - if (unlikely(ret)) - goto err_before_quota; - - inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; - inum.inum = inode_u.bi_inum; - - ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_trans_commit(trans, NULL, &journal_seq, 0); - if (unlikely(ret)) { - bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, - KEY_TYPE_QUOTA_WARN); -err_before_quota: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - goto err_trans; - } - - if (!(flags & BCH_CREATE_TMPFILE)) { - bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - mutex_unlock(&dir->ei_update_lock); - } - - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - - set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); - set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); - - /* - * we must insert the new inode into the inode cache before calling - * bch2_trans_exit() and dropping locks, else we could race with another - * thread pulling the inode in and modifying it: - * - * also, calling bch2_inode_hash_insert() without passing in the - * transaction object is sketchy - if we could ever end up in - * __wait_on_freeing_inode(), we'd risk deadlock. - * - * But that shouldn't be possible, since we still have the inode locked - * that we just created, and we _really_ can't take a transaction - * restart here. 
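The create path above shows the shape used for every btree update in this file: begin a transaction, build the update, try to commit, and jump back to the retry label when the commit fails with a transaction-restart error, while hard errors unwind. A stripped-down userspace analogue of the control flow (begin() and try_commit() are hypothetical stand-ins, not bcachefs API):

#include <stdio.h>

enum { OK = 0, ERR_RESTART = -1, ERR_HARD = -2 };

/* Hypothetical stand-ins: a real begin() would re-take locks and
 * revalidate iterators; try_commit() would detect lock conflicts. */
static void begin(void) { }
static int try_commit(int attempt) { return attempt < 2 ? ERR_RESTART : OK; }

static int update_with_retry(void)
{
	int ret, attempt = 0;
retry:
	begin();			/* bch2_trans_begin() analogue */
	ret = try_commit(attempt++);
	if (ret == ERR_RESTART)		/* conflict: run the body again */
		goto retry;
	return ret;			/* OK or a hard error */
}

int main(void)
{
	printf("committed, ret=%d\n", update_with_retry());
	return 0;
}

Note how the quota accounting above is reversed before the restart check runs: side effects taken during a failed attempt must be undone before the loop goes around again.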
- */ - inode = bch2_inode_hash_insert(c, NULL, inode); - bch2_trans_put(trans); -err: - posix_acl_release(default_acl); - posix_acl_release(acl); - return inode; -err_trans: - if (!(flags & BCH_CREATE_TMPFILE)) - mutex_unlock(&dir->ei_update_lock); - - bch2_trans_put(trans); - make_bad_inode(&inode->v); - iput(&inode->v); - inode = ERR_PTR(ret); - goto err; -} - -/* methods */ - -static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, - subvol_inum dir, struct bch_hash_info *dir_hash_info, - const struct qstr *name) -{ - struct bch_fs *c = trans->c; - subvol_inum inum = {}; - struct printbuf buf = PRINTBUF; - - struct qstr lookup_name; - int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name); - if (ret) - return ERR_PTR(ret); - - struct btree_iter dirent_iter = {}; - struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, - dir_hash_info, dir, &lookup_name, 0); - ret = bkey_err(k); - if (ret) - return ERR_PTR(ret); - - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - - ret = bch2_dirent_read_target(trans, dir, d, &inum); - if (ret > 0) - ret = -ENOENT; - if (ret) - goto err; - - struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum); - if (inode) - goto out; - - /* - * Note: if check/repair needs it, we commit before - * bch2_inode_hash_init_insert(), as after that point we can't take a - * restart - not in the top level loop with a commit_do(), like we - * usually do: - */ - - struct bch_subvolume subvol; - struct bch_inode_unpacked inode_u; - ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: - bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); - - /* - * don't remove it: check_inodes might find another inode that points - * back to this dirent - */ - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), - c, "dirent to missing inode:\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)); - if (ret) - goto err; -out: - bch2_trans_iter_exit(trans, &dirent_iter); - printbuf_exit(&buf); - return inode; -err: - inode = ERR_PTR(ret); - goto out; -} - -static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, - unsigned int flags) -{ - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir); - struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); - - struct bch_inode_info *inode; - bch2_trans_do(c, - PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), - &hash, &dentry->d_name))); - if (IS_ERR(inode)) - inode = NULL; - - if (!inode && IS_CASEFOLDED(vdir)) { - /* - * Do not cache a negative dentry in casefolded directories - * as it would need to be invalidated in the following situation: - * - Lookup file "blAH" in a casefolded directory - * - Creation of file "BLAH" in a casefolded directory - * - Lookup file "blAH" in a casefolded directory - * which would fail if we had a negative dentry. - * - * We should come back to this when VFS has a method to handle - * this edgecase. 
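The rule above exists because, under casefolding, distinct byte strings can name the same directory entry, so a cached negative dentry for one spelling would go stale the moment the name is created under another spelling. The aliasing itself is easy to see (ASCII-only here; the kernel uses full Unicode casefolding):

#include <stdio.h>
#include <strings.h>

int main(void)
{
	/* "blAH" and "BLAH" are different byte strings but the same
	 * name in a casefolded directory, so a negative lookup cached
	 * for one is invalidated by creating the other. */
	printf("alias? %s\n", !strcasecmp("blAH", "BLAH") ? "yes" : "no");
	return 0;
}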
- */ - return NULL; - } - - return d_splice_alias(&inode->v, dentry); -} - -static int bch2_mknod(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - struct bch_inode_info *inode = - __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, - (subvol_inum) { 0 }, 0); - - if (IS_ERR(inode)) - return bch2_err_class(PTR_ERR(inode)); - - d_instantiate(dentry, &inode->v); - return 0; -} - -static int bch2_create(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, - umode_t mode, bool excl) -{ - return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0); -} - -static int __bch2_link(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch_inode_info *dir, - struct dentry *dentry) -{ - struct bch_inode_unpacked dir_u, inode_u; - int ret; - - mutex_lock(&inode->ei_update_lock); - struct btree_trans *trans = bch2_trans_get(c); - - ret = commit_do(trans, NULL, NULL, 0, - bch2_link_trans(trans, - inode_inum(dir), &dir_u, - inode_inum(inode), &inode_u, - &dentry->d_name)); - - if (likely(!ret)) { - bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); - } - - bch2_trans_put(trans); - mutex_unlock(&inode->ei_update_lock); - return ret; -} - -static int bch2_link(struct dentry *old_dentry, struct inode *vdir, - struct dentry *dentry) -{ - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir); - struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); - int ret; - - lockdep_assert_held(&inode->v.i_rwsem); - - ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: - bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: - __bch2_link(c, inode, dir, dentry); - if (unlikely(ret)) - return bch2_err_class(ret); - - ihold(&inode->v); - d_instantiate(dentry, &inode->v); - return 0; -} - -int __bch2_unlink(struct inode *vdir, struct dentry *dentry, - bool deleting_snapshot) -{ - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir); - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct bch_inode_unpacked dir_u, inode_u; - int ret; - - bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); - - struct btree_trans *trans = bch2_trans_get(c); - - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_unlink_trans(trans, - inode_inum(dir), &dir_u, - &inode_u, &dentry->d_name, - deleting_snapshot)); - if (unlikely(ret)) - goto err; - - bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - bch2_inode_update_after_write(trans, inode, &inode_u, - ATTR_MTIME); - - if (inode_u.bi_subvol) { - /* - * Subvolume deletion is asynchronous, but we still want to tell - * the VFS that it's been deleted here: - */ - set_nlink(&inode->v, 0); - } - - if (IS_CASEFOLDED(vdir)) - d_invalidate(dentry); -err: - bch2_trans_put(trans); - bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); - - return ret; -} - -static int bch2_unlink(struct inode *vdir, struct dentry *dentry) -{ - struct bch_inode_info *dir= to_bch_ei(vdir); - struct bch_fs *c = dir->v.i_sb->s_fs_info; - - int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: - __bch2_unlink(vdir, dentry, false); - return bch2_err_class(ret); -} - -static int bch2_symlink(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, - const char *symname) -{ - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir), *inode; - int ret; - - inode = 
__bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, - (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); - if (IS_ERR(inode)) - return bch2_err_class(PTR_ERR(inode)); - - inode_lock(&inode->v); - ret = page_symlink(&inode->v, symname, strlen(symname) + 1); - inode_unlock(&inode->v); - - if (unlikely(ret)) - goto err; - - ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); - if (unlikely(ret)) - goto err; - - ret = __bch2_link(c, inode, dir, dentry); - if (unlikely(ret)) - goto err; - - d_instantiate(dentry, &inode->v); - return 0; -err: - iput(&inode->v); - return bch2_err_class(ret); -} - -static struct dentry *bch2_mkdir(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, umode_t mode) -{ - return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0)); -} - -static int bch2_rename2(struct mnt_idmap *idmap, - struct inode *src_vdir, struct dentry *src_dentry, - struct inode *dst_vdir, struct dentry *dst_dentry, - unsigned flags) -{ - struct bch_fs *c = src_vdir->i_sb->s_fs_info; - struct bch_inode_info *src_dir = to_bch_ei(src_vdir); - struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); - struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); - struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); - struct bch_inode_unpacked dst_dir_u, src_dir_u; - struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u; - struct btree_trans *trans; - enum bch_rename_mode mode = flags & RENAME_EXCHANGE - ? BCH_RENAME_EXCHANGE - : dst_dentry->d_inode - ? BCH_RENAME_OVERWRITE : BCH_RENAME; - bool whiteout = !!(flags & RENAME_WHITEOUT); - int ret; - - if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT)) - return -EINVAL; - - if (mode == BCH_RENAME_OVERWRITE) { - ret = filemap_write_and_wait_range(src_inode->v.i_mapping, - 0, LLONG_MAX); - if (ret) - return ret; - } - - bch2_lock_inodes(INODE_UPDATE_LOCK, - src_dir, - dst_dir, - src_inode, - dst_inode); - - trans = bch2_trans_get(c); - - ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: - bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); - if (ret) - goto err_tx_restart; - - if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, src_inode, - dst_dir->ei_qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - goto err; - } - - if (mode == BCH_RENAME_EXCHANGE && - inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, dst_inode, - src_dir->ei_qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - goto err; - } -retry: - bch2_trans_begin(trans); - - ret = bch2_rename_trans(trans, - inode_inum(src_dir), &src_dir_u, - inode_inum(dst_dir), &dst_dir_u, - &src_inode_u, - &dst_inode_u, - &src_dentry->d_name, - &dst_dentry->d_name, - mode); - if (unlikely(ret)) - goto err_tx_restart; - - if (whiteout) { - whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u)); - ret = PTR_ERR_OR_ZERO(whiteout_inode_u); - if (unlikely(ret)) - goto err_tx_restart; - bch2_inode_init_early(c, whiteout_inode_u); - - ret = bch2_create_trans(trans, - inode_inum(src_dir), &src_dir_u, - whiteout_inode_u, - &src_dentry->d_name, - from_kuid(i_user_ns(&src_dir->v), current_fsuid()), - from_kgid(i_user_ns(&src_dir->v), current_fsgid()), - S_IFCHR|WHITEOUT_MODE, 0, - NULL, NULL, (subvol_inum) { 0 }, 0) ?: - bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1, - KEY_TYPE_QUOTA_PREALLOC); - if (unlikely(ret)) - goto err_tx_restart; - } - - ret = 
bch2_trans_commit(trans, NULL, NULL, 0); - if (unlikely(ret)) { -err_tx_restart: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - goto err; - } - - BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); - BUG_ON(dst_inode && - dst_inode->v.i_ino != dst_inode_u.bi_inum); - - bch2_inode_update_after_write(trans, src_dir, &src_dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - - if (src_dir != dst_dir) - bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - - bch2_inode_update_after_write(trans, src_inode, &src_inode_u, - ATTR_CTIME); - - if (dst_inode) - bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u, - ATTR_CTIME); -err: - bch2_trans_put(trans); - - bch2_fs_quota_transfer(c, src_inode, - bch_qid(&src_inode->ei_inode), - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_NOCHECK); - if (dst_inode) - bch2_fs_quota_transfer(c, dst_inode, - bch_qid(&dst_inode->ei_inode), - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_NOCHECK); - - bch2_unlock_inodes(INODE_UPDATE_LOCK, - src_dir, - dst_dir, - src_inode, - dst_inode); - - return bch2_err_class(ret); -} - -static void bch2_setattr_copy(struct mnt_idmap *idmap, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - struct iattr *attr) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - unsigned int ia_valid = attr->ia_valid; - kuid_t kuid; - kgid_t kgid; - - if (ia_valid & ATTR_UID) { - kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); - bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid); - } - if (ia_valid & ATTR_GID) { - kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); - bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid); - } - - if (ia_valid & ATTR_SIZE) - bi->bi_size = attr->ia_size; - - if (ia_valid & ATTR_ATIME) - bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); - if (ia_valid & ATTR_MTIME) - bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); - if (ia_valid & ATTR_CTIME) - bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); - - if (ia_valid & ATTR_MODE) { - umode_t mode = attr->ia_mode; - kgid_t gid = ia_valid & ATTR_GID - ? 
kgid - : inode->v.i_gid; - - if (!in_group_or_capable(idmap, &inode->v, - make_vfsgid(idmap, i_user_ns(&inode->v), gid))) - mode &= ~S_ISGID; - bi->bi_mode = mode; - } -} - -int bch2_setattr_nonsize(struct mnt_idmap *idmap, - struct bch_inode_info *inode, - struct iattr *attr) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_qid qid; - struct btree_trans *trans; - struct btree_iter inode_iter = {}; - struct bch_inode_unpacked inode_u; - struct posix_acl *acl = NULL; - kuid_t kuid; - kgid_t kgid; - int ret; - - mutex_lock(&inode->ei_update_lock); - - qid = inode->ei_qid; - - if (attr->ia_valid & ATTR_UID) { - kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); - qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid); - } - - if (attr->ia_valid & ATTR_GID) { - kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); - qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid); - } - - ret = bch2_fs_quota_transfer(c, inode, qid, ~0, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - goto err; - - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - kfree(acl); - acl = NULL; - - ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_intent); - if (ret) - goto btree_err; - - bch2_setattr_copy(idmap, inode, &inode_u, attr); - - if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u, - inode_u.bi_mode, &acl); - if (ret) - goto btree_err; - } - - ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); -btree_err: - bch2_trans_iter_exit(trans, &inode_iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (unlikely(ret)) - goto err_trans; - - bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid); - - if (acl) - set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); -err_trans: - bch2_trans_put(trans); -err: - mutex_unlock(&inode->ei_update_lock); - - return bch2_err_class(ret); -} - -static int bch2_getattr(struct mnt_idmap *idmap, - const struct path *path, struct kstat *stat, - u32 request_mask, unsigned query_flags) -{ - struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v); - vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v); - - stat->dev = inode->v.i_sb->s_dev; - stat->ino = inode->v.i_ino; - stat->mode = inode->v.i_mode; - stat->nlink = inode->v.i_nlink; - stat->uid = vfsuid_into_kuid(vfsuid); - stat->gid = vfsgid_into_kgid(vfsgid); - stat->rdev = inode->v.i_rdev; - stat->size = i_size_read(&inode->v); - stat->atime = inode_get_atime(&inode->v); - stat->mtime = inode_get_mtime(&inode->v); - stat->ctime = inode_get_ctime(&inode->v); - stat->blksize = block_bytes(c); - stat->blocks = inode->v.i_blocks; - - stat->subvol = inode->ei_inum.subvol; - stat->result_mask |= STATX_SUBVOL; - - if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { - stat->result_mask |= STATX_DIOALIGN; - /* - * this is incorrect; we should be tracking this in superblock, - * and checking the alignment of open devices - */ - stat->dio_mem_align = SECTOR_SIZE; - stat->dio_offset_align = block_bytes(c); - } - - if (request_mask & STATX_BTIME) { - stat->result_mask |= STATX_BTIME; - stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); - } - - if (inode->ei_inode.bi_flags & BCH_INODE_immutable) - stat->attributes |= STATX_ATTR_IMMUTABLE; - stat->attributes_mask |= 
STATX_ATTR_IMMUTABLE; - - if (inode->ei_inode.bi_flags & BCH_INODE_append) - stat->attributes |= STATX_ATTR_APPEND; - stat->attributes_mask |= STATX_ATTR_APPEND; - - if (inode->ei_inode.bi_flags & BCH_INODE_nodump) - stat->attributes |= STATX_ATTR_NODUMP; - stat->attributes_mask |= STATX_ATTR_NODUMP; - - return 0; -} - -static int bch2_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, struct iattr *iattr) -{ - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; - - lockdep_assert_held(&inode->v.i_rwsem); - - ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: - setattr_prepare(idmap, dentry, iattr); - if (ret) - return ret; - - return iattr->ia_valid & ATTR_SIZE - ? bchfs_truncate(idmap, inode, iattr) - : bch2_setattr_nonsize(idmap, inode, iattr); -} - -static int bch2_tmpfile(struct mnt_idmap *idmap, - struct inode *vdir, struct file *file, umode_t mode) -{ - struct bch_inode_info *inode = - __bch2_create(idmap, to_bch_ei(vdir), - file->f_path.dentry, mode, 0, - (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); - - if (IS_ERR(inode)) - return bch2_err_class(PTR_ERR(inode)); - - d_mark_tmpfile(file, &inode->v); - d_instantiate(file->f_path.dentry, &inode->v); - return finish_open_simple(file, 0); -} - -struct bch_fiemap_extent { - struct bkey_buf kbuf; - unsigned flags; -}; - -static int bch2_fill_extent(struct bch_fs *c, - struct fiemap_extent_info *info, - struct bch_fiemap_extent *fe) -{ - struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k); - unsigned flags = fe->flags; - - BUG_ON(!k.k->size); - - if (bkey_extent_is_direct_data(k.k)) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - int ret; - - if (k.k->type == KEY_TYPE_reflink_v) - flags |= FIEMAP_EXTENT_SHARED; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int flags2 = 0; - u64 offset = p.ptr.offset; - - if (p.ptr.unwritten) - flags2 |= FIEMAP_EXTENT_UNWRITTEN; - - if (p.crc.compression_type) - flags2 |= FIEMAP_EXTENT_ENCODED; - else - offset += p.crc.offset; - - if ((offset & (block_sectors(c) - 1)) || - (k.k->size & (block_sectors(c) - 1))) - flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; - - ret = fiemap_fill_next_extent(info, - bkey_start_offset(k.k) << 9, - offset << 9, - k.k->size << 9, flags|flags2); - if (ret) - return ret; - } - - return 0; - } else if (bkey_extent_is_inline_data(k.k)) { - return fiemap_fill_next_extent(info, - bkey_start_offset(k.k) << 9, - 0, k.k->size << 9, - flags| - FIEMAP_EXTENT_DATA_INLINE); - } else if (k.k->type == KEY_TYPE_reservation) { - return fiemap_fill_next_extent(info, - bkey_start_offset(k.k) << 9, - 0, k.k->size << 9, - flags| - FIEMAP_EXTENT_DELALLOC| - FIEMAP_EXTENT_UNWRITTEN); - } else { - BUG(); - } -} - -/* - * Scan a range of an inode for data in pagecache. - * - * Intended to be retryable, so don't modify the output params until success is - * imminent. - */ -static int -bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end, - bool nonblock) -{ - loff_t dstart, dend; - - dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock); - if (dstart < 0) - return dstart; - - if (dstart == *end) { - *start = dstart; - return 0; - } - - dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock); - if (dend < 0) - return dend; - - /* race */ - BUG_ON(dstart == dend); - - *start = dstart; - *end = dend; - return 0; -} - -/* - * Scan a range of pagecache that corresponds to a file mapping hole in the - * extent btree. 
If data is found, fake up an extent key so it looks like a - * delalloc extent to the rest of the fiemap processing code. - */ -static int -bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode, - u64 start, u64 end, struct bch_fiemap_extent *cur) -{ - struct bch_fs *c = trans->c; - struct bkey_i_extent *delextent; - struct bch_extent_ptr ptr = {}; - loff_t dstart = start << 9, dend = end << 9; - int ret; - - /* - * We hold btree locks here so we cannot block on folio locks without - * dropping trans locks first. Run a nonblocking scan for the common - * case of no folios over holes and fall back on failure. - * - * Note that dropping locks like this is technically racy against - * writeback inserting to the extent tree, but a non-sync fiemap scan is - * fundamentally racy with writeback anyways. Therefore, just report the - * range as delalloc regardless of whether we have to cycle trans locks. - */ - ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true); - if (ret == -EAGAIN) - ret = drop_locks_do(trans, - bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false)); - if (ret < 0) - return ret; - - /* - * Create a fake extent key in the buffer. We have to add a dummy extent - * pointer for the fill code to add an extent entry. It's explicitly - * zeroed to reflect delayed allocation (i.e. phys offset 0). - */ - bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64)); - delextent = bkey_extent_init(cur->kbuf.k); - delextent->k.p = POS(inode->ei_inum.inum, dend >> 9); - delextent->k.size = (dend - dstart) >> 9; - bch2_bkey_append_ptr(&delextent->k_i, ptr); - - cur->flags = FIEMAP_EXTENT_DELALLOC; - - return 0; -} - -static int bch2_next_fiemap_extent(struct btree_trans *trans, - struct bch_inode_info *inode, - u64 start, u64 end, - struct bch_fiemap_extent *cur) -{ - u32 snapshot; - int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); - if (ret) - return ret; - - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inode->ei_inum.inum, start, snapshot), 0); - - struct bkey_s_c k = - bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end)); - ret = bkey_err(k); - if (ret) - goto err; - - u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end; - - ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur); - if (ret) - goto err; - - struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k); - - /* - * Does the pagecache or the btree take precedence? - * - * It _should_ be the pagecache, so that we correctly report delalloc - * extents when dirty in the pagecache (we're COW, after all). - * - * But we'd have to add per-sector writeback tracking to - * bch_folio_state, otherwise we report delalloc extents for clean - * cached data in the pagecache. - * - * We should do this, but even then fiemap won't report stable mappings: - * on bcachefs data moves around in the background (copygc, rebalance) - * and we don't provide a way for userspace to lock that out. 
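 *
 * Illustrative aside, not from this patch: all of the above surfaces to
 * userspace through the FIEMAP ioctl. With N a caller-chosen extent
 * count (everything else is the stock uapi):
 *
 *	struct fiemap *fm = calloc(1, sizeof(*fm) +
 *				   N * sizeof(struct fiemap_extent));
 *	fm->fm_start	    = 0;
 *	fm->fm_length	    = FIEMAP_MAX_OFFSET;
 *	fm->fm_extent_count = N;
 *	ioctl(fd, FS_IOC_FIEMAP, fm);
 *
 * data dirty only in the pagecache should come back as an extent with
 * fe_physical == 0 and FIEMAP_EXTENT_DELALLOC set, matching the faked-up
 * key built above.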
- */ - if (k.k && - bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)), - pagecache_start)) { - bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k); - bch2_cut_front(iter.pos, cur->kbuf.k); - bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k); - cur->flags = 0; - } else if (k.k) { - bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k); - } - - if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) { - unsigned sectors = cur->kbuf.k->k.size; - s64 offset_into_extent = 0; - enum btree_id data_btree = BTREE_ID_extents; - ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, - &cur->kbuf); - if (ret) - goto err; - - struct bkey_i *k = cur->kbuf.k; - sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent); - - bch2_cut_front(POS(k->k.p.inode, - bkey_start_offset(&k->k) + offset_into_extent), - k); - bch2_key_resize(&k->k, sectors); - k->k.p = iter.pos; - k->k.p.offset += k->k.size; - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, - u64 start, u64 len) -{ - struct bch_fs *c = vinode->i_sb->s_fs_info; - struct bch_inode_info *ei = to_bch_ei(vinode); - struct btree_trans *trans; - struct bch_fiemap_extent cur, prev; - int ret = 0; - - ret = fiemap_prep(&ei->v, info, start, &len, 0); - if (ret) - return ret; - - if (start + len < start) - return -EINVAL; - - start >>= 9; - u64 end = (start + len) >> 9; - - bch2_bkey_buf_init(&cur.kbuf); - bch2_bkey_buf_init(&prev.kbuf); - bkey_init(&prev.kbuf.k->k); - - trans = bch2_trans_get(c); - - while (start < end) { - ret = lockrestart_do(trans, - bch2_next_fiemap_extent(trans, ei, start, end, &cur)); - if (ret) - goto err; - - BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start); - BUG_ON(cur.kbuf.k->k.p.offset > end); - - if (bkey_start_offset(&cur.kbuf.k->k) == end) - break; - - start = cur.kbuf.k->k.p.offset; - - if (!bkey_deleted(&prev.kbuf.k->k)) { - bch2_trans_unlock(trans); - ret = bch2_fill_extent(c, info, &prev); - if (ret) - goto err; - } - - bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k); - prev.flags = cur.flags; - } - - if (!bkey_deleted(&prev.kbuf.k->k)) { - bch2_trans_unlock(trans); - prev.flags |= FIEMAP_EXTENT_LAST; - ret = bch2_fill_extent(c, info, &prev); - } -err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&cur.kbuf, c); - bch2_bkey_buf_exit(&prev.kbuf, c); - - return bch2_err_class(ret < 0 ? 
ret : 0); -} - -static const struct vm_operations_struct bch_vm_ops = { - .fault = bch2_page_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = bch2_page_mkwrite, -}; - -static int bch2_mmap_prepare(struct vm_area_desc *desc) -{ - file_accessed(desc->file); - - desc->vm_ops = &bch_vm_ops; - return 0; -} - -/* Directories: */ - -static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) -{ - return generic_file_llseek_size(file, offset, whence, - S64_MAX, S64_MAX); -} - -static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - - if (!dir_emit_dots(file, ctx)) - return 0; - - int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx); - - bch_err_fn(c, ret); - return bch2_err_class(ret); -} - -static int bch2_open(struct inode *vinode, struct file *file) -{ - if (file->f_flags & (O_WRONLY|O_RDWR)) { - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol); - if (ret) - return ret; - } - - file->f_mode |= FMODE_CAN_ODIRECT; - - return generic_file_open(vinode, file); -} - -/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -static const __maybe_unused unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_sync] = FS_SYNC_FL, - [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, - [__BCH_INODE_append] = FS_APPEND_FL, - [__BCH_INODE_nodump] = FS_NODUMP_FL, - [__BCH_INODE_noatime] = FS_NOATIME_FL, -}; - -/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -static const __maybe_unused unsigned bch_flags_to_xflags[] = { - [__BCH_INODE_sync] = FS_XFLAG_SYNC, - [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, - [__BCH_INODE_append] = FS_XFLAG_APPEND, - [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, - [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, -}; - -static int bch2_fileattr_get(struct dentry *dentry, - struct file_kattr *fa) -{ - struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags)); - - if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) - fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; - - if (bch2_inode_casefold(c, &inode->ei_inode)) - fa->flags |= FS_CASEFOLD_FL; - - fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ]; - return 0; -} - -struct flags_set { - unsigned mask; - unsigned flags; - unsigned projid; - bool set_project; - bool set_casefold; - bool casefold; -}; - -static int fssetxattr_inode_update_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = trans->c; - struct flags_set *s = p; - - /* - * We're relying on btree locking here for exclusion with other ioctl - * calls - use the flags in the btree (@bi), not inode->i_flags: - */ - if (!S_ISREG(bi->bi_mode) && - !S_ISDIR(bi->bi_mode) && - (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags) - return -EINVAL; - - if (s->casefold != bch2_inode_casefold(c, bi)) { - int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->casefold); - if (ret) - return ret; - } - - if (s->set_project) { - bi->bi_project = s->projid; - bi->bi_fields_set |= BIT(Inode_opt_project); - } - - bi->bi_flags &= ~s->mask; - bi->bi_flags |= s->flags; - - bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); - return 0; -} - -static int 
bch2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, - struct file_kattr *fa) -{ - struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct flags_set s = {}; - int ret; - - if (fa->fsx_valid) { - fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT; - - s.mask = map_defined(bch_flags_to_xflags); - s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags); - if (fa->fsx_xflags) - return -EOPNOTSUPP; - - if (fa->fsx_projid >= U32_MAX) - return -EINVAL; - - /* - * inode fields accessible via the xattr interface are stored with a +1 - * bias, so that 0 means unset: - */ - if ((inode->ei_inode.bi_project || - fa->fsx_projid) && - inode->ei_inode.bi_project != fa->fsx_projid + 1) { - s.projid = fa->fsx_projid + 1; - s.set_project = true; - } - } - - if (fa->flags_valid) { - s.mask = map_defined(bch_flags_to_uflags); - - s.set_casefold = true; - s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0; - fa->flags &= ~FS_CASEFOLD_FL; - - s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags); - if (fa->flags) - return -EOPNOTSUPP; - } - - mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: - (s.set_project - ? bch2_set_projid(c, inode, fa->fsx_projid) - : 0) ?: - bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, - ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - - return bch2_err_class(ret); -} - -static const struct file_operations bch_file_operations = { - .open = bch2_open, - .llseek = bch2_llseek, - .read_iter = bch2_read_iter, - .write_iter = bch2_write_iter, - .mmap_prepare = bch2_mmap_prepare, - .get_unmapped_area = thp_get_unmapped_area, - .fsync = bch2_fsync, - .splice_read = filemap_splice_read, - .splice_write = iter_file_splice_write, - .fallocate = bch2_fallocate_dispatch, - .unlocked_ioctl = bch2_fs_file_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = bch2_compat_fs_ioctl, -#endif - .remap_file_range = bch2_remap_file_range, -}; - -static const struct inode_operations bch_file_inode_operations = { - .getattr = bch2_getattr, - .setattr = bch2_setattr, - .fiemap = bch2_fiemap, - .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, -#endif - .fileattr_get = bch2_fileattr_get, - .fileattr_set = bch2_fileattr_set, -}; - -static const struct inode_operations bch_dir_inode_operations = { - .lookup = bch2_lookup, - .create = bch2_create, - .link = bch2_link, - .unlink = bch2_unlink, - .symlink = bch2_symlink, - .mkdir = bch2_mkdir, - .rmdir = bch2_unlink, - .mknod = bch2_mknod, - .rename = bch2_rename2, - .getattr = bch2_getattr, - .setattr = bch2_setattr, - .tmpfile = bch2_tmpfile, - .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, -#endif - .fileattr_get = bch2_fileattr_get, - .fileattr_set = bch2_fileattr_set, -}; - -static const struct file_operations bch_dir_file_operations = { - .llseek = bch2_dir_llseek, - .read = generic_read_dir, - .iterate_shared = bch2_vfs_readdir, - .fsync = bch2_fsync, - .unlocked_ioctl = bch2_fs_file_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = bch2_compat_fs_ioctl, -#endif -}; - -static const struct inode_operations bch_symlink_inode_operations = { - .get_link = page_get_link, - .getattr = bch2_getattr, - .setattr = bch2_setattr, - .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, -#endif - .fileattr_get = bch2_fileattr_get, - 
.fileattr_set = bch2_fileattr_set, -}; - -static const struct inode_operations bch_special_inode_operations = { - .getattr = bch2_getattr, - .setattr = bch2_setattr, - .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, -#endif - .fileattr_get = bch2_fileattr_get, - .fileattr_set = bch2_fileattr_set, -}; - -static const struct address_space_operations bch_address_space_operations = { - .read_folio = bch2_read_folio, - .writepages = bch2_writepages, - .readahead = bch2_readahead, - .dirty_folio = filemap_dirty_folio, - .write_begin = bch2_write_begin, - .write_end = bch2_write_end, - .invalidate_folio = bch2_invalidate_folio, - .release_folio = bch2_release_folio, -#ifdef CONFIG_MIGRATION - .migrate_folio = filemap_migrate_folio, -#endif - .error_remove_folio = generic_error_remove_folio, -}; - -struct bcachefs_fid { - u64 inum; - u32 subvol; - u32 gen; -} __packed; - -struct bcachefs_fid_with_parent { - struct bcachefs_fid fid; - struct bcachefs_fid dir; -} __packed; - -static int bcachefs_fid_valid(int fh_len, int fh_type) -{ - switch (fh_type) { - case FILEID_BCACHEFS_WITHOUT_PARENT: - return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); - case FILEID_BCACHEFS_WITH_PARENT: - return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); - default: - return false; - } -} - -static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) -{ - return (struct bcachefs_fid) { - .inum = inode->ei_inum.inum, - .subvol = inode->ei_inum.subvol, - .gen = inode->ei_inode.bi_generation, - }; -} - -static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, - struct inode *vdir) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_inode_info *dir = to_bch_ei(vdir); - int min_len; - - if (!S_ISDIR(inode->v.i_mode) && dir) { - struct bcachefs_fid_with_parent *fid = (void *) fh; - - min_len = sizeof(*fid) / sizeof(u32); - if (*len < min_len) { - *len = min_len; - return FILEID_INVALID; - } - - fid->fid = bch2_inode_to_fid(inode); - fid->dir = bch2_inode_to_fid(dir); - - *len = min_len; - return FILEID_BCACHEFS_WITH_PARENT; - } else { - struct bcachefs_fid *fid = (void *) fh; - - min_len = sizeof(*fid) / sizeof(u32); - if (*len < min_len) { - *len = min_len; - return FILEID_INVALID; - } - *fid = bch2_inode_to_fid(inode); - - *len = min_len; - return FILEID_BCACHEFS_WITHOUT_PARENT; - } -} - -static struct inode *bch2_nfs_get_inode(struct super_block *sb, - struct bcachefs_fid fid) -{ - struct bch_fs *c = sb->s_fs_info; - struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { - .subvol = fid.subvol, - .inum = fid.inum, - }); - if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { - iput(vinode); - vinode = ERR_PTR(-ESTALE); - } - return vinode; -} - -static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, - int fh_len, int fh_type) -{ - struct bcachefs_fid *fid = (void *) _fid; - - if (!bcachefs_fid_valid(fh_len, fh_type)) - return NULL; - - return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); -} - -static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, - int fh_len, int fh_type) -{ - struct bcachefs_fid_with_parent *fid = (void *) _fid; - - if (!bcachefs_fid_valid(fh_len, fh_type) || - fh_type != FILEID_BCACHEFS_WITH_PARENT) - return NULL; - - return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); -} - -static struct dentry *bch2_get_parent(struct dentry *child) -{ - struct bch_inode_info *inode = 
to_bch_ei(child->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - subvol_inum parent_inum = { - .subvol = inode->ei_inode.bi_parent_subvol ?: - inode->ei_inum.subvol, - .inum = inode->ei_inode.bi_dir, - }; - - return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); -} - -static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) -{ - struct bch_inode_info *inode = to_bch_ei(child->d_inode); - struct bch_inode_info *dir = to_bch_ei(parent->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct btree_iter iter1; - struct btree_iter iter2; - struct bkey_s_c k; - struct bkey_s_c_dirent d; - struct bch_inode_unpacked inode_u; - subvol_inum target; - u32 snapshot; - struct qstr dirent_name; - unsigned name_len = 0; - int ret; - - if (!S_ISDIR(dir->v.i_mode)) - return -EINVAL; - - trans = bch2_trans_get(c); - - bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents, - POS(dir->ei_inode.bi_inum, 0), 0); - bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents, - POS(dir->ei_inode.bi_inum, 0), 0); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &iter1, snapshot); - bch2_btree_iter_set_snapshot(trans, &iter2, snapshot); - - ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); - if (ret) - goto err; - - if (inode_u.bi_dir == dir->ei_inode.bi_inum) { - bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); - - k = bch2_btree_iter_peek_slot(trans, &iter1); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_dirent) { - ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); - goto err; - } - - d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); - if (ret > 0) - ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); - if (ret) - goto err; - - if (subvol_inum_eq(target, inode->ei_inum)) - goto found; - } else { - /* - * File with multiple hardlinks and our backref is to the wrong - * directory - linear search: - */ - for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) { - if (k.k->p.inode > dir->ei_inode.bi_inum) - break; - - if (k.k->type != KEY_TYPE_dirent) - continue; - - d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); - if (ret < 0) - break; - if (ret) - continue; - - if (subvol_inum_eq(target, inode->ei_inum)) - goto found; - } - } - - ret = -ENOENT; - goto err; -found: - dirent_name = bch2_dirent_get_name(d); - - name_len = min_t(unsigned, dirent_name.len, NAME_MAX); - memcpy(name, dirent_name.name, name_len); - name[name_len] = '\0'; -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_iter_exit(trans, &iter1); - bch2_trans_iter_exit(trans, &iter2); - bch2_trans_put(trans); - - return ret; -} - -static const struct export_operations bch_export_ops = { - .encode_fh = bch2_encode_fh, - .fh_to_dentry = bch2_fh_to_dentry, - .fh_to_parent = bch2_fh_to_parent, - .get_parent = bch2_get_parent, - .get_name = bch2_get_name, -}; - -static void bch2_vfs_inode_init(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - struct bch_subvolume *subvol) -{ - inode->v.i_ino = inum.inum; - inode->ei_inum = inum; - inode->ei_inode.bi_inum = inum.inum; - bch2_inode_update_after_write(trans, inode, bi, ~0); - - inode->v.i_blocks = bi->bi_sectors; - 
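/*
 * Illustrative note, not from this patch: the retry loop in
 * bch2_get_name() above is the standard bcachefs transaction idiom.
 * With do_stuff() a hypothetical stand-in, restartable work is
 * bracketed roughly as:
 *
 *	trans = bch2_trans_get(c);
 * retry:
 *	bch2_trans_begin(trans);
 *	ret = do_stuff(trans);
 *	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 *		goto retry;
 *	bch2_trans_put(trans);
 *
 * lockrestart_do(), seen in bch2_fiemap() earlier, wraps the same
 * pattern in a single expression.
 */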
inode->v.i_rdev = bi->bi_dev; - inode->v.i_generation = bi->bi_generation; - inode->v.i_size = bi->bi_size; - - inode->ei_flags = 0; - inode->ei_quota_reserved = 0; - inode->ei_qid = bch_qid(bi); - - if (BCH_SUBVOLUME_SNAP(subvol)) - set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - - inode->v.i_mapping->a_ops = &bch_address_space_operations; - - switch (inode->v.i_mode & S_IFMT) { - case S_IFREG: - inode->v.i_op = &bch_file_inode_operations; - inode->v.i_fop = &bch_file_operations; - break; - case S_IFDIR: - inode->v.i_op = &bch_dir_inode_operations; - inode->v.i_fop = &bch_dir_file_operations; - break; - case S_IFLNK: - inode_nohighmem(&inode->v); - inode->v.i_op = &bch_symlink_inode_operations; - break; - default: - init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); - inode->v.i_op = &bch_special_inode_operations; - break; - } - - mapping_set_folio_min_order(inode->v.i_mapping, - get_order(trans->c->opts.block_size)); -} - -static void bch2_free_inode(struct inode *vinode) -{ - kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode)); -} - -static int inode_update_times_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v)); - bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v)); - bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v)); - - return 0; -} - -static int bch2_vfs_write_inode(struct inode *vinode, - struct writeback_control *wbc) -{ - struct bch_fs *c = vinode->i_sb->s_fs_info; - struct bch_inode_info *inode = to_bch_ei(vinode); - int ret; - - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - - return bch2_err_class(ret); -} - -static void bch2_evict_inode(struct inode *vinode) -{ - struct bch_fs *c = vinode->i_sb->s_fs_info; - struct bch_inode_info *inode = to_bch_ei(vinode); - bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v); - - /* - * evict() has waited for outstanding writeback, we'll do no more IO - * through this inode: it's safe to remove from VFS inode hashtable here - * - * Do that now so that other threads aren't blocked from pulling it back - * in, there's no reason for them to be: - */ - if (!delete) - bch2_inode_hash_remove(c, inode); - - truncate_inode_pages_final(&inode->v.i_data); - - clear_inode(&inode->v); - - BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); - - if (delete) { - bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), - KEY_TYPE_QUOTA_WARN); - bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, - KEY_TYPE_QUOTA_WARN); - int ret = bch2_inode_rm(c, inode_inum(inode)); - if (ret && !bch2_err_matches(ret, EROFS)) { - bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu", - inode->ei_inum.subvol, - inode->ei_inum.inum); - bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm); - } - - /* - * If we are deleting, we need it present in the vfs hash table - * so that fsck can check if unlinked inodes are still open: - */ - bch2_inode_hash_remove(c, inode); - } - - mutex_lock(&c->vfs_inodes_lock); - list_del_init(&inode->ei_vfs_inode_list); - mutex_unlock(&c->vfs_inodes_lock); -} - -void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) -{ - struct bch_inode_info *inode; - DARRAY(struct bch_inode_info *) grabbed; - bool clean_pass = false, this_pass_clean; - 
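/*
 * Illustrative note, not from this patch: DARRAY() declares a small
 * resizable vector, used below for the "grabbed" list. The pattern, with
 * T and use() as hypothetical placeholders:
 *
 *	DARRAY(T) v;
 *	darray_init(&v);
 *	ret = darray_push(&v, elem);	(nonzero on allocation failure)
 *	darray_for_each(v, i)		(i is a T * cursor)
 *		use(*i);
 *	darray_exit(&v);
 */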
- /* - * Initially, we scan for inodes without I_DONTCACHE, then mark them to - * be pruned with d_mark_dontcache(). - * - * Once we've had a clean pass where we didn't find any inodes without - * I_DONTCACHE, we wait for them to be freed: - */ - - darray_init(&grabbed); - darray_make_room(&grabbed, 1024); -again: - cond_resched(); - this_pass_clean = true; - - mutex_lock(&c->vfs_inodes_lock); - list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { - if (!snapshot_list_has_id(s, inode->ei_inum.subvol)) - continue; - - if (!(inode->v.i_state & I_DONTCACHE) && - !(inode->v.i_state & I_FREEING) && - igrab(&inode->v)) { - this_pass_clean = false; - - if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) { - iput(&inode->v); - break; - } - } else if (clean_pass && this_pass_clean) { - struct wait_bit_queue_entry wqe; - struct wait_queue_head *wq_head; - - wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW); - prepare_to_wait_event(wq_head, &wqe.wq_entry, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&c->vfs_inodes_lock); - - schedule(); - finish_wait(wq_head, &wqe.wq_entry); - goto again; - } - } - mutex_unlock(&c->vfs_inodes_lock); - - darray_for_each(grabbed, i) { - inode = *i; - d_mark_dontcache(&inode->v); - d_prune_aliases(&inode->v); - iput(&inode->v); - } - grabbed.nr = 0; - - if (!clean_pass || !this_pass_clean) { - clean_pass = this_pass_clean; - goto again; - } - - darray_exit(&grabbed); -} - -static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_sb; - struct bch_fs *c = sb->s_fs_info; - struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); - unsigned shift = sb->s_blocksize_bits - 9; - /* - * this assumes inodes take up 64 bytes, which is a decent average - * number: - */ - u64 avail_inodes = ((usage.capacity - usage.used) << 3); - - buf->f_type = BCACHEFS_STATFS_MAGIC; - buf->f_bsize = sb->s_blocksize; - buf->f_blocks = usage.capacity >> shift; - buf->f_bfree = usage.free >> shift; - buf->f_bavail = avail_factor(usage.free) >> shift; - - buf->f_files = usage.nr_inodes + avail_inodes; - buf->f_ffree = avail_inodes; - - buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); - buf->f_namelen = BCH_NAME_MAX; - - return 0; -} - -static int bch2_sync_fs(struct super_block *sb, int wait) -{ - struct bch_fs *c = sb->s_fs_info; - int ret; - - trace_bch2_sync_fs(sb, wait); - - if (c->opts.journal_flush_disabled) - return 0; - - if (!wait) { - bch2_journal_flush_async(&c->journal, NULL); - return 0; - } - - ret = bch2_journal_flush(&c->journal); - return bch2_err_class(ret); -} - -static struct bch_fs *bch2_path_to_fs(const char *path) -{ - struct bch_fs *c; - dev_t dev; - int ret; - - ret = lookup_bdev(path, &dev); - if (ret) - return ERR_PTR(ret); - - c = bch2_dev_to_fs(dev); - if (c) - closure_put(&c->cl); - return c ?: ERR_PTR(-ENOENT); -} - -static int bch2_show_devname(struct seq_file *seq, struct dentry *root) -{ - struct bch_fs *c = root->d_sb->s_fs_info; - bool first = true; - - guard(rcu)(); - for_each_online_member_rcu(c, ca) { - if (!first) - seq_putc(seq, ':'); - first = false; - seq_puts(seq, ca->disk_sb.sb_name); - } - - return 0; -} - -static int bch2_show_options(struct seq_file *seq, struct dentry *root) -{ - struct bch_fs *c = root->d_sb->s_fs_info; - struct printbuf buf = PRINTBUF; - - bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, - OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); - printbuf_nul_terminate(&buf); - seq_printf(seq, ",%s", buf.buf); - - int ret = buf.allocation_failure ? 
-ENOMEM : 0; - printbuf_exit(&buf); - return ret; -} - -static void bch2_put_super(struct super_block *sb) -{ - struct bch_fs *c = sb->s_fs_info; - - __bch2_fs_stop(c); -} - -/* - * bcachefs doesn't currently integrate intwrite freeze protection but the - * internal write references serve the same purpose. Therefore reuse the - * read-only transition code to perform the quiesce. The caveat is that we don't - * currently have the ability to block tasks that want a write reference while - * the superblock is frozen. This is fine for now, but we should either add - * blocking support or find a way to integrate sb_start_intwrite() and friends. - */ -static int bch2_freeze(struct super_block *sb) -{ - struct bch_fs *c = sb->s_fs_info; - - down_write(&c->state_lock); - bch2_fs_read_only(c); - up_write(&c->state_lock); - return 0; -} - -static int bch2_unfreeze(struct super_block *sb) -{ - struct bch_fs *c = sb->s_fs_info; - int ret; - - if (test_bit(BCH_FS_emergency_ro, &c->flags)) - return 0; - - down_write(&c->state_lock); - ret = bch2_fs_read_write(c); - up_write(&c->state_lock); - return ret; -} - -static const struct super_operations bch_super_operations = { - .alloc_inode = bch2_alloc_inode, - .free_inode = bch2_free_inode, - .write_inode = bch2_vfs_write_inode, - .evict_inode = bch2_evict_inode, - .sync_fs = bch2_sync_fs, - .statfs = bch2_statfs, - .show_devname = bch2_show_devname, - .show_options = bch2_show_options, - .put_super = bch2_put_super, - .freeze_fs = bch2_freeze, - .unfreeze_fs = bch2_unfreeze, -}; - -static int bch2_set_super(struct super_block *s, void *data) -{ - s->s_fs_info = data; - return 0; -} - -static int bch2_noset_super(struct super_block *s, void *data) -{ - return -EBUSY; -} - -typedef DARRAY(struct bch_fs *) darray_fs; - -static int bch2_test_super(struct super_block *s, void *data) -{ - struct bch_fs *c = s->s_fs_info; - darray_fs *d = data; - - if (!c) - return false; - - darray_for_each(*d, i) - if (c != *i) - return false; - return true; -} - -static int bch2_fs_get_tree(struct fs_context *fc) -{ - struct bch_fs *c; - struct super_block *sb; - struct inode *vinode; - struct bch2_opts_parse *opts_parse = fc->fs_private; - struct bch_opts opts = opts_parse->opts; - darray_const_str devs; - darray_fs devs_to_fs = {}; - int ret; - - opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); - opt_set(opts, nostart, true); - - if (!fc->source || strlen(fc->source) == 0) - return -EINVAL; - - ret = bch2_split_devs(fc->source, &devs); - if (ret) - return ret; - - darray_for_each(devs, i) { - ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i)); - if (ret) - goto err; - } - - sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs); - if (!IS_ERR(sb)) - goto got_sb; - - c = bch2_fs_open(&devs, &opts); - ret = PTR_ERR_OR_ZERO(c); - if (ret) - goto err; - - if (opt_defined(opts, discard)) - set_bit(BCH_FS_discard_mount_opt_set, &c->flags); - - /* Some options can't be parsed until after the fs is started: */ - opts = bch2_opts_empty(); - ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false); - if (ret) - goto err_stop_fs; - - bch2_opts_apply(&c->opts, opts); - - ret = bch2_fs_start(c); - if (ret) - goto err_stop_fs; - - /* - * We might be doing a RO mount because other options required it, or we - * have no alloc info and it's a small image with no room to regenerate - * it - */ - if (c->opts.read_only) - fc->sb_flags |= SB_RDONLY; - - sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, 
c); - ret = PTR_ERR_OR_ZERO(sb); - if (ret) - goto err_stop_fs; -got_sb: - c = sb->s_fs_info; - - if (sb->s_root) { - if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) { - ret = -EBUSY; - goto err_put_super; - } - goto out; - } - - sb->s_blocksize = block_bytes(c); - sb->s_blocksize_bits = ilog2(block_bytes(c)); - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_op = &bch_super_operations; - sb->s_export_op = &bch_export_ops; -#ifdef CONFIG_BCACHEFS_QUOTA - sb->s_qcop = &bch2_quotactl_operations; - sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; -#endif - sb->s_xattr = bch2_xattr_handlers; - sb->s_magic = BCACHEFS_STATFS_MAGIC; - sb->s_time_gran = c->sb.nsec_per_time_unit; - sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; - sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); - super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); - - if (c->sb.multi_device) - super_set_sysfs_name_uuid(sb); - else - strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name)); - - sb->s_shrink->seeks = 0; - c->vfs_sb = sb; - strscpy(sb->s_id, c->name, sizeof(sb->s_id)); - - ret = super_setup_bdi(sb); - if (ret) - goto err_put_super; - - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - - scoped_guard(rcu) { - for_each_online_member_rcu(c, ca) { - struct block_device *bdev = ca->disk_sb.bdev; - - /* XXX: create an anonymous device for multi device filesystems */ - sb->s_bdev = bdev; - sb->s_dev = bdev->bd_dev; - break; - } - } - - c->dev = sb->s_dev; - -#ifdef CONFIG_BCACHEFS_POSIX_ACL - if (c->opts.acl) - sb->s_flags |= SB_POSIXACL; -#endif - - sb->s_shrink->seeks = 0; - -#ifdef CONFIG_UNICODE - if (bch2_fs_casefold_enabled(c)) - sb->s_encoding = c->cf_encoding; - generic_set_sb_d_ops(sb); -#endif - - vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); - ret = PTR_ERR_OR_ZERO(vinode); - bch_err_msg(c, ret, "mounting: error getting root inode"); - if (ret) - goto err_put_super; - - sb->s_root = d_make_root(vinode); - if (!sb->s_root) { - bch_err(c, "error mounting: error allocating root dentry"); - ret = -ENOMEM; - goto err_put_super; - } - - sb->s_flags |= SB_ACTIVE; -out: - fc->root = dget(sb->s_root); -err: - darray_exit(&devs_to_fs); - bch2_darray_str_exit(&devs); - if (ret) - pr_err("error: %s", bch2_err_str(ret)); - /* - * On an inconsistency error in recovery we might see an -EROFS derived - * errorcode (from the journal), but we don't want to return that to - * userspace as that causes util-linux to retry the mount RO - which is - * confusing: - */ - if (bch2_err_matches(ret, EROFS) && ret != -EROFS) - ret = -EIO; - return bch2_err_class(ret); - -err_stop_fs: - bch2_fs_stop(c); - goto err; - -err_put_super: - if (!sb->s_root) - __bch2_fs_stop(c); - deactivate_locked_super(sb); - goto err; -} - -static void bch2_kill_sb(struct super_block *sb) -{ - struct bch_fs *c = sb->s_fs_info; - - generic_shutdown_super(sb); - bch2_fs_free(c); -} - -static void bch2_fs_context_free(struct fs_context *fc) -{ - struct bch2_opts_parse *opts = fc->fs_private; - - if (opts) { - printbuf_exit(&opts->parse_later); - kfree(opts); - } -} - -static int bch2_fs_parse_param(struct fs_context *fc, - struct fs_parameter *param) -{ - /* - * the "source" param, i.e., the name of the device(s) to mount, - * is handled by the VFS layer. 
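 *
 * Illustrative aside, not from this patch: for a multi-device mount such
 * as
 *
 *	mount -t bcachefs /dev/sda:/dev/sdb /mnt
 *
 * the colon-separated "/dev/sda:/dev/sdb" string is what arrives in
 * fc->source, and bch2_fs_get_tree() above splits it apart with
 * bch2_split_devs().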
- */ - if (!strcmp(param->key, "source")) - return -ENOPARAM; - - struct bch2_opts_parse *opts = fc->fs_private; - struct bch_fs *c = NULL; - - /* for reconfigure, we already have a struct bch_fs */ - if (fc->root) - c = fc->root->d_sb->s_fs_info; - - int ret = bch2_parse_one_mount_opt(c, &opts->opts, - &opts->parse_later, param->key, - param->string); - if (ret) - pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret)); - - return bch2_err_class(ret); -} - -static int bch2_fs_reconfigure(struct fs_context *fc) -{ - struct super_block *sb = fc->root->d_sb; - struct bch2_opts_parse *opts = fc->fs_private; - struct bch_fs *c = sb->s_fs_info; - int ret = 0; - - opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); - - if (opts->opts.read_only != c->opts.read_only) { - down_write(&c->state_lock); - - if (opts->opts.read_only) { - bch2_fs_read_only(c); - - sb->s_flags |= SB_RDONLY; - } else { - ret = bch2_fs_read_write(c); - if (ret) { - bch_err(c, "error going rw: %i", ret); - up_write(&c->state_lock); - ret = -EINVAL; - goto err; - } - - sb->s_flags &= ~SB_RDONLY; - } - - c->opts.read_only = opts->opts.read_only; - - up_write(&c->state_lock); - } - - if (opt_defined(opts->opts, errors)) - c->opts.errors = opts->opts.errors; -err: - return bch2_err_class(ret); -} - -static const struct fs_context_operations bch2_context_ops = { - .free = bch2_fs_context_free, - .parse_param = bch2_fs_parse_param, - .get_tree = bch2_fs_get_tree, - .reconfigure = bch2_fs_reconfigure, -}; - -static int bch2_init_fs_context(struct fs_context *fc) -{ - struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL); - - if (!opts) - return -ENOMEM; - - opts->parse_later = PRINTBUF; - - fc->ops = &bch2_context_ops; - fc->fs_private = opts; - - return 0; -} - -void bch2_fs_vfs_exit(struct bch_fs *c) -{ - if (c->vfs_inodes_by_inum_table.ht.tbl) - rhltable_destroy(&c->vfs_inodes_by_inum_table); - if (c->vfs_inodes_table.tbl) - rhashtable_destroy(&c->vfs_inodes_table); -} - -int bch2_fs_vfs_init(struct bch_fs *c) -{ - return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?: - rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params); -} - -static struct file_system_type bcache_fs_type = { - .owner = THIS_MODULE, - .name = "bcachefs", - .init_fs_context = bch2_init_fs_context, - .kill_sb = bch2_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS, -}; - -MODULE_ALIAS_FS("bcachefs"); - -void bch2_vfs_exit(void) -{ - unregister_filesystem(&bcache_fs_type); - kmem_cache_destroy(bch2_inode_cache); -} - -int __init bch2_vfs_init(void) -{ - int ret = -ENOMEM; - - bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | - SLAB_ACCOUNT); - if (!bch2_inode_cache) - goto err; - - ret = register_filesystem(&bcache_fs_type); - if (ret) - goto err; - - return 0; -err: - bch2_vfs_exit(); - return ret; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h deleted file mode 100644 index dd2198541455..000000000000 --- a/fs/bcachefs/fs.h +++ /dev/null @@ -1,215 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_H -#define _BCACHEFS_FS_H - -#include "inode.h" -#include "opts.h" -#include "str_hash.h" -#include "quota_types.h" -#include "two_state_shared_lock.h" - -#include <linux/seqlock.h> -#include <linux/stat.h> - -struct bch_inode_info { - struct inode v; - struct rhash_head hash; - struct rhlist_head by_inum_hash; - subvol_inum ei_inum; - - struct list_head ei_vfs_inode_list; - unsigned long ei_flags; - 
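	/*
	 * Illustrative note, not from this patch: ei_flags holds the
	 * EI_INODE_* bits defined further down and is manipulated with
	 * the usual atomic bitops, e.g.:
	 *
	 *	set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
	 *	if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
	 *		...skip quota accounting...
	 */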
- struct mutex ei_update_lock; - u64 ei_quota_reserved; - unsigned long ei_last_dirtied; - two_state_lock_t ei_pagecache_lock; - - struct mutex ei_quota_lock; - struct bch_qid ei_qid; - - /* - * When we've been doing nocow writes we'll need to issue flushes to the - * underlying block devices - * - * XXX: a device may have had a flush issued by some other codepath. It - * would be better to keep for each device a sequence number that's - * incremented when we issue a cache flush, and track here the sequence - * number that needs flushing. - */ - struct bch_devs_mask ei_devs_need_flush; - - /* copy of inode in btree: */ - struct bch_inode_unpacked ei_inode; -}; - -#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0) -#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0) -#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0) - -#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1) -#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1) - -static inline subvol_inum inode_inum(struct bch_inode_info *inode) -{ - return inode->ei_inum; -} - -/* - * Set if we've gotten a btree error for this inode, and thus the vfs inode and - * btree inode may be inconsistent: - */ -#define EI_INODE_ERROR 0 - -/* - * Set if the inode is in a snapshot subvolume - we don't do quota accounting in - * those: - */ -#define EI_INODE_SNAPSHOT 1 -#define EI_INODE_HASHED 2 - -#define to_bch_ei(_inode) \ - container_of_or_null(_inode, struct bch_inode_info, v) - -static inline int ptrcmp(void *l, void *r) -{ - return cmp_int(l, r); -} - -enum bch_inode_lock_op { - INODE_PAGECACHE_BLOCK = (1U << 0), - INODE_UPDATE_LOCK = (1U << 1), -}; - -#define bch2_lock_inodes(_locks, ...) \ -do { \ - struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ - unsigned i; \ - \ - bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ - \ - for (i = 1; i < ARRAY_SIZE(a); i++) \ - if (a[i] != a[i - 1]) { \ - if ((_locks) & INODE_PAGECACHE_BLOCK) \ - bch2_pagecache_block_get(a[i]);\ - if ((_locks) & INODE_UPDATE_LOCK) \ - mutex_lock_nested(&a[i]->ei_update_lock, i);\ - } \ -} while (0) - -#define bch2_unlock_inodes(_locks, ...) 
\ -do { \ - struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ - unsigned i; \ - \ - bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ - \ - for (i = 1; i < ARRAY_SIZE(a); i++) \ - if (a[i] != a[i - 1]) { \ - if ((_locks) & INODE_PAGECACHE_BLOCK) \ - bch2_pagecache_block_put(a[i]);\ - if ((_locks) & INODE_UPDATE_LOCK) \ - mutex_unlock(&a[i]->ei_update_lock); \ - } \ -} while (0) - -static inline struct bch_inode_info *file_bch_inode(struct file *file) -{ - return to_bch_ei(file_inode(file)); -} - -static inline bool inode_attr_changing(struct bch_inode_info *dir, - struct bch_inode_info *inode, - enum inode_opt_id id) -{ - return !(inode->ei_inode.bi_fields_set & (1 << id)) && - bch2_inode_opt_get(&dir->ei_inode, id) != - bch2_inode_opt_get(&inode->ei_inode, id); -} - -static inline bool inode_attrs_changing(struct bch_inode_info *dir, - struct bch_inode_info *inode) -{ - unsigned id; - - for (id = 0; id < Inode_opt_nr; id++) - if (inode_attr_changing(dir, inode, id)) - return true; - - return false; -} - -struct bch_inode_unpacked; - -#ifndef NO_BCACHEFS_FS - -struct bch_inode_info * -__bch2_create(struct mnt_idmap *, struct bch_inode_info *, - struct dentry *, umode_t, dev_t, subvol_inum, unsigned); - -int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p); - -int bch2_fs_quota_transfer(struct bch_fs *, - struct bch_inode_info *, - struct bch_qid, - unsigned, - enum quota_acct_mode); - -static inline int bch2_set_projid(struct bch_fs *c, - struct bch_inode_info *inode, - u32 projid) -{ - struct bch_qid qid = inode->ei_qid; - - qid.q[QTYP_PRJ] = projid; - - return bch2_fs_quota_transfer(c, inode, qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); -} - -struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); - -/* returns 0 if we want to do the update, or error is passed up */ -typedef int (*inode_set_fn)(struct btree_trans *, - struct bch_inode_info *, - struct bch_inode_unpacked *, void *); - -void bch2_inode_update_after_write(struct btree_trans *, - struct bch_inode_info *, - struct bch_inode_unpacked *, - unsigned); -int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, - inode_set_fn, void *, unsigned); - -int bch2_setattr_nonsize(struct mnt_idmap *, - struct bch_inode_info *, - struct iattr *); -int __bch2_unlink(struct inode *, struct dentry *, bool); - -void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); - -void bch2_fs_vfs_exit(struct bch_fs *); -int bch2_fs_vfs_init(struct bch_fs *); - -void bch2_vfs_exit(void); -int bch2_vfs_init(void); - -#else - -#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) - -static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; } - -static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, - snapshot_id_list *s) {} - -static inline void bch2_fs_vfs_exit(struct bch_fs *c) {} -static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; } - -static inline void bch2_vfs_exit(void) {} -static inline int bch2_vfs_init(void) { return 0; } - -#endif /* NO_BCACHEFS_FS */ - -#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c deleted file mode 100644 index 15c1e890d299..000000000000 --- a/fs/bcachefs/fsck.c +++ /dev/null @@ -1,3363 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bcachefs_ioctl.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_update.h" -#include "buckets.h" -#include "darray.h" 
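/*
 * Illustrative note, not from this patch: inode_attr_changing() in fs.h
 * above encodes bcachefs option inheritance - bi_fields_set records
 * which options an inode has set explicitly, and anything else falls
 * back to the parent directory. Spelled out for one option id:
 *
 *	if (!(inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)))
 *		opt = bch2_inode_opt_get(&dir->ei_inode, Inode_opt_project);
 */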
-#include "dirent.h" -#include "error.h" -#include "fs.h" -#include "fsck.h" -#include "inode.h" -#include "io_misc.h" -#include "keylist.h" -#include "namei.h" -#include "recovery_passes.h" -#include "snapshot.h" -#include "super.h" -#include "thread_with_file.h" -#include "xattr.h" - -#include <linux/bsearch.h> -#include <linux/dcache.h> /* struct qstr */ - -static int dirent_points_to_inode_nowarn(struct bch_fs *c, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) -{ - if (d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol - : le64_to_cpu(d.v->d_inum) == inode->bi_inum) - return 0; - return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); -} - -static void dirent_inode_mismatch_msg(struct printbuf *out, - struct bch_fs *c, - struct bkey_s_c_dirent dirent, - struct bch_inode_unpacked *inode) -{ - prt_str(out, "inode points to dirent that does not point back:"); - prt_newline(out); - bch2_bkey_val_to_text(out, c, dirent.s_c); - prt_newline(out); - bch2_inode_unpacked_to_text(out, inode); -} - -static int dirent_points_to_inode(struct bch_fs *c, - struct bkey_s_c_dirent dirent, - struct bch_inode_unpacked *inode) -{ - int ret = dirent_points_to_inode_nowarn(c, dirent, inode); - if (ret) { - struct printbuf buf = PRINTBUF; - dirent_inode_mismatch_msg(&buf, c, dirent, inode); - bch_warn(c, "%s", buf.buf); - printbuf_exit(&buf); - } - return ret; -} - -/* - * XXX: this is handling transaction restarts without returning - * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: - */ -static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, - u32 snapshot) -{ - u64 sectors = 0; - - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(inum, 0, snapshot), - POS(inum, U64_MAX), - 0, k, ({ - if (bkey_extent_is_allocation(k.k)) - sectors += k.k->size; - 0; - })); - - return ret ?: sectors; -} - -static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, - u32 snapshot) -{ - u64 subdirs = 0; - - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - POS(inum, U64_MAX), - 0, k, ({ - if (k.k->type == KEY_TYPE_dirent && - bkey_s_c_to_dirent(k).v->d_type == DT_DIR) - subdirs++; - 0; - })); - - return ret ?: subdirs; -} - -static int subvol_lookup(struct btree_trans *trans, u32 subvol, - u32 *snapshot, u64 *inum) -{ - struct bch_subvolume s; - int ret = bch2_subvolume_get(trans, subvol, false, &s); - - *snapshot = le32_to_cpu(s.snapshot); - *inum = le64_to_cpu(s.inode); - return ret; -} - -static int lookup_dirent_in_snapshot(struct btree_trans *trans, - struct bch_hash_info hash_info, - subvol_inum dir, struct qstr *name, - u64 *target, unsigned *type, u32 snapshot) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, - &hash_info, dir, name, 0, snapshot); - int ret = bkey_err(k); - if (ret) - return ret; - - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - *target = le64_to_cpu(d.v->d_inum); - *type = d.v->d_type; - bch2_trans_iter_exit(trans, &iter); - return 0; -} - -/* - * Find any subvolume associated with a tree of snapshots - * We can't rely on master_subvol - it might have been deleted. 
- */ -static int find_snapshot_tree_subvol(struct btree_trans *trans, - u32 tree_id, u32 *subvol) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_snapshot) - continue; - - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - if (le32_to_cpu(s.v->tree) != tree_id) - continue; - - if (s.v->subvol) { - *subvol = le32_to_cpu(s.v->subvol); - goto found; - } - } - ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol); -found: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* Get lost+found, create if it doesn't exist: */ -static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - struct bch_inode_unpacked *lostfound, - u64 reattaching_inum) -{ - struct bch_fs *c = trans->c; - struct qstr lostfound_str = QSTR("lost+found"); - struct btree_iter lostfound_iter = {}; - u64 inum = 0; - unsigned d_type = 0; - int ret; - - struct bch_snapshot_tree st; - ret = bch2_snapshot_tree_lookup(trans, - bch2_snapshot_tree(c, snapshot), &st); - if (ret) - return ret; - - u32 subvolid; - ret = find_snapshot_tree_subvol(trans, - bch2_snapshot_tree(c, snapshot), &subvolid); - bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u", - bch2_snapshot_tree(c, snapshot)); - if (ret) - return ret; - - struct bch_subvolume subvol; - ret = bch2_subvolume_get(trans, subvolid, false, &subvol); - bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot); - if (ret) - return ret; - - if (!subvol.inode) { - struct btree_iter iter; - struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, subvolid), - 0, subvolume); - ret = PTR_ERR_OR_ZERO(subvol); - if (ret) - return ret; - - subvol->v.inode = cpu_to_le64(reattaching_inum); - bch2_trans_iter_exit(trans, &iter); - } - - subvol_inum root_inum = { - .subvol = subvolid, - .inum = le64_to_cpu(subvol.inode) - }; - - struct bch_inode_unpacked root_inode; - struct bch_hash_info root_hash_info; - ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0); - bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", - root_inum.inum, subvolid); - if (ret) - return ret; - - root_hash_info = bch2_hash_info_init(c, &root_inode); - - ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, - &lostfound_str, &inum, &d_type, snapshot); - if (bch2_err_matches(ret, ENOENT)) - goto create_lostfound; - - bch_err_fn(c, ret); - if (ret) - return ret; - - if (d_type != DT_DIR) { - bch_err(c, "error looking up lost+found: not a directory"); - return bch_err_throw(c, ENOENT_not_directory); - } - - /* - * The bch2_check_dirents pass has already run, dangling dirents - * shouldn't exist here: - */ - ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0); - bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", - inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); - return ret; - -create_lostfound: - /* - * we always create lost+found in the root snapshot; we don't want - * different branches of the snapshot tree to have different lost+found - */ - snapshot = le32_to_cpu(st.root_snapshot); - /* - * XXX: we could have a nicer log message here if we had a nice way to - * walk backpointers to print a path - */ - struct printbuf path = PRINTBUF; - ret = bch2_inum_to_path(trans, root_inum, &path); - if (ret) - goto err; - - bch_notice(c, "creating 
%s/lost+found in subvol %llu snapshot %u", - path.buf, root_inum.subvol, snapshot); - printbuf_exit(&path); - - u64 now = bch2_current_time(c); - u64 cpu = raw_smp_processor_id(); - - bch2_inode_init_early(c, lostfound); - bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); - lostfound->bi_dir = root_inode.bi_inum; - lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot); - - root_inode.bi_nlink++; - - ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot); - ret = bch2_btree_iter_traverse(trans, &lostfound_iter); - if (ret) - goto err; - - ret = bch2_dirent_create_snapshot(trans, - 0, root_inode.bi_inum, snapshot, &root_hash_info, - mode_to_type(lostfound->bi_mode), - &lostfound_str, - lostfound->bi_inum, - &lostfound->bi_dir_offset, - BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create) ?: - bch2_inode_write_flags(trans, &lostfound_iter, lostfound, - BTREE_UPDATE_internal_snapshot_node); -err: - bch_err_msg(c, ret, "creating lost+found"); - bch2_trans_iter_exit(trans, &lostfound_iter); - return ret; -} - -static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) -{ - if (inode->bi_inum == BCACHEFS_ROOT_INO && - inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) - return false; - - /* - * Subvolume roots are special: older versions of subvolume roots may be - * disconnected; it's only the newest version that matters. - * - * We only keep a single dirent pointing to a subvolume root, i.e. - * older versions of snapshots will not have a different dirent pointing - * to the same subvolume root. - * - * This is because dirents that point to subvolumes are only visible in - * the parent subvolume - versioning is not needed - and keeping them - * around would break fsck, because when we're crossing subvolumes we - * don't have a consistent snapshot ID to check the inode <-> dirent - * relationships. - * - * Thus, a subvolume root that's been renamed after a snapshot will have - * a disconnected older version - that's expected. - * - * Note that taking a snapshot always updates the root inode (to update - * the dirent backpointer), so a subvolume root inode with - * BCH_INODE_has_child_snapshot is never visible. 
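 *
 * Illustrative aside, not from this patch: concretely, snapshot a
 * subvolume and then rename it - the snapshot still sees the root
 * inode's old backpointer, so the older version now looks disconnected.
 * That is the expected case the check below preserves, not something to
 * reattach to lost+found.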
- */ - if (inode->bi_subvol && - (inode->bi_flags & BCH_INODE_has_child_snapshot)) - return false; - - return !bch2_inode_has_backpointer(inode) && - !(inode->bi_flags & BCH_INODE_unlinked); -} - -static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents, - SPOS(d_pos.inode, d_pos.offset, snapshot), - BTREE_ITER_intent| - BTREE_ITER_with_updates); - int ret = bkey_err(k); - if (ret) - return ret; - - if (bpos_eq(k.k->p, d_pos)) { - /* - * delete_at() doesn't work because the update path doesn't - * internally use BTREE_ITER_with_updates yet - */ - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); - ret = PTR_ERR_OR_ZERO(k); - if (ret) - goto err; - - bkey_init(&k->k); - k->k.type = KEY_TYPE_whiteout; - k->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) -{ - struct bch_fs *c = trans->c; - struct bch_inode_unpacked lostfound; - char name_buf[20]; - int ret; - - u32 dirent_snapshot = inode->bi_snapshot; - if (inode->bi_subvol) { - inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; - - struct btree_iter subvol_iter; - struct bkey_i_subvolume *subvol = - bch2_bkey_get_mut_typed(trans, &subvol_iter, - BTREE_ID_subvolumes, POS(0, inode->bi_subvol), - 0, subvolume); - ret = PTR_ERR_OR_ZERO(subvol); - if (ret) - return ret; - - subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL; - bch2_trans_iter_exit(trans, &subvol_iter); - - u64 root_inum; - ret = subvol_lookup(trans, inode->bi_parent_subvol, - &dirent_snapshot, &root_inum); - if (ret) - return ret; - - snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol); - } else { - snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); - } - - ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum); - if (ret) - return ret; - - bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum); - - lostfound.bi_nlink += S_ISDIR(inode->bi_mode); - - /* ensure lost+found inode is also present in inode snapshot */ - if (!inode->bi_subvol) { - BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot)); - lostfound.bi_snapshot = inode->bi_snapshot; - } - - ret = __bch2_fsck_write_inode(trans, &lostfound); - if (ret) - return ret; - - struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); - struct qstr name = QSTR(name_buf); - - inode->bi_dir = lostfound.bi_inum; - - ret = bch2_dirent_create_snapshot(trans, - inode->bi_parent_subvol, lostfound.bi_inum, - dirent_snapshot, - &dir_hash, - inode_d_type(inode), - &name, - inode->bi_subvol ?: inode->bi_inum, - &inode->bi_dir_offset, - BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create); - if (ret) { - bch_err_msg(c, ret, "error creating dirent"); - return ret; - } - - ret = __bch2_fsck_write_inode(trans, inode); - if (ret) - return ret; - - { - CLASS(printbuf, buf)(); - ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, - inode->bi_snapshot, NULL, &buf); - if (ret) - return ret; - - bch_info(c, "reattached at %s", buf.buf); - } - - /* - * Fix up inodes in child snapshots: if they should also be reattached, - * update the backpointer field; if they should not be, we need to emit - * whiteouts for the dirent we just created. 
- */ - if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) { - snapshot_id_list whiteouts_done; - struct btree_iter iter; - struct bkey_s_c k; - - darray_init(&whiteouts_done); - - for_each_btree_key_reverse_norestart(trans, iter, - BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1), - BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) { - if (k.k->p.offset != inode->bi_inum) - break; - - if (!bkey_is_inode(k.k) || - !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) || - snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot)) - continue; - - struct bch_inode_unpacked child_inode; - ret = bch2_inode_unpack(k, &child_inode); - if (ret) - break; - - if (!inode_should_reattach(&child_inode)) { - ret = maybe_delete_dirent(trans, - SPOS(lostfound.bi_inum, inode->bi_dir_offset, - dirent_snapshot), - k.k->p.snapshot); - if (ret) - break; - - ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); - if (ret) - break; - } else { - iter.snapshot = k.k->p.snapshot; - child_inode.bi_dir = inode->bi_dir; - child_inode.bi_dir_offset = inode->bi_dir_offset; - - ret = bch2_inode_write_flags(trans, &iter, &child_inode, - BTREE_UPDATE_internal_snapshot_node); - if (ret) - break; - } - } - darray_exit(&whiteouts_done); - bch2_trans_iter_exit(trans, &iter); - } - - return ret; -} - -static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) -{ - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); -} - -static int remove_backpointer(struct btree_trans *trans, - struct bch_inode_unpacked *inode) -{ - if (!bch2_inode_has_backpointer(inode)) - return 0; - - u32 snapshot = inode->bi_snapshot; - - if (inode->bi_parent_subvol) { - int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot); - if (ret) - return ret; - } - - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); - int ret = bkey_err(d) ?: - dirent_points_to_inode(c, d, inode) ?: - bch2_fsck_remove_dirent(trans, d.k->p); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s) -{ - struct bch_fs *c = trans->c; - - struct bch_inode_unpacked inode; - int ret = bch2_inode_find_by_inum_trans(trans, - (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, - &inode); - if (ret) - return ret; - - ret = remove_backpointer(trans, &inode); - if (!bch2_err_matches(ret, ENOENT)) - bch_err_msg(c, ret, "removing dirent"); - if (ret) - return ret; - - ret = reattach_inode(trans, &inode); - bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); - return ret; -} - -static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum) -{ - struct bch_fs *c = trans->c; - - if (!bch2_snapshot_is_leaf(c, snapshotid)) { - bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); - return bch_err_throw(c, fsck_repair_unimplemented); - } - - /* - * If inum isn't set, that means we're being called from check_dirents, - * not check_inodes - the root of this subvolume doesn't exist or we - * would have found it there: - */ - if (!inum) { - struct btree_iter inode_iter = {}; - struct bch_inode_unpacked new_inode; - u64 cpu = raw_smp_processor_id(); - - bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 
0, S_IFDIR|0755, 0, NULL); - - new_inode.bi_subvol = subvolid; - - int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: - bch2_btree_iter_traverse(trans, &inode_iter) ?: - bch2_inode_write(trans, &inode_iter, &new_inode); - bch2_trans_iter_exit(trans, &inode_iter); - if (ret) - return ret; - - inum = new_inode.bi_inum; - } - - bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum); - - struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); - int ret = PTR_ERR_OR_ZERO(new_subvol); - if (ret) - return ret; - - bkey_subvolume_init(&new_subvol->k_i); - new_subvol->k.p.offset = subvolid; - new_subvol->v.snapshot = cpu_to_le32(snapshotid); - new_subvol->v.inode = cpu_to_le64(inum); - ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0); - if (ret) - return ret; - - struct btree_iter iter; - struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, snapshotid), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(s); - bch_err_msg(c, ret, "getting snapshot %u", snapshotid); - if (ret) - return ret; - - u32 snapshot_tree = le32_to_cpu(s->v.tree); - - s->v.subvol = cpu_to_le32(subvolid); - SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); - bch2_trans_iter_exit(trans, &iter); - - struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshot_trees, POS(0, snapshot_tree), - 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(st); - bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree); - if (ret) - return ret; - - if (!st->v.master_subvol) - st->v.master_subvol = cpu_to_le32(subvolid); - - bch2_trans_iter_exit(trans, &iter); - return 0; -} - -static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum) -{ - struct bch_fs *c = trans->c; - unsigned i_mode = S_IFREG; - u64 i_size = 0; - - switch (btree) { - case BTREE_ID_extents: { - struct btree_iter iter = {}; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); - struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0)); - bch2_trans_iter_exit(trans, &iter); - int ret = bkey_err(k); - if (ret) - return ret; - - i_size = k.k->p.offset << 9; - break; - } - case BTREE_ID_dirents: - i_mode = S_IFDIR; - break; - case BTREE_ID_xattrs: - break; - default: - BUG(); - } - - struct bch_inode_unpacked new_inode; - bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); - new_inode.bi_size = i_size; - new_inode.bi_inum = inum; - new_inode.bi_snapshot = snapshot; - - return __bch2_fsck_write_inode(trans, &new_inode); -} - -static inline void snapshots_seen_exit(struct snapshots_seen *s) -{ - darray_exit(&s->ids); -} - -static inline void snapshots_seen_init(struct snapshots_seen *s) -{ - memset(s, 0, sizeof(*s)); -} - -static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) -{ - u32 *i; - __darray_for_each(s->ids, i) { - if (*i == id) - return 0; - if (*i > id) - break; - } - - int ret = darray_insert_item(&s->ids, i - s->ids.data, id); - if (ret) - bch_err(c, "error reallocating snapshots_seen table (size %zu)", - s->ids.size); - return ret; -} - -static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, - enum btree_id btree_id, struct bpos pos) -{ - if (!bkey_eq(s->pos, pos)) - s->ids.nr = 0; - s->pos = pos; - - return snapshot_list_add_nodup(c, &s->ids, pos.snapshot); -} - -/** - * 
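snapshots_seen_add_inorder() above keeps the seen-ID list sorted and de-duplicated so the visibility scans that follow can walk it in order and early-out. The same operation over a plain array, as a sketch (a fixed capacity stands in for the darray's reallocation):

	#include <stdint.h>
	#include <string.h>

	static int seen_add_inorder(uint32_t *ids, size_t *nr, size_t cap, uint32_t id)
	{
		size_t i = 0;

		while (i < *nr && ids[i] < id)
			i++;
		if (i < *nr && ids[i] == id)
			return 0;		/* already present */
		if (*nr == cap)
			return -1;		/* the real code grows the darray here */
		memmove(&ids[i + 1], &ids[i], (*nr - i) * sizeof(*ids));
		ids[i] = id;
		(*nr)++;
		return 0;
	}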
key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, - * and @ancestor hasn't been overwritten in @seen - * - * @c: filesystem handle - * @seen: list of snapshot ids already seen at current position - * @id: descendent snapshot id - * @ancestor: ancestor snapshot id - * - * Returns: whether key in @ancestor snapshot is visible in @id snapshot - */ -static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, - u32 id, u32 ancestor) -{ - EBUG_ON(id > ancestor); - - if (id == ancestor) - return true; - - if (!bch2_snapshot_is_ancestor(c, id, ancestor)) - return false; - - /* - * We know that @id is a descendant of @ancestor; we're checking if - * we've seen a key that overwrote @ancestor - i.e. also a descendent of - * @ancestor and with @id as a descendent. - * - * But we already know that we're scanning IDs between @id and @ancestor - * numerically, since snapshot ID lists are kept sorted, so if we find - * an id that's an ancestor of @id we're done: - */ - darray_for_each_reverse(seen->ids, i) - if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i)) - return false; - - return true; -} - -/** - * ref_visible - given a key with snapshot id @src that points to a key with - * snapshot id @dst, test whether there is some snapshot in which @dst is - * visible. - * - * @c: filesystem handle - * @s: list of snapshot IDs already seen at @src - * @src: snapshot ID of src key - * @dst: snapshot ID of dst key - * Returns: true if there is some snapshot in which @dst is visible - * - * Assumes we're visiting @src keys in natural key order - */ -static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s, - u32 src, u32 dst) -{ - return dst <= src - ? key_visible_in_snapshot(c, s, dst, src) - : bch2_snapshot_is_ancestor(c, src, dst); -} - -static int ref_visible2(struct bch_fs *c, - u32 src, struct snapshots_seen *src_seen, - u32 dst, struct snapshots_seen *dst_seen) -{ - if (dst > src) { - swap(dst, src); - swap(dst_seen, src_seen); - } - return key_visible_in_snapshot(c, src_seen, dst, src); -} - -#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ - for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ - (_i)->inode.bi_snapshot <= (_snapshot); _i++) \ - if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot)) - -struct inode_walker_entry { - struct bch_inode_unpacked inode; - bool whiteout; - u64 count; - u64 i_size; -}; - -struct inode_walker { - bool first_this_inode; - bool have_inodes; - bool recalculate_sums; - struct bpos last_pos; - - DARRAY(struct inode_walker_entry) inodes; - snapshot_id_list deletes; -}; - -static void inode_walker_exit(struct inode_walker *w) -{ - darray_exit(&w->inodes); - darray_exit(&w->deletes); -} - -static struct inode_walker inode_walker_init(void) -{ - return (struct inode_walker) { 0, }; -} - -static int add_inode(struct bch_fs *c, struct inode_walker *w, - struct bkey_s_c inode) -{ - int ret = darray_push(&w->inodes, ((struct inode_walker_entry) { - .whiteout = !bkey_is_inode(inode.k), - })); - if (ret) - return ret; - - struct inode_walker_entry *n = &darray_last(w->inodes); - if (!n->whiteout) { - return bch2_inode_unpack(inode, &n->inode); - } else { - n->inode.bi_inum = inode.k->p.offset; - n->inode.bi_snapshot = inode.k->p.snapshot; - return 0; - } -} - -static int get_inodes_all_snapshots(struct btree_trans *trans, - struct inode_walker *w, u64 inum) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; -
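Restated outside the btree machinery, with a toy is_ancestor() callback standing in for bch2_snapshot_is_ancestor() and @seen as a sorted array, the visibility rule above is:

	#include <stdbool.h>
	#include <stdint.h>

	static bool toy_key_visible(bool (*is_ancestor)(uint32_t id, uint32_t anc),
				    const uint32_t *seen, size_t nr,
				    uint32_t id, uint32_t ancestor)
	{
		if (id == ancestor)
			return true;
		if (!is_ancestor(id, ancestor))
			return false;
		/* any already-seen ID that is itself an ancestor of @id
		 * overwrote @ancestor's key, as far as @id is concerned */
		for (size_t i = nr; i-- > 0;)
			if (seen[i] != ancestor && is_ancestor(id, seen[i]))
				return false;
		return true;
	}

This is a sketch of the semantics only; the real code walks the darray newest-first for the same early-out.

 - /* - * We no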
longer have inodes for w->last_pos; clear this to avoid - * screwing up check_i_sectors/check_subdir_count if we take a - * transaction restart here: - */ - w->have_inodes = false; - w->recalculate_sums = false; - w->inodes.nr = 0; - - for_each_btree_key_max_norestart(trans, iter, - BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - ret = add_inode(c, w, k); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - return ret; - - w->first_this_inode = true; - w->have_inodes = true; - return 0; -} - -static int get_visible_inodes(struct btree_trans *trans, - struct inode_walker *w, - struct snapshots_seen *s, - u64 inum) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - w->inodes.nr = 0; - w->deletes.nr = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - - if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) - continue; - - if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) - continue; - - ret = bkey_is_inode(k.k) - ? add_inode(c, w, k) - : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static struct inode_walker_entry * -lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - - struct inode_walker_entry *i = darray_find_p(w->inodes, i, - bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)); - - if (!i) - return NULL; - - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot, - trans, snapshot_key_missing_inode_snapshot, - "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" - "unexpected because we should always update the inode when we update a key in that inode\n" - "%s", - w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, - (bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - if (!i->whiteout) { - struct bch_inode_unpacked new = i->inode; - new.bi_snapshot = k.k->p.snapshot; - ret = __bch2_fsck_write_inode(trans, &new); - } else { - struct bkey_i whiteout; - bkey_init(&whiteout.k); - whiteout.k.type = KEY_TYPE_whiteout; - whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot); - ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, - &whiteout, - BTREE_UPDATE_internal_snapshot_node); - } - - if (ret) - goto fsck_err; - - ret = bch2_trans_commit(trans, NULL, NULL, 0); - if (ret) - goto fsck_err; - - struct inode_walker_entry new_entry = *i; - - new_entry.inode.bi_snapshot = k.k->p.snapshot; - new_entry.count = 0; - new_entry.i_size = 0; - - while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot) - --i; - - size_t pos = i - w->inodes.data; - ret = darray_insert_item(&w->inodes, pos, new_entry); - if (ret) - goto fsck_err; - - ret = bch_err_throw(c, transaction_restart_nested); - goto fsck_err; - } - - printbuf_exit(&buf); - return i; -fsck_err: - printbuf_exit(&buf); - return ERR_PTR(ret); -} - -static struct inode_walker_entry *walk_inode(struct btree_trans *trans, - struct inode_walker *w, - struct bkey_s_c k) -{ - if (w->last_pos.inode != k.k->p.inode) { - int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); - if (ret) - return ERR_PTR(ret); - } - - w->last_pos = k.k->p; - - return lookup_inode_for_snapshot(trans, w, k); -} - -/* - * Prefer to 
delete the first one, since that will be the one at the wrong - * offset: - * return value: 0 -> delete k1, 1 -> delete k2 - */ -int bch2_fsck_update_backpointers(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_i *new) -{ - if (new->k.type != KEY_TYPE_dirent) - return 0; - - struct bkey_i_dirent *d = bkey_i_to_dirent(new); - struct inode_walker target = inode_walker_init(); - int ret = 0; - - if (d->v.d_type == DT_SUBVOL) { - bch_err(trans->c, "%s does not support DT_SUBVOL", __func__); - ret = -BCH_ERR_fsck_repair_unimplemented; - } else { - ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); - if (ret) - goto err; - - darray_for_each(target.inodes, i) { - i->inode.bi_dir_offset = d->k.p.offset; - ret = __bch2_fsck_write_inode(trans, &i->inode); - if (ret) - goto err; - } - } -err: - inode_walker_exit(&target); - return ret; -} - -static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - u32 *snapshot) -{ - if (inode->bi_subvol) { - u64 inum; - int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); - if (ret) - return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); - } - - return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); -} - -static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); - int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int check_inode_dirent_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - bool *write_inode) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - u32 inode_snapshot = inode->bi_snapshot; - struct btree_iter dirent_iter = {}; - struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); - int ret = bkey_err(d); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) && - inode->bi_subvol && - (inode->bi_flags & BCH_INODE_has_child_snapshot)) { - /* Older version of a renamed subvolume root: we won't have a - * correct dirent for it. That's expected, see - * inode_should_reattach(). - * - * We don't clear the backpointer field when doing the rename - * because there might be arbitrarily many versions in older - * snapshots. - */ - inode->bi_dir = 0; - inode->bi_dir_offset = 0; - *write_inode = true; - goto out; - } - - if (fsck_err_on(ret, - trans, inode_points_to_missing_dirent, - "inode points to missing dirent\n%s", - (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || - fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode), - trans, inode_points_to_wrong_dirent, - "%s", - (printbuf_reset(&buf), - dirent_inode_mismatch_msg(&buf, c, d, inode), - buf.buf))) { - /* - * We just clear the backpointer fields for now. If we find a - * dirent that points to this inode in check_dirents(), we'll - * update it then; then when we get to check_path() if the - * backpointer is still 0 we'll reattach it. 
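The invariant enforced here, stripped of the transaction plumbing: an inode's backpointer (bi_dir, bi_dir_offset) names exactly one dirent, and that dirent must point back at the inode. With toy types (not the bcachefs structs):

	#include <stdbool.h>
	#include <stdint.h>

	struct toy_dirent  { uint64_t dir, offset, target; };
	struct toy_backptr { uint64_t inum, bi_dir, bi_dir_offset; };

	static bool toy_backpointer_matches(const struct toy_backptr *i,
					    const struct toy_dirent *d)
	{
		return d->dir == i->bi_dir &&
		       d->offset == i->bi_dir_offset &&
		       d->target == i->inum;
	}

When the check fails, the code below prefers clearing the backpointer fields and letting later passes re-establish them (or reattach the inode) over guessing a repair on the spot.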
- */ - inode->bi_dir = 0; - inode->bi_dir_offset = 0; - *write_inode = true; - } -out: - ret = 0; -fsck_err: - bch2_trans_iter_exit(trans, &dirent_iter); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -static int check_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_inode_unpacked *snapshot_root, - struct snapshots_seen *s) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct bch_inode_unpacked u; - bool do_update = false; - int ret; - - ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret < 0) - goto err; - if (ret) - return 0; - - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); - if (ret) - goto err; - - if (!bkey_is_inode(k.k)) - return 0; - - ret = bch2_inode_unpack(k, &u); - if (ret) - goto err; - - if (snapshot_root->bi_inum != u.bi_inum) { - ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root); - if (ret) - goto err; - } - - if (u.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) { - ret = bch2_repair_inode_hash_info(trans, snapshot_root); - BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented); - if (ret) - goto err; - } - - ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update); - if (ret) - goto err; - - if (bch2_inode_has_backpointer(&u)) { - ret = check_inode_dirent_inode(trans, &u, &do_update); - if (ret) - goto err; - } - - if (fsck_err_on(bch2_inode_has_backpointer(&u) && - (u.bi_flags & BCH_INODE_unlinked), - trans, inode_unlinked_but_has_dirent, - "inode unlinked but has dirent\n%s", - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &u), - buf.buf))) { - u.bi_flags &= ~BCH_INODE_unlinked; - do_update = true; - } - - if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) { - /* Check for this early so that check_unreachable_inode() will reattach it */ - - ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot); - if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty) - goto err; - - fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty, - "dir unlinked but not empty\n%s", - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &u), - buf.buf)); - u.bi_flags &= ~BCH_INODE_unlinked; - do_update = true; - ret = 0; - } - - if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size, - trans, inode_dir_has_nonzero_i_size, - "directory %llu:%u with nonzero i_size %lli", - u.bi_inum, u.bi_snapshot, u.bi_size)) { - u.bi_size = 0; - do_update = true; - } - - ret = bch2_inode_has_child_snapshots(trans, k.k->p); - if (ret < 0) - goto err; - - if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), - trans, inode_has_child_snapshots_wrong, - "inode has_child_snapshots flag wrong (should be %u)\n%s", - ret, - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &u), - buf.buf))) { - if (ret) - u.bi_flags |= BCH_INODE_has_child_snapshot; - else - u.bi_flags &= ~BCH_INODE_has_child_snapshot; - do_update = true; - } - ret = 0; - - if ((u.bi_flags & BCH_INODE_unlinked) && - !(u.bi_flags & BCH_INODE_has_child_snapshot)) { - if (!test_bit(BCH_FS_started, &c->flags)) { - /* - * If we're not in online fsck, don't delete unlinked - * inodes, just make sure they're on the deleted list. - * - * They might be referred to by a logged operation - - * i.e. we might have crashed in the middle of a - * truncate on an unlinked but open file - so we want to - * let the delete_dead_inodes kill it after resuming - * logged ops. 
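As a decision table (a sketch; the names are illustrative, not bcachefs API), the unlinked-inode handling described in this comment and the code around it is:

	#include <stdbool.h>

	enum toy_action { KEEP_ON_DELETED_LIST, DELETE_NOW, LEAVE_ALONE };

	static enum toy_action unlinked_inode_action(bool fs_started,
						     bool inode_or_child_open,
						     bool has_child_snapshot)
	{
		if (has_child_snapshot)
			return LEAVE_ALONE;		/* still needed by child snapshots */
		if (!fs_started)
			return KEEP_ON_DELETED_LIST;	/* offline: logged ops may still use it */
		return inode_or_child_open ? LEAVE_ALONE : DELETE_NOW;
	}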
- */ - ret = check_inode_deleted_list(trans, k.k->p); - if (ret < 0) - goto err_noprint; - - fsck_err_on(!ret, - trans, unlinked_inode_not_on_deleted_list, - "inode %llu:%u unlinked, but not on deleted list", - u.bi_inum, k.k->p.snapshot); - - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1); - if (ret) - goto err; - } else { - ret = bch2_inode_or_descendents_is_open(trans, k.k->p); - if (ret < 0) - goto err; - - if (fsck_err_on(!ret, - trans, inode_unlinked_and_not_open, - "inode %llu:%u unlinked and not open", - u.bi_inum, u.bi_snapshot)) { - ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); - bch_err_msg(c, ret, "in fsck deleting inode"); - goto err_noprint; - } - ret = 0; - } - } - - if (fsck_err_on(u.bi_parent_subvol && - (u.bi_subvol == 0 || - u.bi_subvol == BCACHEFS_ROOT_SUBVOL), - trans, inode_bi_parent_nonzero, - "inode %llu:%u has subvol %u but nonzero parent subvol %u", - u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { - u.bi_parent_subvol = 0; - do_update = true; - } - - if (u.bi_subvol) { - struct bch_subvolume s; - - ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { - ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); - goto do_update; - } - - if (fsck_err_on(ret, - trans, inode_bi_subvol_missing, - "inode %llu:%u bi_subvol points to missing subvolume %u", - u.bi_inum, k.k->p.snapshot, u.bi_subvol) || - fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || - !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), - k.k->p.snapshot), - trans, inode_bi_subvol_wrong, - "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", - u.bi_inum, k.k->p.snapshot, u.bi_subvol, - le64_to_cpu(s.inode), - le32_to_cpu(s.snapshot))) { - u.bi_subvol = 0; - u.bi_parent_subvol = 0; - do_update = true; - } - } - - if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal), - trans, inode_journal_seq_in_future, - "inode journal seq in future (currently at %llu)\n%s", - journal_cur_seq(&c->journal), - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &u), - buf.buf))) { - u.bi_journal_seq = journal_cur_seq(&c->journal); - do_update = true; - } -do_update: - if (do_update) { - ret = __bch2_fsck_write_inode(trans, &u); - bch_err_msg(c, ret, "in fsck updating inode"); - if (ret) - goto err_noprint; - } -err: -fsck_err: - bch_err_fn(c, ret); -err_noprint: - printbuf_exit(&buf); - return ret; -} - -int bch2_check_inodes(struct bch_fs *c) -{ - struct bch_inode_unpacked snapshot_root = {}; - struct snapshots_seen s; - - snapshots_seen_init(&s); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, - POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_inode(trans, &iter, k, &snapshot_root, &s))); - - snapshots_seen_exit(&s); - bch_err_fn(c, ret); - return ret; -} - -static int find_oldest_inode_needs_reattach(struct btree_trans *trans, - struct bch_inode_unpacked *inode) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - /* - * We look for inodes to reattach in natural key order, leaves first, - * but we should do the reattach at the oldest version that needs to be - * reattached: - */ - for_each_btree_key_norestart(trans, iter, - BTREE_ID_inodes, - SPOS(0, inode->bi_inum, inode->bi_snapshot + 1), - BTREE_ITER_all_snapshots, k, 
ret) { - if (k.k->p.offset != inode->bi_inum) - break; - - if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot)) - continue; - - if (!bkey_is_inode(k.k)) - break; - - struct bch_inode_unpacked parent_inode; - ret = bch2_inode_unpack(k, &parent_inode); - if (ret) - break; - - if (!inode_should_reattach(&parent_inode)) - break; - - *inode = parent_inode; - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static int check_unreachable_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (!bkey_is_inode(k.k)) - return 0; - - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(k, &inode); - if (ret) - return ret; - - if (!inode_should_reattach(&inode)) - return 0; - - ret = find_oldest_inode_needs_reattach(trans, &inode); - if (ret) - return ret; - - if (fsck_err(trans, inode_unreachable, - "unreachable inode:\n%s", - (bch2_inode_unpacked_to_text(&buf, &inode), - buf.buf))) - ret = reattach_inode(trans, &inode); -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* - * Reattach unreachable (but not unlinked) inodes - * - * Run after check_inodes() and check_dirents(), so we know that inode - * backpointer fields point to valid dirents, and every inode that has a dirent - * that points to it has its backpointer field set - so we're just looking for - * non-unlinked inodes without backpointers: - * - * XXX: this is racy w.r.t. hardlink removal in online fsck - */ -int bch2_check_unreachable_inodes(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, - POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_unreachable_inode(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) -{ - switch (btree) { - case BTREE_ID_extents: - return S_ISREG(mode) || S_ISLNK(mode); - case BTREE_ID_dirents: - return S_ISDIR(mode); - case BTREE_ID_xattrs: - return true; - default: - BUG(); - } -} - -static int check_key_has_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct inode_walker *inode, - struct inode_walker_entry *i, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter iter2 = {}; - int ret = PTR_ERR_OR_ZERO(i); - if (ret) - return ret; - - if (k.k->type == KEY_TYPE_whiteout) - goto out; - - bool have_inode = i && !i->whiteout; - - if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) - goto reconstruct; - - if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode)) - goto out; - - prt_printf(&buf, ", "); - - bool have_old_inode = false; - darray_for_each(inode->inodes, i2) - if (!i2->whiteout && - bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) && - btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) { - prt_printf(&buf, "but found good inode in older snapshot\n"); - bch2_inode_unpacked_to_text(&buf, &i2->inode); - prt_newline(&buf); - have_old_inode = true; - break; - } - - struct bkey_s_c k2; - unsigned nr_keys = 0; - - prt_printf(&buf, "found keys:\n"); - - for_each_btree_key_max_norestart(trans, iter2, iter->btree_id, - SPOS(k.k->p.inode, 0, k.k->p.snapshot), - POS(k.k->p.inode, U64_MAX), - 0, k2, ret) { - nr_keys++; - if (nr_keys <= 10) { - bch2_bkey_val_to_text(&buf, c, k2); - prt_newline(&buf); - } - if (nr_keys >= 100) - break; - } - - if (ret) - goto
err; - - if (nr_keys > 100) - prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys); - else if (nr_keys > 10) - prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys); - - if (!have_inode) { - if (fsck_err_on(!have_inode, - trans, key_in_missing_inode, - "key in missing inode%s", buf.buf)) { - /* - * Maybe a deletion that raced with data move, or something - * weird like that? But if we know the inode was deleted, or - * it's just a few keys, we can safely delete them. - * - * If it's many keys, we should probably recreate the inode - */ - if (have_old_inode || nr_keys <= 2) - goto delete; - else - goto reconstruct; - } - } else { - /* - * not autofix, this one would be a giant wtf - bit error in the - * inode corrupting i_mode? - * - * may want to try repairing inode instead of deleting - */ - if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), - trans, key_in_wrong_inode_type, - "key for wrong inode mode %o%s", - i->inode.bi_mode, buf.buf)) - goto delete; - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &iter2); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -delete: - /* - * XXX: print out more info - * count up extents for this inode, check if we have different inode in - * an older snapshot version, perhaps decide if we want to reconstitute - */ - ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - goto out; -reconstruct: - ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - - inode->last_pos.inode--; - ret = bch_err_throw(c, transaction_restart_nested); - goto out; -} - -static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) -{ - struct bch_fs *c = trans->c; - int ret = 0; - s64 count2; - - darray_for_each(w->inodes, i) { - if (i->inode.bi_sectors == i->count) - continue; - - count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot); - - if (w->recalculate_sums) - i->count = count2; - - if (i->count != count2) { - bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); - i->count = count2; - } - - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), - trans, inode_i_sectors_wrong, - "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", - w->last_pos.inode, i->inode.bi_snapshot, - i->inode.bi_sectors, i->count)) { - i->inode.bi_sectors = i->count; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) - break; - } - } -fsck_err: - bch_err_fn(c, ret); - return ret; -} - -static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) -{ - u32 restart_count = trans->restart_count; - return check_i_sectors_notnested(trans, w) ?: - trans_was_restarted(trans, restart_count); -} - -struct extent_end { - u32 snapshot; - u64 offset; - struct snapshots_seen seen; -}; - -struct extent_ends { - struct bpos last_pos; - DARRAY(struct extent_end) e; -}; - -static void extent_ends_reset(struct extent_ends *extent_ends) -{ - darray_for_each(extent_ends->e, i) - snapshots_seen_exit(&i->seen); - extent_ends->e.nr = 0; -} - -static void extent_ends_exit(struct extent_ends *extent_ends) -{ - extent_ends_reset(extent_ends); - darray_exit(&extent_ends->e); -} - -static void extent_ends_init(struct extent_ends *extent_ends) -{ - memset(extent_ends, 0, sizeof(*extent_ends)); -} - -static 
int extent_ends_at(struct bch_fs *c, - struct extent_ends *extent_ends, - struct snapshots_seen *seen, - struct bkey_s_c k) -{ - struct extent_end *i, n = (struct extent_end) { - .offset = k.k->p.offset, - .snapshot = k.k->p.snapshot, - .seen = *seen, - }; - - n.seen.ids.data = kmemdup(seen->ids.data, - sizeof(seen->ids.data[0]) * seen->ids.size, - GFP_KERNEL); - if (!n.seen.ids.data) - return bch_err_throw(c, ENOMEM_fsck_extent_ends_at); - - __darray_for_each(extent_ends->e, i) { - if (i->snapshot == k.k->p.snapshot) { - snapshots_seen_exit(&i->seen); - *i = n; - return 0; - } - - if (i->snapshot >= k.k->p.snapshot) - break; - } - - return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n); -} - -static int overlapping_extents_found(struct btree_trans *trans, - enum btree_id btree, - struct bpos pos1, struct snapshots_seen *pos1_seen, - struct bkey pos2, - bool *fixed, - struct extent_end *extent_end) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter iter1, iter2 = {}; - struct bkey_s_c k1, k2; - int ret; - - BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); - - bch2_trans_iter_init(trans, &iter1, btree, pos1, - BTREE_ITER_all_snapshots| - BTREE_ITER_not_extents); - k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX)); - ret = bkey_err(k1); - if (ret) - goto err; - - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k1); - - if (!bpos_eq(pos1, k1.k->p)) { - prt_str(&buf, "\nwanted\n "); - bch2_bpos_to_text(&buf, pos1); - prt_str(&buf, "\n"); - bch2_bkey_to_text(&buf, &pos2); - - bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", - __func__, buf.buf); - ret = bch_err_throw(c, internal_fsck_err); - goto err; - } - - bch2_trans_copy_iter(trans, &iter2, &iter1); - - while (1) { - bch2_btree_iter_advance(trans, &iter2); - - k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX)); - ret = bkey_err(k2); - if (ret) - goto err; - - if (bpos_ge(k2.k->p, pos2.p)) - break; - } - - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k2); - - if (bpos_gt(k2.k->p, pos2.p) || - pos2.size != k2.k->size) { - bch_err(c, "%s: error finding second overlapping extent when repairing%s", - __func__, buf.buf); - ret = bch_err_throw(c, internal_fsck_err); - goto err; - } - - prt_printf(&buf, "\noverwriting %s extent", - pos1.snapshot >= pos2.p.snapshot ?
"first" : "second"); - - if (fsck_err(trans, extent_overlapping, - "overlapping extents%s", buf.buf)) { - struct btree_iter *old_iter = &iter1; - struct disk_reservation res = { 0 }; - - if (pos1.snapshot < pos2.p.snapshot) { - old_iter = &iter2; - swap(k1, k2); - } - - trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); - - ret = bch2_trans_update_extent_overwrite(trans, old_iter, - BTREE_UPDATE_internal_snapshot_node, - k1, k2) ?: - bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); - bch2_disk_reservation_put(c, &res); - - bch_info(c, "repair ret %s", bch2_err_str(ret)); - - if (ret) - goto err; - - *fixed = true; - - if (pos1.snapshot == pos2.p.snapshot) { - /* - * We overwrote the first extent, and did the overwrite - * in the same snapshot: - */ - extent_end->offset = bkey_start_offset(&pos2); - } else if (pos1.snapshot > pos2.p.snapshot) { - /* - * We overwrote the first extent in pos2's snapshot: - */ - ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); - } else { - /* - * We overwrote the second extent - restart - * check_extent() from the top: - */ - ret = bch_err_throw(c, transaction_restart_nested); - } - } -fsck_err: -err: - bch2_trans_iter_exit(trans, &iter2); - bch2_trans_iter_exit(trans, &iter1); - printbuf_exit(&buf); - return ret; -} - -static int check_overlapping_extents(struct btree_trans *trans, - struct snapshots_seen *seen, - struct extent_ends *extent_ends, - struct bkey_s_c k, - struct btree_iter *iter, - bool *fixed) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - /* transaction restart, running again */ - if (bpos_eq(extent_ends->last_pos, k.k->p)) - return 0; - - if (extent_ends->last_pos.inode != k.k->p.inode) - extent_ends_reset(extent_ends); - - darray_for_each(extent_ends->e, i) { - if (i->offset <= bkey_start_offset(k.k)) - continue; - - if (!ref_visible2(c, - k.k->p.snapshot, seen, - i->snapshot, &i->seen)) - continue; - - ret = overlapping_extents_found(trans, iter->btree_id, - SPOS(iter->pos.inode, - i->offset, - i->snapshot), - &i->seen, - *k.k, fixed, i); - if (ret) - goto err; - } - - extent_ends->last_pos = k.k->p; -err: - return ret; -} - -static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; - unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; - - bkey_for_each_crc(k.k, ptrs, crc, i) - if (crc_is_encoded(crc) && - crc.uncompressed_size > encoded_extent_max_sectors) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); - printbuf_exit(&buf); - } - - return 0; -} - -static int check_extent(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - struct inode_walker *inode, - struct snapshots_seen *s, - struct extent_ends *extent_ends, - struct disk_reservation *res) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret) { - ret = ret < 0 ? 
ret : 0; - goto out; - } - - if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) { - ret = check_i_sectors(trans, inode); - if (ret) - goto err; - } - - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); - if (ret) - goto err; - - struct inode_walker_entry *extent_i = walk_inode(trans, inode, k); - ret = PTR_ERR_OR_ZERO(extent_i); - if (ret) - goto err; - - ret = check_key_has_inode(trans, iter, inode, extent_i, k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_whiteout) { - ret = check_overlapping_extents(trans, s, extent_ends, k, iter, - &inode->recalculate_sums); - if (ret) - goto err; - - /* - * Check inodes in reverse order, from oldest snapshots to - * newest, starting from the inode that matches this extent's - * snapshot. If we didn't have one, iterate over all inodes: - */ - for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); - inode->inodes.data && i >= inode->inodes.data; - --i) { - if (i->inode.bi_snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) - continue; - - u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9; - - if (fsck_err_on(k.k->p.offset > last_block && - !bkey_extent_is_reservation(k), - trans, extent_past_end_of_inode, - "extent type past end of inode %llu:%u, i_size %llu\n%s", - i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = snapshots_seen_add_inorder(c, s, i->inode.bi_snapshot) ?: - bch2_fpunch_snapshot(trans, - SPOS(i->inode.bi_inum, - last_block, - i->inode.bi_snapshot), - POS(i->inode.bi_inum, U64_MAX)); - if (ret) - goto err; - - iter->k.type = KEY_TYPE_whiteout; - break; - } - } - } - - ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - - if (bkey_extent_is_allocation(k.k)) { - for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); - inode->inodes.data && i >= inode->inodes.data; - --i) { - if (i->whiteout || - i->inode.bi_snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) - continue; - - i->count += k.k->size; - } - } - - if (k.k->type != KEY_TYPE_whiteout) { - ret = extent_ends_at(c, extent_ends, s, k); - if (ret) - goto err; - } -out: -err: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -/* - * Walk extents: verify that extents have a corresponding S_ISREG inode, and - * that i_size and i_sectors are consistent - */ -int bch2_check_extents(struct bch_fs *c) -{ - struct inode_walker w = inode_walker_init(); - struct snapshots_seen s; - struct extent_ends extent_ends; - struct disk_reservation res = { 0 }; - - snapshots_seen_init(&s); - extent_ends_init(&extent_ends); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_extents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ - bch2_disk_reservation_put(c, &res); - check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: - check_extent_overbig(trans, &iter, k); - })) ?: - check_i_sectors_notnested(trans, &w)); - - bch2_disk_reservation_put(c, &res); - extent_ends_exit(&extent_ends); - inode_walker_exit(&w); - snapshots_seen_exit(&s); - - bch_err_fn(c, ret); - return ret; -} - -int bch2_check_indirect_extents(struct bch_fs *c) -{ - struct disk_reservation res = { 0 }; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, - POS_MIN, - BTREE_ITER_prefetch, k, - &res, NULL, -
BCH_TRANS_COMMIT_no_enospc, ({ - bch2_disk_reservation_put(c, &res); - check_extent_overbig(trans, &iter, k); - }))); - - bch2_disk_reservation_put(c, &res); - bch_err_fn(c, ret); - return ret; -} - -static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w) -{ - struct bch_fs *c = trans->c; - int ret = 0; - s64 count2; - - darray_for_each(w->inodes, i) { - if (i->inode.bi_nlink == i->count) - continue; - - count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot); - if (count2 < 0) - return count2; - - if (i->count != count2) { - bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); - i->count = count2; - if (i->inode.bi_nlink == i->count) - continue; - } - - if (i->inode.bi_nlink != i->count) { - CLASS(printbuf, buf)(); - - lockrestart_do(trans, - bch2_inum_snapshot_to_path(trans, w->last_pos.inode, - i->inode.bi_snapshot, NULL, &buf)); - - if (fsck_err_on(i->inode.bi_nlink != i->count, - trans, inode_dir_wrong_nlink, - "directory with wrong i_nlink: got %u, should be %llu\n%s", - i->inode.bi_nlink, i->count, buf.buf)) { - i->inode.bi_nlink = i->count; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) - break; - } - } - } -fsck_err: - bch_err_fn(c, ret); - return ret; -} - -static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) -{ - u32 restart_count = trans->restart_count; - return check_subdir_count_notnested(trans, w) ?: - trans_was_restarted(trans, restart_count); -} - -/* find a subvolume that's a descendent of @snapshot: */ -static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_subvolume) - continue; - - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { - bch2_trans_iter_exit(trans, &iter); - *subvolid = k.k->p.offset; - goto found; - } - } - if (!ret) - ret = -ENOENT; -found: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -noinline_for_stack -static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c_dirent d) -{ - struct bch_fs *c = trans->c; - struct btree_iter subvol_iter = {}; - struct bch_inode_unpacked subvol_root; - u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); - u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 parent_snapshot; - u32 new_parent_subvol = 0; - u64 parent_inum; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (ret || - (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { - int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); - if (ret2 && !bch2_err_matches(ret, ENOENT)) - return ret2; - } - - if (ret && - !new_parent_subvol && - (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { - /* - * Couldn't find a subvol for dirent's snapshot - but we lost - * subvols, so we need to reconstruct: - */ - ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); - if (ret) - return ret; - - parent_snapshot = d.k->p.snapshot; - } - - if (fsck_err_on(ret, - trans, dirent_to_missing_parent_subvol, - 
"dirent parent_subvol points to missing subvolume\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || - fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), - trans, dirent_not_visible_in_parent_subvol, - "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", - parent_snapshot, - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - if (!new_parent_subvol) { - bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); - return bch_err_throw(c, fsck_repair_unimplemented); - } - - struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); - ret = PTR_ERR_OR_ZERO(new_dirent); - if (ret) - goto err; - - new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); - } - - struct bkey_s_c_subvolume s = - bch2_bkey_get_iter_typed(trans, &subvol_iter, - BTREE_ID_subvolumes, POS(0, target_subvol), - 0, subvolume); - ret = bkey_err(s.s_c); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (ret) { - if (fsck_err(trans, dirent_to_missing_subvol, - "dirent points to missing subvolume\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) - return bch2_fsck_remove_dirent(trans, d.k->p); - ret = 0; - goto out; - } - - if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) { - printbuf_reset(&buf); - - prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n", - parent_subvol); - - ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset, - le64_to_cpu(s.v->inode) }, &buf); - if (ret) - goto err; - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, s.s_c); - - if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) { - struct bkey_i_subvolume *n = - bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - n->v.fs_path_parent = cpu_to_le32(parent_subvol); - } - } - - u64 target_inum = le64_to_cpu(s.v->inode); - u32 target_snapshot = le32_to_cpu(s.v->snapshot); - - ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot, - &subvol_root, 0); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (ret) { - bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto err; - } - - if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, - trans, inode_bi_parent_wrong, - "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", - target_inum, - subvol_root.bi_parent_subvol, parent_subvol)) { - subvol_root.bi_parent_subvol = parent_subvol; - subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot); - ret = __bch2_fsck_write_inode(trans, &subvol_root); - if (ret) - goto err; - } - - ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true); - if (ret) - goto err; -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &subvol_iter); - printbuf_exit(&buf); - return ret; -} - -static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - struct bch_hash_info *hash_info, - struct inode_walker *dir, - struct inode_walker *target, - struct snapshots_seen *s, - bool *need_second_pass) -{ - struct bch_fs *c = trans->c; - struct inode_walker_entry *i; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret) { - ret = ret < 0 ? 
ret : 0; - goto out; - } - - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); - if (ret) - goto err; - - if (k.k->type == KEY_TYPE_whiteout) - goto out; - - if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { - ret = check_subdir_dirents_count(trans, dir); - if (ret) - goto err; - } - - i = walk_inode(trans, dir, k); - ret = PTR_ERR_OR_ZERO(i); - if (ret < 0) - goto err; - - ret = check_key_has_inode(trans, iter, dir, i, k); - if (ret) - goto err; - - if (!i || i->whiteout) - goto out; - - if (dir->first_this_inode) - *hash_info = bch2_hash_info_init(c, &i->inode); - dir->first_this_inode = false; - - hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL; - - ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, - iter, k, need_second_pass); - if (ret < 0) - goto err; - if (ret) { - /* dirent has been deleted */ - ret = 0; - goto out; - } - - if (k.k->type != KEY_TYPE_dirent) - goto out; - - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - - /* check casefold */ - if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding, - trans, dirent_casefold_mismatch, - "dirent casefold does not match dir casefold\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_parent_subvol) - : 0, - }; - u64 target = d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_child_subvol) - : le64_to_cpu(d.v->d_inum); - struct qstr name = bch2_dirent_get_name(d); - - struct bkey_i_dirent *new_d = - bch2_dirent_create_key(trans, hash_info, dir_inum, - d.v->d_type, &name, NULL, target); - ret = PTR_ERR_OR_ZERO(new_d); - if (ret) - goto out; - - new_d->k.p.inode = d.k->p.inode; - new_d->k.p.snapshot = d.k->p.snapshot; - - struct btree_iter dup_iter = {}; - ret = bch2_hash_delete_at(trans, - bch2_dirent_hash_desc, hash_info, iter, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_str_hash_repair_key(trans, s, - &bch2_dirent_hash_desc, hash_info, - iter, bkey_i_to_s_c(&new_d->k_i), - &dup_iter, bkey_s_c_null, - need_second_pass); - goto out; - } - - if (d.v->d_type == DT_SUBVOL) { - ret = check_dirent_to_subvol(trans, iter, d); - if (ret) - goto err; - } else { - ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); - if (ret) - goto err; - - if (fsck_err_on(!target->inodes.nr, - trans, dirent_to_missing_inode, - "dirent points to missing inode:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - ret = bch2_fsck_remove_dirent(trans, d.k->p); - if (ret) - goto err; - } - - darray_for_each(target->inodes, i) { - ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true); - if (ret) - goto err; - } - - darray_for_each(target->deletes, i) - if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i), - trans, dirent_to_overwritten_inode, - "dirent points to inode overwritten in snapshot %u:\n%s", - *i, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - struct btree_iter delete_iter; - bch2_trans_iter_init(trans, &delete_iter, - BTREE_ID_dirents, - SPOS(k.k->p.inode, k.k->p.offset, *i), - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &delete_iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - hash_info, - &delete_iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &delete_iter); - if (ret) - goto err; - - } - } - - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - - 
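Once a dirent has survived all the checks above, the loop that follows credits it to every inode version that can see it; those running totals feed the subdir-count and i_size checks. Reduced to a sketch over a sorted entry array and a visibility callback (toy types, not the bcachefs walker):

	#include <stdbool.h>
	#include <stdint.h>

	struct toy_walker_entry { uint32_t snapshot; uint64_t count, i_size; };

	static void toy_credit_dirent(struct toy_walker_entry *e, size_t nr,
				      uint32_t dirent_snapshot, bool is_dir,
				      size_t dirent_bytes,
				      bool (*visible)(uint32_t entry_snap,
						      uint32_t key_snap))
	{
		for (size_t i = 0; i < nr && e[i].snapshot <= dirent_snapshot; i++)
			if (visible(e[i].snapshot, dirent_snapshot)) {
				if (is_dir)
					e[i].count++;	/* subdirectory link count */
				e[i].i_size += dirent_bytes;
			}
	}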
for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) { - if (d.v->d_type == DT_DIR) - i->count++; - i->i_size += bkey_bytes(d.k); - } -out: -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* - * Walk dirents: verify that they all have a corresponding S_ISDIR inode, - * validate d_type - */ -int bch2_check_dirents(struct bch_fs *c) -{ - struct inode_walker dir = inode_walker_init(); - struct inode_walker target = inode_walker_init(); - struct snapshots_seen s; - struct bch_hash_info hash_info; - bool need_second_pass = false, did_second_pass = false; - int ret; - - snapshots_seen_init(&s); -again: - ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s, - &need_second_pass)) ?: - check_subdir_count_notnested(trans, &dir)); - - if (!ret && need_second_pass && !did_second_pass) { - bch_info(c, "check_dirents requires second pass"); - swap(did_second_pass, need_second_pass); - goto again; - } - - if (!ret && need_second_pass) { - bch_err(c, "dirents not repairing"); - ret = -EINVAL; - } - - snapshots_seen_exit(&s); - inode_walker_exit(&dir); - inode_walker_exit(&target); - bch_err_fn(c, ret); - return ret; -} - -static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - struct bch_hash_info *hash_info, - struct inode_walker *inode) -{ - struct bch_fs *c = trans->c; - - int ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret < 0) - return ret; - if (ret) - return 0; - - struct inode_walker_entry *i = walk_inode(trans, inode, k); - ret = PTR_ERR_OR_ZERO(i); - if (ret) - return ret; - - ret = check_key_has_inode(trans, iter, inode, i, k); - if (ret) - return ret; - - if (!i || i->whiteout) - return 0; - - if (inode->first_this_inode) - *hash_info = bch2_hash_info_init(c, &i->inode); - inode->first_this_inode = false; - - bool need_second_pass = false; - return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, - iter, k, &need_second_pass); -} - -/* - * Walk xattrs: verify that they all have a corresponding inode - */ -int bch2_check_xattrs(struct bch_fs *c) -{ - struct inode_walker inode = inode_walker_init(); - struct bch_hash_info hash_info; - int ret = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, - k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - check_xattr(trans, &iter, k, &hash_info, &inode))); - - inode_walker_exit(&inode); - bch_err_fn(c, ret); - return ret; -} - -static int check_root_trans(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct bch_inode_unpacked root_inode; - u32 snapshot; - u64 inum; - int ret; - - ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (mustfix_fsck_err_on(ret, trans, root_subvol_missing, - "root subvol missing")) { - struct bkey_i_subvolume *root_subvol = - bch2_trans_kmalloc(trans, sizeof(*root_subvol)); - ret = PTR_ERR_OR_ZERO(root_subvol); - if (ret) - goto err; - - snapshot = U32_MAX; - inum = BCACHEFS_ROOT_INO; - - bkey_subvolume_init(&root_subvol->k_i); - root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_subvol->v.flags = 0; - root_subvol->v.snapshot = cpu_to_le32(snapshot); - root_subvol->v.inode = cpu_to_le64(inum); - ret = 
bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); - bch_err_msg(c, ret, "writing root subvol"); - if (ret) - goto err; - } - - ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot, - &root_inode, 0); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (mustfix_fsck_err_on(ret, - trans, root_dir_missing, - "root directory missing") || - mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), - trans, root_inode_not_dir, - "root inode not a directory")) { - bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, - 0, NULL); - root_inode.bi_inum = inum; - root_inode.bi_snapshot = snapshot; - - ret = __bch2_fsck_write_inode(trans, &root_inode); - bch_err_msg(c, ret, "writing root inode"); - } -err: -fsck_err: - return ret; -} - -/* Get root directory, create if it doesn't exist: */ -int bch2_check_root(struct bch_fs *c) -{ - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_root_trans(trans)); - bch_err_fn(c, ret); - return ret; -} - -static bool darray_u32_has(darray_u32 *d, u32 v) -{ - darray_for_each(*d, i) - if (*i == v) - return true; - return false; -} - -static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct btree_iter parent_iter = {}; - darray_u32 subvol_path = {}; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (k.k->type != KEY_TYPE_subvolume) - return 0; - - subvol_inum start = { - .subvol = k.k->p.offset, - .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode), - }; - - while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { - ret = darray_push(&subvol_path, k.k->p.offset); - if (ret) - goto err; - - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - - struct bch_inode_unpacked subvol_root; - ret = bch2_inode_find_by_inum_trans(trans, - (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, - &subvol_root); - if (ret) - break; - - u32 parent = le32_to_cpu(s.v->fs_path_parent); - - if (darray_u32_has(&subvol_path, parent)) { - printbuf_reset(&buf); - prt_printf(&buf, "subvolume loop: "); - - ret = bch2_inum_to_path(trans, start, &buf); - if (ret) - goto err; - - if (fsck_err(trans, subvol_loop, "%s", buf.buf)) - ret = reattach_subvol(trans, s); - break; - } - - bch2_trans_iter_exit(trans, &parent_iter); - bch2_trans_iter_init(trans, &parent_iter, - BTREE_ID_subvolumes, POS(0, parent), 0); - k = bch2_btree_iter_peek_slot(trans, &parent_iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, - trans, subvol_unreachable, - "unreachable subvolume %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, s.s_c), - buf.buf))) { - ret = reattach_subvol(trans, s); - break; - } - } -fsck_err: -err: - printbuf_exit(&buf); - darray_exit(&subvol_path); - bch2_trans_iter_exit(trans, &parent_iter); - return ret; -} - -int bch2_check_subvolume_structure(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_subvol_path(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_bi_depth_renumber_one(struct btree_trans *trans, - u64 inum, u32 snapshot, - u32 new_depth) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), 0); - - struct bch_inode_unpacked inode; - int ret = bkey_err(k) ?: - !bkey_is_inode(k.k) ? 
-BCH_ERR_ENOENT_inode - : bch2_inode_unpack(k, &inode); - if (ret) - goto err; - - if (inode.bi_depth != new_depth) { - inode.bi_depth = new_depth; - ret = __bch2_fsck_write_inode(trans, &inode) ?: - bch2_trans_commit(trans, NULL, NULL, 0); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path, - u32 snapshot, u32 new_bi_depth) -{ - u32 restart_count = trans->restart_count; - int ret = 0; - - darray_for_each_reverse(*path, i) { - ret = nested_lockrestart_do(trans, - bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth)); - bch_err_fn(trans->c, ret); - if (ret) - break; - - new_bi_depth++; - } - - return ret ?: trans_was_restarted(trans, restart_count); -} - -static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) -{ - struct bch_fs *c = trans->c; - struct btree_iter inode_iter = {}; - darray_u64 path = {}; - struct printbuf buf = PRINTBUF; - u32 snapshot = inode_k.k->p.snapshot; - bool redo_bi_depth = false; - u32 min_bi_depth = U32_MAX; - int ret = 0; - - struct bpos start = inode_k.k->p; - - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(inode_k, &inode); - if (ret) - return ret; - - /* - * If we're running full fsck, check_dirents() will have already run, - * and we shouldn't see any missing backpointers here - otherwise that's - * handled separately, by check_unreachable_inodes - */ - while (!inode.bi_subvol && - bch2_inode_has_backpointer(&inode)) { - struct btree_iter dirent_iter; - struct bkey_s_c_dirent d; - - d = dirent_get_by_pos(trans, &dirent_iter, - SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot)); - ret = bkey_err(d.s_c); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto out; - - if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) - bch2_trans_iter_exit(trans, &dirent_iter); - - if (bch2_err_matches(ret, ENOENT)) { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, inode_k); - bch_err(c, "unreachable inode in check_directory_structure: %s\n%s", - bch2_err_str(ret), buf.buf); - goto out; - } - - bch2_trans_iter_exit(trans, &dirent_iter); - - ret = darray_push(&path, inode.bi_inum); - if (ret) - return ret; - - bch2_trans_iter_exit(trans, &inode_iter); - inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, - SPOS(0, inode.bi_dir, snapshot), 0); - - struct bch_inode_unpacked parent_inode; - ret = bkey_err(inode_k) ?: - !bkey_is_inode(inode_k.k) ? 
-BCH_ERR_ENOENT_inode - : bch2_inode_unpack(inode_k, &parent_inode); - if (ret) { - /* Should have been caught in dirents pass */ - bch_err_msg(c, ret, "error looking up parent directory"); - goto out; - } - - min_bi_depth = parent_inode.bi_depth; - - if (parent_inode.bi_depth < inode.bi_depth && - min_bi_depth < U16_MAX) - break; - - inode = parent_inode; - redo_bi_depth = true; - - if (darray_find(path, inode.bi_inum)) { - printbuf_reset(&buf); - prt_printf(&buf, "directory structure loop in snapshot %u: ", - snapshot); - - ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf); - if (ret) - goto out; - - if (c->opts.verbose) { - prt_newline(&buf); - darray_for_each(path, i) - prt_printf(&buf, "%llu ", *i); - } - - if (fsck_err(trans, dir_loop, "%s", buf.buf)) { - ret = remove_backpointer(trans, &inode); - bch_err_msg(c, ret, "removing dirent"); - if (ret) - goto out; - - ret = reattach_inode(trans, &inode); - bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); - } - - goto out; - } - } - - if (inode.bi_subvol) - min_bi_depth = 0; - - if (redo_bi_depth) - ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth); -out: -fsck_err: - bch2_trans_iter_exit(trans, &inode_iter); - darray_exit(&path); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -/* - * Check for loops in the directory structure: all other connectivity issues - * have been fixed by prior passes - */ -int bch2_check_directory_structure(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_intent| - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - if (!S_ISDIR(bkey_inode_mode(k))) - continue; - - if (bch2_inode_flags(k) & BCH_INODE_unlinked) - continue; - - check_path_loop(trans, k); - }))); - - bch_err_fn(c, ret); - return ret; -} - -struct nlink_table { - size_t nr; - size_t size; - - struct nlink { - u64 inum; - u32 snapshot; - u32 count; - } *d; -}; - -static int add_nlink(struct bch_fs *c, struct nlink_table *t, - u64 inum, u32 snapshot) -{ - if (t->nr == t->size) { - size_t new_size = max_t(size_t, 128UL, t->size * 2); - void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); - - if (!d) { - bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", - new_size); - return bch_err_throw(c, ENOMEM_fsck_add_nlink); - } - - if (t->d) - memcpy(d, t->d, t->size * sizeof(t->d[0])); - kvfree(t->d); - - t->d = d; - t->size = new_size; - } - - - t->d[t->nr++] = (struct nlink) { - .inum = inum, - .snapshot = snapshot, - }; - - return 0; -} - -static int nlink_cmp(const void *_l, const void *_r) -{ - const struct nlink *l = _l; - const struct nlink *r = _r; - - return cmp_int(l->inum, r->inum); -} - -static void inc_link(struct bch_fs *c, struct snapshots_seen *s, - struct nlink_table *links, - u64 range_start, u64 range_end, u64 inum, u32 snapshot) -{ - struct nlink *link, key = { - .inum = inum, .snapshot = U32_MAX, - }; - - if (inum < range_start || inum >= range_end) - return; - - link = __inline_bsearch(&key, links->d, links->nr, - sizeof(links->d[0]), nlink_cmp); - if (!link) - return; - - while (link > links->d && link[0].inum == link[-1].inum) - --link; - - for (; link < links->d + links->nr && link->inum == inum; link++) - if (ref_visible(c, s, snapshot, link->snapshot)) { - link->count++; - if (link->snapshot >= snapshot) - break; - } -} - -noinline_for_stack -static int check_nlinks_find_hardlinks(struct 
bch_fs *c, - struct nlink_table *t, - u64 start, u64 *end) -{ - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_inodes, - POS(0, start), - BTREE_ITER_intent| - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - if (!bkey_is_inode(k.k)) - continue; - - /* Should never fail, checked by bch2_inode_invalid: */ - struct bch_inode_unpacked u; - _ret3 = bch2_inode_unpack(k, &u); - if (_ret3) - break; - - /* - * Backpointer and directory structure checks are sufficient for - * directories, since they can't have hardlinks: - */ - if (S_ISDIR(u.bi_mode)) - continue; - - /* - * Previous passes ensured that bi_nlink is nonzero if - * it had multiple hardlinks: - */ - if (!u.bi_nlink) - continue; - - ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); - if (ret) { - *end = k.k->p.offset; - ret = 0; - break; - } - 0; - }))); - - bch_err_fn(c, ret); - return ret; -} - -noinline_for_stack -static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, - u64 range_start, u64 range_end) -{ - struct snapshots_seen s; - - snapshots_seen_init(&s); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, - BTREE_ITER_intent| - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); - if (ret) - break; - - if (k.k->type == KEY_TYPE_dirent) { - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - - if (d.v->d_type != DT_DIR && - d.v->d_type != DT_SUBVOL) - inc_link(c, &s, links, range_start, range_end, - le64_to_cpu(d.v->d_inum), d.k->p.snapshot); - } - 0; - }))); - - snapshots_seen_exit(&s); - - bch_err_fn(c, ret); - return ret; -} - -static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - struct nlink_table *links, - size_t *idx, u64 range_end) -{ - struct bch_inode_unpacked u; - struct nlink *link = &links->d[*idx]; - int ret = 0; - - if (k.k->p.offset >= range_end) - return 1; - - if (!bkey_is_inode(k.k)) - return 0; - - ret = bch2_inode_unpack(k, &u); - if (ret) - return ret; - - if (S_ISDIR(u.bi_mode)) - return 0; - - if (!u.bi_nlink) - return 0; - - while ((cmp_int(link->inum, k.k->p.offset) ?: - cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { - BUG_ON(*idx == links->nr); - link = &links->d[++*idx]; - } - - if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, - trans, inode_wrong_nlink, - "inode %llu type %s has wrong i_nlink (%u, should be %u)", - u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], - bch2_inode_nlink_get(&u), link->count)) { - bch2_inode_nlink_set(&u, link->count); - ret = __bch2_fsck_write_inode(trans, &u); - } -fsck_err: - return ret; -} - -noinline_for_stack -static int check_nlinks_update_hardlinks(struct bch_fs *c, - struct nlink_table *links, - u64 range_start, u64 range_end) -{ - size_t idx = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, - POS(0, range_start), - BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); - if (ret < 0) { - bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); - return ret; - } - - return 0; -} - -int bch2_check_nlinks(struct bch_fs *c) -{ - struct nlink_table links = { 0 }; - u64 this_iter_range_start, next_iter_range_start = 0; - int ret = 0; - - do { - this_iter_range_start = next_iter_range_start; - next_iter_range_start = U64_MAX; - - ret = 
check_nlinks_find_hardlinks(c, &links, - this_iter_range_start, - &next_iter_range_start); - - ret = check_nlinks_walk_dirents(c, &links, - this_iter_range_start, - next_iter_range_start); - if (ret) - break; - - ret = check_nlinks_update_hardlinks(c, &links, - this_iter_range_start, - next_iter_range_start); - if (ret) - break; - - links.nr = 0; - } while (next_iter_range_start != U64_MAX); - - kvfree(links.d); - bch_err_fn(c, ret); - return ret; -} - -static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bkey_s_c_reflink_p p; - struct bkey_i_reflink_p *u; - - if (k.k->type != KEY_TYPE_reflink_p) - return 0; - - p = bkey_s_c_to_reflink_p(k); - - if (!p.v->front_pad && !p.v->back_pad) - return 0; - - u = bch2_trans_kmalloc(trans, sizeof(*u)); - int ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - bkey_reassemble(&u->k_i, k); - u->v.front_pad = 0; - u->v.back_pad = 0; - - return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); -} - -int bch2_fix_reflink_p(struct bch_fs *c) -{ - if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) - return 0; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_extents, POS_MIN, - BTREE_ITER_intent|BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - fix_reflink_p_key(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -#ifndef NO_BCACHEFS_CHARDEV - -struct fsck_thread { - struct thread_with_stdio thr; - struct bch_fs *c; - struct bch_opts opts; -}; - -static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) -{ - struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); - kfree(thr); -} - -static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) -{ - struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = thr->c; - - int ret = PTR_ERR_OR_ZERO(c); - if (ret) - return ret; - - ret = bch2_fs_start(thr->c); - if (ret) - goto err; - - if (test_bit(BCH_FS_errors_fixed, &c->flags)) { - bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); - ret |= 1; - } - if (test_bit(BCH_FS_error, &c->flags)) { - bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); - ret |= 4; - } -err: - bch2_fs_stop(c); - return ret; -} - -static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { - .exit = bch2_fsck_thread_exit, - .fn = bch2_fsck_offline_thread_fn, -}; - -long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) -{ - struct bch_ioctl_fsck_offline arg; - struct fsck_thread *thr = NULL; - darray_const_str devs = {}; - long ret = 0; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - for (size_t i = 0; i < arg.nr_devs; i++) { - u64 dev_u64; - ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); - if (ret) - goto err; - - char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); - ret = PTR_ERR_OR_ZERO(dev_str); - if (ret) - goto err; - - ret = darray_push(&devs, dev_str); - if (ret) { - kfree(dev_str); - goto err; - } - } - - thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) { - ret = -ENOMEM; - goto err; - } - - thr->opts = bch2_opts_empty(); - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - ret = PTR_ERR_OR_ZERO(optstr) ?: - 
bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false); - if (!IS_ERR(optstr)) - kfree(optstr); - - if (ret) - goto err; - } - - opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - opt_set(thr->opts, read_only, 1); - opt_set(thr->opts, ratelimit_errors, 0); - - /* We need request_key() to be called before we punt to kthread: */ - opt_set(thr->opts, nostart, true); - - bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); - - thr->c = bch2_fs_open(&devs, &thr->opts); - - if (!IS_ERR(thr->c) && - thr->c->opts.errors == BCH_ON_ERROR_panic) - thr->c->opts.errors = BCH_ON_ERROR_ro; - - ret = __bch2_run_thread_with_stdio(&thr->thr); -out: - darray_for_each(devs, i) - kfree(*i); - darray_exit(&devs); - return ret; -err: - if (thr) - bch2_fsck_thread_exit(&thr->thr); - pr_err("ret %s", bch2_err_str(ret)); - goto out; -} - -static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) -{ - struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = thr->c; - - c->stdio_filter = current; - c->stdio = &thr->thr.stdio; - - /* - * XXX: can we figure out a way to do this without mucking with c->opts? - */ - unsigned old_fix_errors = c->opts.fix_errors; - if (opt_defined(thr->opts, fix_errors)) - c->opts.fix_errors = thr->opts.fix_errors; - else - c->opts.fix_errors = FSCK_FIX_ask; - - c->opts.fsck = true; - set_bit(BCH_FS_in_fsck, &c->flags); - - int ret = bch2_run_online_recovery_passes(c, ~0ULL); - - clear_bit(BCH_FS_in_fsck, &c->flags); - bch_err_fn(c, ret); - - c->stdio = NULL; - c->stdio_filter = NULL; - c->opts.fix_errors = old_fix_errors; - - up(&c->recovery.run_lock); - bch2_ro_ref_put(c); - return ret; -} - -static const struct thread_with_stdio_ops bch2_online_fsck_ops = { - .exit = bch2_fsck_thread_exit, - .fn = bch2_fsck_online_thread_fn, -}; - -long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) -{ - struct fsck_thread *thr = NULL; - long ret = 0; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!bch2_ro_ref_tryget(c)) - return -EROFS; - - if (down_trylock(&c->recovery.run_lock)) { - bch2_ro_ref_put(c); - return -EAGAIN; - } - - thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) { - ret = -ENOMEM; - goto err; - } - - thr->c = c; - thr->opts = bch2_opts_empty(); - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - - ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false); - if (!IS_ERR(optstr)) - kfree(optstr); - - if (ret) - goto err; - } - - ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); -err: - if (ret < 0) { - bch_err_fn(c, ret); - if (thr) - bch2_fsck_thread_exit(&thr->thr); - up(&c->recovery.run_lock); - bch2_ro_ref_put(c); - } - return ret; -} - -#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h deleted file mode 100644 index e5fe7cf7b251..000000000000 --- a/fs/bcachefs/fsck.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FSCK_H -#define _BCACHEFS_FSCK_H - -#include "str_hash.h" - -/* records snapshot IDs of overwrites at @pos */ -struct snapshots_seen { - struct bpos pos; - snapshot_id_list ids; -}; - -int bch2_fsck_update_backpointers(struct btree_trans *, - struct snapshots_seen *, - const struct bch_hash_desc, - struct bch_hash_info *, - struct bkey_i *); - -int bch2_check_inodes(struct bch_fs *); -int bch2_check_extents(struct bch_fs *); -int 
bch2_check_indirect_extents(struct bch_fs *); -int bch2_check_dirents(struct bch_fs *); -int bch2_check_xattrs(struct bch_fs *); -int bch2_check_root(struct bch_fs *); -int bch2_check_subvolume_structure(struct bch_fs *); -int bch2_check_unreachable_inodes(struct bch_fs *); -int bch2_check_directory_structure(struct bch_fs *); -int bch2_check_nlinks(struct bch_fs *); -int bch2_fix_reflink_p(struct bch_fs *); - -long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *); -long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online); - -#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c deleted file mode 100644 index ef4cc7395b86..000000000000 --- a/fs/bcachefs/inode.c +++ /dev/null @@ -1,1566 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_key_cache.h" -#include "btree_write_buffer.h" -#include "bkey_methods.h" -#include "btree_update.h" -#include "buckets.h" -#include "compress.h" -#include "dirent.h" -#include "disk_accounting.h" -#include "error.h" -#include "extents.h" -#include "extent_update.h" -#include "fs.h" -#include "inode.h" -#include "namei.h" -#include "opts.h" -#include "str_hash.h" -#include "snapshot.h" -#include "subvolume.h" -#include "varint.h" - -#include <linux/random.h> - -#include <linux/unaligned.h> - -#define x(name, ...) #name, -const char * const bch2_inode_opts[] = { - BCH_INODE_OPTS() - NULL, -}; - -static const char * const bch2_inode_flag_strs[] = { - BCH_INODE_FLAGS() - NULL -}; -#undef x - -static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); -static int may_delete_deleted_inum(struct btree_trans *, subvol_inum); - -static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; - -static int inode_decode_field(const u8 *in, const u8 *end, - u64 out[2], unsigned *out_bits) -{ - __be64 be[2] = { 0, 0 }; - unsigned bytes, shift; - u8 *p; - - if (in >= end) - return -BCH_ERR_inode_unpack_error; - - if (!*in) - return -BCH_ERR_inode_unpack_error; - - /* - * position of highest set bit indicates number of bytes: - * shift = number of bits to remove in high byte: - */ - shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ - bytes = byte_table[shift - 1]; - - if (in + bytes > end) - return -BCH_ERR_inode_unpack_error; - - p = (u8 *) be + 16 - bytes; - memcpy(p, in, bytes); - *p ^= (1 << 8) >> shift; - - out[0] = be64_to_cpu(be[0]); - out[1] = be64_to_cpu(be[1]); - *out_bits = out[0] ? 
64 + fls64(out[0]) : fls64(out[1]); - - return bytes; -} - -static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) -{ - struct bkey_i_inode_v3 *k = &packed->inode; - u8 *out = k->v.fields; - u8 *end = (void *) &packed[1]; - u8 *last_nonzero_field = out; - unsigned nr_fields = 0, last_nonzero_fieldnr = 0; - unsigned bytes; - int ret; - - bkey_inode_v3_init(&packed->inode.k_i); - packed->inode.k.p.offset = inode->bi_inum; - packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); - packed->inode.v.bi_hash_seed = inode->bi_hash_seed; - packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); - packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors); - packed->inode.v.bi_size = cpu_to_le64(inode->bi_size); - packed->inode.v.bi_version = cpu_to_le64(inode->bi_version); - SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode); - SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR); - - -#define x(_name, _bits) \ - nr_fields++; \ - \ - if (inode->_name) { \ - ret = bch2_varint_encode_fast(out, inode->_name); \ - out += ret; \ - \ - if (_bits > 64) \ - *out++ = 0; \ - \ - last_nonzero_field = out; \ - last_nonzero_fieldnr = nr_fields; \ - } else { \ - *out++ = 0; \ - \ - if (_bits > 64) \ - *out++ = 0; \ - } - - BCH_INODE_FIELDS_v3() -#undef x - BUG_ON(out > end); - - out = last_nonzero_field; - nr_fields = last_nonzero_fieldnr; - - bytes = out - (u8 *) &packed->inode.v; - set_bkey_val_bytes(&packed->inode.k, bytes); - memset_u64s_tail(&packed->inode.v, 0, bytes); - - SET_INODEv3_NR_FIELDS(&k->v, nr_fields); - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - struct bch_inode_unpacked unpacked; - - ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked); - BUG_ON(ret); - BUG_ON(unpacked.bi_inum != inode->bi_inum); - BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); - BUG_ON(unpacked.bi_sectors != inode->bi_sectors); - BUG_ON(unpacked.bi_size != inode->bi_size); - BUG_ON(unpacked.bi_version != inode->bi_version); - BUG_ON(unpacked.bi_mode != inode->bi_mode); - -#define x(_name, _bits) if (unpacked._name != inode->_name) \ - panic("unpacked %llu should be %llu", \ - (u64) unpacked._name, (u64) inode->_name); - BCH_INODE_FIELDS_v3() -#undef x - } -} - -void bch2_inode_pack(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) -{ - bch2_inode_pack_inlined(packed, inode); -} - -static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, - struct bch_inode_unpacked *unpacked) -{ - const u8 *in = inode.v->fields; - const u8 *end = bkey_val_end(inode); - u64 field[2]; - unsigned fieldnr = 0, field_bits; - int ret; - -#define x(_name, _bits) \ - if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \ - unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ - memset((void *) unpacked + offset, 0, \ - sizeof(*unpacked) - offset); \ - return 0; \ - } \ - \ - ret = inode_decode_field(in, end, field, &field_bits); \ - if (ret < 0) \ - return ret; \ - \ - if (field_bits > sizeof(unpacked->_name) * 8) \ - return -BCH_ERR_inode_unpack_error; \ - \ - unpacked->_name = field[1]; \ - in += ret; - - BCH_INODE_FIELDS_v2() -#undef x - - /* XXX: signal if there were more fields than expected? 
*/ - return 0; -} - -static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, - const u8 *in, const u8 *end, - unsigned nr_fields) -{ - unsigned fieldnr = 0; - int ret; - u64 v[2]; - -#define x(_name, _bits) \ - if (fieldnr < nr_fields) { \ - ret = bch2_varint_decode_fast(in, end, &v[0]); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - \ - if (_bits > 64) { \ - ret = bch2_varint_decode_fast(in, end, &v[1]); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - } else { \ - v[1] = 0; \ - } \ - } else { \ - v[0] = v[1] = 0; \ - } \ - \ - unpacked->_name = v[0]; \ - if (v[1] || v[0] != unpacked->_name) \ - return -BCH_ERR_inode_unpack_error; \ - fieldnr++; - - BCH_INODE_FIELDS_v2() -#undef x - - /* XXX: signal if there were more fields than expected? */ - return 0; -} - -static int bch2_inode_unpack_v3(struct bkey_s_c k, - struct bch_inode_unpacked *unpacked) -{ - struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); - const u8 *in = inode.v->fields; - const u8 *end = bkey_val_end(inode); - unsigned nr_fields = INODEv3_NR_FIELDS(inode.v); - unsigned fieldnr = 0; - int ret; - u64 v[2]; - - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_snapshot = inode.k->p.snapshot; - unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); - unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors); - unpacked->bi_size = le64_to_cpu(inode.v->bi_size); - unpacked->bi_version = le64_to_cpu(inode.v->bi_version); - unpacked->bi_mode = INODEv3_MODE(inode.v); - -#define x(_name, _bits) \ - if (fieldnr < nr_fields) { \ - ret = bch2_varint_decode_fast(in, end, &v[0]); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - \ - if (_bits > 64) { \ - ret = bch2_varint_decode_fast(in, end, &v[1]); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - } else { \ - v[1] = 0; \ - } \ - } else { \ - v[0] = v[1] = 0; \ - } \ - \ - unpacked->_name = v[0]; \ - if (v[1] || v[0] != unpacked->_name) \ - return -BCH_ERR_inode_unpack_error; \ - fieldnr++; - - BCH_INODE_FIELDS_v3() -#undef x - - /* XXX: signal if there were more fields than expected? 
*/ - return 0; -} - -static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, - struct bch_inode_unpacked *unpacked) -{ - memset(unpacked, 0, sizeof(*unpacked)); - - switch (k.k->type) { - case KEY_TYPE_inode: { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_snapshot = inode.k->p.snapshot; - unpacked->bi_journal_seq= 0; - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); - unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - - if (INODEv1_NEW_VARINT(inode.v)) { - return bch2_inode_unpack_v2(unpacked, inode.v->fields, - bkey_val_end(inode), - INODEv1_NR_FIELDS(inode.v)); - } else { - return bch2_inode_unpack_v1(inode, unpacked); - } - break; - } - case KEY_TYPE_inode_v2: { - struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_snapshot = inode.k->p.snapshot; - unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); - unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - - return bch2_inode_unpack_v2(unpacked, inode.v->fields, - bkey_val_end(inode), - INODEv2_NR_FIELDS(inode.v)); - } - default: - BUG(); - } -} - -int bch2_inode_unpack(struct bkey_s_c k, - struct bch_inode_unpacked *unpacked) -{ - return likely(k.k->type == KEY_TYPE_inode_v3) - ? bch2_inode_unpack_v3(k, unpacked) - : bch2_inode_unpack_slowpath(k, unpacked); -} - -int __bch2_inode_peek(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - subvol_inum inum, unsigned flags, - bool warn) -{ - u32 snapshot; - int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn); - if (ret) - return ret; - - struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), - flags|BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) - return ret; - - ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; - if (ret) - goto err; - - ret = bch2_inode_unpack(k, inode); - if (ret) - goto err; - - return 0; -err: - if (warn) - bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum); - bch2_trans_iter_exit(trans, iter); - return ret; -} - -int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans, - u64 inode_nr, u32 snapshot, - struct bch_inode_unpacked *inode, - unsigned flags) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inode_nr, snapshot), flags); - int ret = bkey_err(k); - if (ret) - goto err; - - ret = bkey_is_inode(k.k) - ? 
bch2_inode_unpack(k, inode) - : -BCH_ERR_ENOENT_inode; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); -} - -int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, - struct bch_inode_unpacked *root) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) { - ret = bch2_inode_unpack(k, root); - goto out; - } - } - /* We're only called when we know we have an inode for @inum */ - BUG_ON(!ret); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_write_flags(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_inode_buf *inode_p; - - inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack_inlined(inode_p, inode); - inode_p->inode.k.p.snapshot = iter->snapshot; - return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); -} - -int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) -{ - struct bkey_inode_buf *inode_p = - bch2_trans_kmalloc(trans, sizeof(*inode_p)); - - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack(inode_p, inode); - inode_p->inode.k.p.snapshot = inode->bi_snapshot; - - return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, - &inode_p->inode.k_i, - BTREE_UPDATE_internal_snapshot_node); -} - -int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) -{ - int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_fsck_write_inode(trans, inode)); - bch_err_fn(trans->c, ret); - return ret; -} - -struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) -{ - struct bch_inode_unpacked u; - struct bkey_inode_buf *inode_p; - int ret; - - if (!bkey_is_inode(&k->k)) - return ERR_PTR(-ENOENT); - - inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); - if (IS_ERR(inode_p)) - return ERR_CAST(inode_p); - - ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u); - if (ret) - return ERR_PTR(ret); - - bch2_inode_pack(inode_p, &u); - return &inode_p->inode.k_i; -} - -static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bch_inode_unpacked unpacked; - int ret = 0; - - bkey_fsck_err_on(k.k->p.inode, - c, inode_pos_inode_nonzero, - "nonzero k.p.inode"); - - bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, - c, inode_pos_blockdev_range, - "fs inode in blockdev range"); - - bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), - c, 
inode_unpack_error, - "invalid variable length fields"); - - bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, - c, inode_checksum_type_invalid, - "invalid data checksum type (%u >= %u)", - unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); - - bkey_fsck_err_on(unpacked.bi_compression && - !bch2_compression_opt_valid(unpacked.bi_compression - 1), - c, inode_compression_type_invalid, - "invalid compression opt %u", unpacked.bi_compression - 1); - - bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && - unpacked.bi_nlink != 0, - c, inode_unlinked_but_nlink_nonzero, - "flagged as unlinked but bi_nlink != 0"); - - bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), - c, inode_subvol_root_but_not_dir, - "subvolume root but not a directory"); -fsck_err: - return ret; -} - -int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - int ret = 0; - - bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR, - c, inode_str_hash_invalid, - "invalid str hash type (%llu >= %u)", - INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR); - - ret = __bch2_inode_validate(c, k, from); -fsck_err: - return ret; -} - -int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - int ret = 0; - - bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, - c, inode_str_hash_invalid, - "invalid str hash type (%llu >= %u)", - INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - - ret = __bch2_inode_validate(c, k, from); -fsck_err: - return ret; -} - -int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); - int ret = 0; - - bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || - INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), - c, inode_v3_fields_start_bad, - "invalid fields_start (got %llu, min %u max %zu)", - INODEv3_FIELDS_START(inode.v), - INODEv3_FIELDS_START_INITIAL, - bkey_val_u64s(inode.k)); - - bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, - c, inode_str_hash_invalid, - "invalid str hash type (%llu >= %u)", - INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - - ret = __bch2_inode_validate(c, k, from); -fsck_err: - return ret; -} - -static void __bch2_inode_unpacked_to_text(struct printbuf *out, - struct bch_inode_unpacked *inode) -{ - prt_printf(out, "\n"); - printbuf_indent_add(out, 2); - prt_printf(out, "mode=%o\n", inode->bi_mode); - - prt_str(out, "flags="); - prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); - prt_printf(out, "(%x)\n", inode->bi_flags); - - prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); - prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed); - prt_printf(out, "hash_type="); - bch2_prt_str_hash_type(out, INODE_STR_HASH(inode)); - prt_newline(out); - prt_printf(out, "bi_size=%llu\n", inode->bi_size); - prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors); - prt_printf(out, "bi_version=%llu\n", inode->bi_version); - -#define x(_name, _bits) \ - prt_printf(out, #_name "=%llu\n", (u64) inode->_name); - BCH_INODE_FIELDS_v3() -#undef x - - bch2_printbuf_strip_trailing_newline(out); - printbuf_indent_sub(out, 2); -} - -void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) -{ - prt_printf(out, "inum: %llu:%u ", inode->bi_inum, 
inode->bi_snapshot); - __bch2_inode_unpacked_to_text(out, inode); -} - -void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bch_inode_unpacked inode; - - if (bch2_inode_unpack(k, &inode)) { - prt_printf(out, "(unpack error)"); - return; - } - - __bch2_inode_unpacked_to_text(out, &inode); -} - -static inline u64 bkey_inode_flags(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); - case KEY_TYPE_inode_v2: - return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); - case KEY_TYPE_inode_v3: - return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); - default: - return 0; - } -} - -static inline void bkey_inode_flags_set(struct bkey_s k, u64 f) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f); - return; - case KEY_TYPE_inode_v2: - bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f); - return; - case KEY_TYPE_inode_v3: - bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f); - return; - default: - BUG(); - } -} - -static inline bool bkey_is_unlinked_inode(struct bkey_s_c k) -{ - unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked; - - return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot); -} - -static struct bkey_s_c -bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, - enum btree_id btree, struct bpos pos, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_max_norestart(trans, *iter, btree, - bpos_successor(pos), - SPOS(pos.inode, pos.offset, U32_MAX), - flags|BTREE_ITER_all_snapshots, k, ret) - if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot)) - return k; - - bch2_trans_iter_exit(trans, iter); - return ret ? bkey_s_c_err(ret) : bkey_s_c_null; -} - -static struct bkey_s_c -bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos, unsigned flags) -{ - struct bkey_s_c k; -again: - k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags); - if (!k.k || - bkey_err(k) || - bkey_is_inode(k.k)) - return k; - - bch2_trans_iter_exit(trans, iter); - pos = k.k->p; - goto again; -} - -int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_max_norestart(trans, iter, - BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), - BTREE_ITER_all_snapshots| - BTREE_ITER_with_updates, k, ret) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) && - bkey_is_inode(k.k)) { - ret = 1; - break; - } - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int update_inode_has_children(struct btree_trans *trans, - struct bkey_s k, - bool have_child) -{ - if (!have_child) { - int ret = bch2_inode_has_child_snapshots(trans, k.k->p); - if (ret) - return ret < 0 ? 
ret : 0; - } - - u64 f = bkey_inode_flags(k.s_c); - if (have_child != !!(f & BCH_INODE_has_child_snapshot)) - bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot); - - return 0; -} - -static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos, - bool have_child) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans, - &iter, pos, BTREE_ITER_with_updates); - int ret = bkey_err(k); - if (ret) - return ret; - if (!k.k) - return 0; - - if (!have_child) { - ret = bch2_inode_has_child_snapshots(trans, k.k->p); - if (ret) { - ret = ret < 0 ? ret : 0; - goto err; - } - } - - u64 f = bkey_inode_flags(k); - if (have_child != !!(f & BCH_INODE_has_child_snapshot)) { - struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k, - BTREE_UPDATE_internal_snapshot_node); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_trigger_inode(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - - if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { - BUG_ON(!trans->journal_res.seq); - bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); - } - - s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) }; - if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) { - int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes); - if (ret) - return ret; - } - - if (flags & BTREE_TRIGGER_transactional) { - int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) - - (int) bkey_is_unlinked_inode(old); - if (unlinked_delta) { - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, - new.k->p, unlinked_delta > 0); - if (ret) - return ret; - } - - /* - * If we're creating or deleting an inode at this snapshot ID, - * and there might be an inode in a parent snapshot ID, we might - * need to set or clear the has_child_snapshot flag on the - * parent. 
- */ - int deleted_delta = (int) bkey_is_inode(new.k) - - (int) bkey_is_inode(old.k); - if (deleted_delta && - bch2_snapshot_parent(c, new.k->p.snapshot)) { - int ret = update_parent_inode_has_children(trans, new.k->p, - deleted_delta > 0); - if (ret) - return ret; - } - - /* - * When an inode is first updated in a new snapshot, we may need - * to clear has_child_snapshot - */ - if (deleted_delta > 0) { - int ret = update_inode_has_children(trans, new, false); - if (ret) - return ret; - } - } - - return 0; -} - -int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(k.k->p.inode, - c, inode_pos_inode_nonzero, - "nonzero k.p.inode"); -fsck_err: - return ret; -} - -void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); - - prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); -} - -int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors, - c, inode_alloc_cursor_inode_bad, - "k.p.inode bad"); -fsck_err: - return ret; -} - -void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k); - - prt_printf(out, "idx %llu generation %llu", - le64_to_cpu(i.v->idx), - le64_to_cpu(i.v->gen)); -} - -void bch2_inode_init_early(struct bch_fs *c, - struct bch_inode_unpacked *inode_u) -{ - enum bch_str_hash_type str_hash = - bch2_str_hash_opt_to_type(c, c->opts.str_hash); - - memset(inode_u, 0, sizeof(*inode_u)); - - SET_INODE_STR_HASH(inode_u, str_hash); - get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); -} - -void bch2_inode_init_late(struct bch_fs *c, - struct bch_inode_unpacked *inode_u, u64 now, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev, - struct bch_inode_unpacked *parent) -{ - inode_u->bi_mode = mode; - inode_u->bi_uid = uid; - inode_u->bi_gid = gid; - inode_u->bi_dev = rdev; - inode_u->bi_atime = now; - inode_u->bi_mtime = now; - inode_u->bi_ctime = now; - inode_u->bi_otime = now; - - if (parent && parent->bi_mode & S_ISGID) { - inode_u->bi_gid = parent->bi_gid; - if (S_ISDIR(mode)) - inode_u->bi_mode |= S_ISGID; - } - - if (parent) { -#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; - BCH_INODE_OPTS() -#undef x - } - - if (!S_ISDIR(mode)) - inode_u->bi_casefold = 0; - - if (bch2_inode_casefold(c, inode_u)) - inode_u->bi_flags |= BCH_INODE_has_case_insensitive; -} - -void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev, - struct bch_inode_unpacked *parent) -{ - bch2_inode_init_early(c, inode_u); - bch2_inode_init_late(c, inode_u, bch2_current_time(c), - uid, gid, mode, rdev, parent); -} - -static struct bkey_i_inode_alloc_cursor * -bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) -{ - struct bch_fs *c = trans->c; - - u64 cursor_idx = c->opts.inodes_32bit ? 
0 : cpu + 1; - - cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_logged_ops, - POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), - BTREE_ITER_cached); - int ret = bkey_err(k); - if (ret) - return ERR_PTR(ret); - - struct bkey_i_inode_alloc_cursor *cursor = - k.k->type == KEY_TYPE_inode_alloc_cursor - ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) - : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); - ret = PTR_ERR_OR_ZERO(cursor); - if (ret) - goto err; - - if (c->opts.inodes_32bit) { - *min = BLOCKDEV_INODE_MAX; - *max = INT_MAX; - } else { - cursor->v.bits = c->opts.shard_inode_numbers_bits; - - unsigned bits = 63 - c->opts.shard_inode_numbers_bits; - - *min = max(cpu << bits, (u64) INT_MAX + 1); - *max = (cpu << bits) | ~(ULLONG_MAX << bits); - } - - if (le64_to_cpu(cursor->v.idx) < *min) - cursor->v.idx = cpu_to_le64(*min); - - if (le64_to_cpu(cursor->v.idx) >= *max) { - cursor->v.idx = cpu_to_le64(*min); - le32_add_cpu(&cursor->v.gen, 1); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret ? ERR_PTR(ret) : cursor; -} - -/* - * This just finds an empty slot: - */ -int bch2_inode_create(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode_u, - u32 snapshot, u64 cpu) -{ - u64 min, max; - struct bkey_i_inode_alloc_cursor *cursor = - bch2_inode_alloc_cursor_get(trans, cpu, &min, &max); - int ret = PTR_ERR_OR_ZERO(cursor); - if (ret) - return ret; - - u64 start = le64_to_cpu(cursor->v.idx); - u64 pos = start; - - bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), - BTREE_ITER_all_snapshots| - BTREE_ITER_intent); - struct bkey_s_c k; -again: - while ((k = bch2_btree_iter_peek(trans, iter)).k && - !(ret = bkey_err(k)) && - bkey_lt(k.k->p, POS(0, max))) { - if (pos < iter->pos.offset) - goto found_slot; - - /* - * We don't need to iterate over keys in every snapshot once - * we've found just one: - */ - pos = iter->pos.offset + 1; - bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); - } - - if (!ret && pos < max) - goto found_slot; - - if (!ret && start == min) - ret = bch_err_throw(trans->c, ENOSPC_inode_create); - - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ret; - } - - /* Retry from start */ - pos = start = min; - bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); - le32_add_cpu(&cursor->v.gen, 1); - goto again; -found_slot: - bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot)); - k = bch2_btree_iter_peek_slot(trans, iter); - ret = bkey_err(k); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ret; - } - - inode_u->bi_inum = k.k->p.offset; - inode_u->bi_generation = le64_to_cpu(cursor->v.gen); - cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); - return 0; -} - -static int bch2_inode_delete_keys(struct btree_trans *trans, - subvol_inum inum, enum btree_id id) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_i delete; - struct bpos end = POS(inum.inum, U64_MAX); - u32 snapshot; - int ret = 0; - - /* - * We're never going to be deleting partial extents, no need to use an - * extent iterator: - */ - bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_intent); - - while (1) { - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - - k = bch2_btree_iter_peek_max(trans, &iter, end); - ret = bkey_err(k); - if (ret) - goto err; 
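- /* a NULL key from peek_max means the range is exhausted and we're done */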
- - if (!k.k) - break; - - bkey_init(&delete.k); - delete.k.p = iter.pos; - - if (iter.flags & BTREE_ITER_is_extents) - bch2_key_resize(&delete.k, - bpos_min(end, k.k->p).offset - - iter.pos.offset); - - ret = bch2_trans_update(trans, &iter, &delete, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); -err: - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - break; - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = {}; - struct bkey_s_c k; - u32 snapshot; - int ret; - - ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum)); - if (ret) - goto err2; - - /* - * If this was a directory, there shouldn't be any real dirents left - - * but there could be whiteouts (from hash collisions) that we should - * delete: - * - * XXX: the dirent code ideally would delete whiteouts when they're no - * longer needed - */ - ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: - bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: - bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); - if (ret) - goto err2; -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), - BTREE_ITER_intent|BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) - goto err; - - if (!bkey_is_inode(k.k)) { - bch2_fs_inconsistent(c, - "inode %llu:%u not found when deleting", - inum.inum, snapshot); - ret = bch_err_throw(c, ENOENT_inode); - goto err; - } - - ret = bch2_btree_delete_at(trans, &iter, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); -err: - bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (ret) - goto err2; - - ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot)); -err2: - bch2_trans_put(trans); - return ret; -} - -int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) -{ - if (bi->bi_flags & BCH_INODE_unlinked) - bi->bi_flags &= ~BCH_INODE_unlinked; - else { - if (bi->bi_nlink == U32_MAX) - return -EINVAL; - - bi->bi_nlink++; - } - - return 0; -} - -void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) -{ - if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) { - bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", - bi->bi_inum); - return; - } - - if (bi->bi_flags & BCH_INODE_unlinked) { - bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); - return; - } - - if (bi->bi_nlink) - bi->bi_nlink--; - else - bi->bi_flags |= BCH_INODE_unlinked; -} - -struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) -{ - struct bch_opts ret = { 0 }; -#define x(_name, _bits) \ - if (inode->bi_##_name) \ - opt_set(ret, _name, inode->bi_##_name - 1); - BCH_INODE_OPTS() -#undef x - return ret; -} - -void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, - struct bch_inode_unpacked *inode) -{ -#define x(_name, _bits) \ - if ((inode)->bi_##_name) { \ - opts->_name = inode->bi_##_name - 1; \ - opts->_name##_from_inode = true; \ - } else { \ - opts->_name = c->opts._name; \ - opts->_name##_from_inode = false; \ - } - BCH_INODE_OPTS() -#undef x - - bch2_io_opts_fixups(opts); -} - -int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct 
bch_io_opts *opts) -{ - struct bch_inode_unpacked inode; - int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode)); - - if (ret) - return ret; - - bch2_inode_opts_get(opts, trans->c, &inode); - return 0; -} - -int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *bi, unsigned v) -{ - struct bch_fs *c = trans->c; - -#ifndef CONFIG_UNICODE - bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); - return -EOPNOTSUPP; -#endif - - if (c->opts.casefold_disabled) - return -EOPNOTSUPP; - - int ret = 0; - /* Not supported on individual files. */ - if (!S_ISDIR(bi->bi_mode)) - return -EOPNOTSUPP; - - /* - * Make sure the dir is empty, as otherwise we'd need to - * rehash everything and update the dirent keys. - */ - ret = bch2_empty_dir_trans(trans, inum); - if (ret < 0) - return ret; - - ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding); - if (ret) - return ret; - - bch2_check_set_feature(c, BCH_FEATURE_casefolding); - - bi->bi_casefold = v + 1; - bi->bi_fields_set |= BIT(Inode_opt_casefold); - - return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi); -} - -static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter = {}; - struct bkey_i_inode_generation delete; - struct bch_inode_unpacked inode_u; - struct bkey_s_c k; - int ret; - - do { - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: - bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: - bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL); - } while (ret == -BCH_ERR_transaction_restart_nested); - if (ret) - goto err; -retry: - bch2_trans_begin(trans); - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), BTREE_ITER_intent); - ret = bkey_err(k); - if (ret) - goto err; - - if (!bkey_is_inode(k.k)) { - bch2_fs_inconsistent(c, - "inode %llu:%u not found when deleting", - inum, snapshot); - ret = bch_err_throw(c, ENOENT_inode); - goto err; - } - - bch2_inode_unpack(k, &inode_u); - - /* Subvolume root? 
*/ - if (inode_u.bi_subvol) - bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); - - bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter.pos; - delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - - ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); -err: - bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - return ret ?: -BCH_ERR_transaction_restart_nested; -} - -/* - * After deleting an inode, there may be versions in older snapshots that should - * also be deleted - if they're not referenced by sibling snapshots and not open - * in other subvolumes: - */ -static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; -next_parent: - ret = lockrestart_do(trans, - bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0))); - if (ret || !k.k) - return ret; - - bool unlinked = bkey_is_unlinked_inode(k); - pos = k.k->p; - bch2_trans_iter_exit(trans, &iter); - - if (!unlinked) - return 0; - - ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos)); - if (ret) - return ret < 0 ? ret : 0; - - ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot); - if (ret) - return ret; - goto next_parent; -} - -int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) -{ - return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?: - delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); -} - -static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, - bool from_deleted_inodes) -{ - struct bch_fs *c = trans->c; - struct btree_iter inode_iter; - struct bkey_s_c k; - struct bch_inode_unpacked inode; - struct printbuf buf = PRINTBUF; - int ret; - - k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) - return ret; - - ret = bkey_is_inode(k.k) ? 0 : bch_err_throw(c, ENOENT_inode); - if (fsck_err_on(from_deleted_inodes && ret, - trans, deleted_inode_missing, - "nonexistent inode %llu:%u in deleted_inodes btree", - pos.offset, pos.snapshot)) - goto delete; - if (ret) - goto out; - - ret = bch2_inode_unpack(k, &inode); - if (ret) - goto out; - - if (S_ISDIR(inode.bi_mode)) { - ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); - if (fsck_err_on(from_deleted_inodes && - bch2_err_matches(ret, ENOTEMPTY), - trans, deleted_inode_is_dir, - "non empty directory %llu:%u in deleted_inodes btree", - pos.offset, pos.snapshot)) - goto delete; - if (ret) - goto out; - } - - ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked); - if (fsck_err_on(from_deleted_inodes && ret, - trans, deleted_inode_not_unlinked, - "non-deleted inode %llu:%u in deleted_inodes btree", - pos.offset, pos.snapshot)) - goto delete; - if (ret) - goto out; - - ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot) - ? 
0 : bch_err_throw(c, inode_has_child_snapshot); - - if (fsck_err_on(from_deleted_inodes && ret, - trans, deleted_inode_has_child_snapshots, - "inode with child snapshots %llu:%u in deleted_inodes btree", - pos.offset, pos.snapshot)) - goto delete; - if (ret) - goto out; - - ret = bch2_inode_has_child_snapshots(trans, k.k->p); - if (ret < 0) - goto out; - - if (ret) { - if (fsck_err(trans, inode_has_child_snapshots_wrong, - "inode has_child_snapshots flag wrong (should be set)\n%s", - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &inode), - buf.buf))) { - inode.bi_flags |= BCH_INODE_has_child_snapshot; - ret = __bch2_fsck_write_inode(trans, &inode); - if (ret) - goto out; - } - - if (!from_deleted_inodes) { - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - bch_err_throw(c, inode_has_child_snapshot); - goto out; - } - - goto delete; - - } - - if (from_deleted_inodes) { - if (test_bit(BCH_FS_clean_recovery, &c->flags) && - !fsck_err(trans, deleted_inode_but_clean, - "filesystem marked as clean but have deleted inode %llu:%u", - pos.offset, pos.snapshot)) { - ret = 0; - goto out; - } - - ret = 1; - } -out: -fsck_err: - bch2_trans_iter_exit(trans, &inode_iter); - printbuf_exit(&buf); - return ret; -delete: - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); - goto out; -} - -static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum) -{ - u32 snapshot; - - return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: - may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false); -} - -int bch2_delete_dead_inodes(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - int ret; - - /* - * if we ran check_inodes() unlinked inodes will have already been - * cleaned up but the write buffer will be out of sync; therefore we - * always need a write buffer flush - */ - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - - /* - * Weird transaction restart handling here because on successful delete, - * bch2_inode_rm_snapshot() will return a nested transaction restart, - * but we can't retry because the btree write buffer won't have been - * flushed and we'd spin: - */ - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - ret = may_delete_deleted_inode(trans, k.k->p, true); - if (ret > 0) { - bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", - k.k->p.offset, k.k->p.snapshot); - - ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); - /* - * We don't want to loop here: a transaction restart - * error here means we handled a transaction restart and - * we're actually done, but if we loop we'll retry the - * same key because the write buffer hasn't been flushed - * yet - */ - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; - } - } - - ret; - })); -err: - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h deleted file mode 100644 index b8ec3e628d90..000000000000 --- a/fs/bcachefs/inode.h +++ /dev/null @@ -1,319 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_INODE_H -#define _BCACHEFS_INODE_H - -#include "bkey.h" -#include "bkey_methods.h" -#include "opts.h" -#include "snapshot.h" - -extern const char * const bch2_inode_opts[]; - -int bch2_inode_validate(struct bch_fs *, struct bkey_s_c, - struct 
bkey_validate_context); -int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); - -static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) -{ - return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0 - ? __bch2_inode_has_child_snapshots(trans, pos) - : 0; -} - -int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_inode ((struct bkey_ops) { \ - .key_validate = bch2_inode_validate, \ - .val_to_text = bch2_inode_to_text, \ - .trigger = bch2_trigger_inode, \ - .min_val_size = 16, \ -}) - -#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ - .key_validate = bch2_inode_v2_validate, \ - .val_to_text = bch2_inode_to_text, \ - .trigger = bch2_trigger_inode, \ - .min_val_size = 32, \ -}) - -#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ - .key_validate = bch2_inode_v3_validate, \ - .val_to_text = bch2_inode_to_text, \ - .trigger = bch2_trigger_inode, \ - .min_val_size = 48, \ -}) - -static inline bool bkey_is_inode(const struct bkey *k) -{ - return k->type == KEY_TYPE_inode || - k->type == KEY_TYPE_inode_v2 || - k->type == KEY_TYPE_inode_v3; -} - -int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ - .key_validate = bch2_inode_generation_validate, \ - .val_to_text = bch2_inode_generation_to_text, \ - .min_val_size = 8, \ -}) - -int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \ - .key_validate = bch2_inode_alloc_cursor_validate, \ - .val_to_text = bch2_inode_alloc_cursor_to_text, \ - .min_val_size = 16, \ -}) - -#if 0 -typedef struct { - u64 lo; - u32 hi; -} __packed __aligned(4) u96; -#endif -typedef u64 u96; - -struct bch_inode_unpacked { - u64 bi_inum; - u32 bi_snapshot; - u64 bi_journal_seq; - __le64 bi_hash_seed; - u64 bi_size; - u64 bi_sectors; - u64 bi_version; - u32 bi_flags; - u16 bi_mode; - -#define x(_name, _bits) u##_bits _name; - BCH_INODE_FIELDS_v3() -#undef x -}; -BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24); - -struct bkey_inode_buf { - struct bkey_i_inode_v3 inode; - -#define x(_name, _bits) + 8 + _bits / 8 - u8 _pad[0 + BCH_INODE_FIELDS_v3()]; -#undef x -}; - -void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); -int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); -struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); - -void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); - -int __bch2_inode_peek(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, subvol_inum, unsigned, bool); - -static inline int bch2_inode_peek_nowarn(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - subvol_inum inum, unsigned flags) -{ - return __bch2_inode_peek(trans, iter, 
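/*
 * BCH_INODE_FIELDS_v3() above is an "x-macro": a single field list that each
 * site expands with its own definition of x() - once to declare the members
 * of bch_inode_unpacked, once to size the pack buffer's pad. A standalone
 * sketch of the technique, with a hypothetical FIELDS() list:
 */

#include <stdio.h>
#include <stdint.h>

#define FIELDS()	\
	x(atime, 64)	\
	x(uid,   32)	\
	x(mode,  16)

struct unpacked {
/* expansion 1: declare one sized member per list entry */
#define x(_name, _bits)	uint##_bits##_t _name;
	FIELDS()
#undef x
};

int main(void)
{
/* expansion 2: reuse the same list to print every field */
#define x(_name, _bits)	printf("%-8s %d bits\n", #_name, _bits);
	FIELDS()
#undef x
	return 0;
}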
inode, inum, flags, false); -} - -static inline int bch2_inode_peek(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - subvol_inum inum, unsigned flags) -{ - return __bch2_inode_peek(trans, iter, inode, inum, flags, true); -} - -int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32, - struct bch_inode_unpacked *, unsigned); -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, - subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, - struct bch_inode_unpacked *); - -int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, - struct bch_inode_unpacked *root); - -int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags); - -static inline int bch2_inode_write(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode) -{ - return bch2_inode_write_flags(trans, iter, inode, 0); -} - -int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); -int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); - -void bch2_inode_init_early(struct bch_fs *, - struct bch_inode_unpacked *); -void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64, - uid_t, gid_t, umode_t, dev_t, - struct bch_inode_unpacked *); -void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, - uid_t, gid_t, umode_t, dev_t, - struct bch_inode_unpacked *); - -int bch2_inode_create(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, u32, u64); - -int bch2_inode_rm(struct bch_fs *, subvol_inum); - -#define inode_opt_get(_c, _inode, _name) \ - ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) - -static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, - enum inode_opt_id id, u64 v) -{ - switch (id) { -#define x(_name, ...) \ - case Inode_opt_##_name: \ - inode->bi_##_name = v; \ - break; - BCH_INODE_OPTS() -#undef x - default: - BUG(); - } -} - -static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, - enum inode_opt_id id) -{ - switch (id) { -#define x(_name, ...) \ - case Inode_opt_##_name: \ - return inode->bi_##_name; - BCH_INODE_OPTS() -#undef x - default: - BUG(); - } -} - -static inline u8 mode_to_type(umode_t mode) -{ - return (mode >> 12) & 15; -} - -static inline u8 inode_d_type(struct bch_inode_unpacked *inode) -{ - return inode->bi_subvol ? 
DT_SUBVOL : mode_to_type(inode->bi_mode); -} - -static inline u32 bch2_inode_flags(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); - case KEY_TYPE_inode_v2: - return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); - case KEY_TYPE_inode_v3: - return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); - default: - return 0; - } -} - -static inline unsigned bkey_inode_mode(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode); - case KEY_TYPE_inode_v2: - return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode); - case KEY_TYPE_inode_v3: - return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v); - default: - return 0; - } -} - -static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi) -{ - /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */ - return bi->bi_casefold - ? bi->bi_casefold - 1 - : c->opts.casefold; -} - -static inline bool bch2_inode_has_backpointer(const struct bch_inode_unpacked *bi) -{ - return bi->bi_dir || bi->bi_dir_offset; -} - -/* i_nlink: */ - -static inline unsigned nlink_bias(umode_t mode) -{ - return S_ISDIR(mode) ? 2 : 1; -} - -static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) -{ - return bi->bi_flags & BCH_INODE_unlinked - ? 0 - : bi->bi_nlink + nlink_bias(bi->bi_mode); -} - -static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, - unsigned nlink) -{ - if (nlink) { - bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); - bi->bi_flags &= ~BCH_INODE_unlinked; - } else { - bi->bi_nlink = 0; - bi->bi_flags |= BCH_INODE_unlinked; - } -} - -int bch2_inode_nlink_inc(struct bch_inode_unpacked *); -void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); - -struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); -void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, - struct bch_inode_unpacked *); -int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *); -int bch2_inode_set_casefold(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, unsigned); - -#include "rebalance.h" - -static inline struct bch_extent_rebalance -bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) -{ - struct bch_io_opts io_opts; - bch2_inode_opts_get(&io_opts, c, inode); - return io_opts_to_rebalance_opts(c, &io_opts); -} - -#define BCACHEFS_ROOT_SUBVOL_INUM \ - ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) - -static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b) -{ - return a.subvol == b.subvol && a.inum == b.inum; -} - -int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); -int bch2_delete_dead_inodes(struct bch_fs *); - -#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h deleted file mode 100644 index 1f00938b1bdc..000000000000 --- a/fs/bcachefs/inode_format.h +++ /dev/null @@ -1,185 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_INODE_FORMAT_H -#define _BCACHEFS_INODE_FORMAT_H - -#define BLOCKDEV_INODE_MAX 4096 -#define BCACHEFS_ROOT_INO 4096 - -struct bch_inode { - struct bch_val v; - - __le64 bi_hash_seed; - __le32 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct bch_inode_v2 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct 
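/*
 * bch2_inode_nlink_get()/_set() above store the link count minus a
 * mode-dependent bias: a fresh directory (VFS nlink 2, for "." and the
 * parent's entry) and a fresh regular file (nlink 1) are both encoded as
 * bi_nlink == 0, and nlink 0 is expressed via the unlinked flag. A
 * standalone sketch of the same round-trip:
 */

#include <assert.h>
#include <stdbool.h>
#include <sys/stat.h>

struct mini_inode { unsigned nlink; bool unlinked; unsigned mode; };

static unsigned bias(unsigned mode) { return S_ISDIR(mode) ? 2 : 1; }

static void set_nlink(struct mini_inode *i, unsigned nlink)
{
	i->unlinked = !nlink;
	i->nlink = nlink ? nlink - bias(i->mode) : 0;
}

static unsigned get_nlink(const struct mini_inode *i)
{
	return i->unlinked ? 0 : i->nlink + bias(i->mode);
}

int main(void)
{
	struct mini_inode dir = { .mode = S_IFDIR };

	set_nlink(&dir, 2);		/* empty directory */
	assert(dir.nlink == 0);		/* stored with the bias removed */
	assert(get_nlink(&dir) == 2);	/* reported with it added back */
	return 0;
}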
bch_inode_v3 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le64 bi_sectors; - __le64 bi_size; - __le64 bi_version; - __u8 fields[]; -} __packed __aligned(8); - -#define INODEv3_FIELDS_START_INITIAL 6 -#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) - -struct bch_inode_generation { - struct bch_val v; - - __le32 bi_generation; - __le32 pad; -} __packed __aligned(8); - -/* - * bi_subvol and bi_parent_subvol are only set for subvolume roots: - */ - -#define BCH_INODE_FIELDS_v2() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_size, 64) \ - x(bi_sectors, 64) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) - -#define BCH_INODE_FIELDS_v3() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) \ - x(bi_nocow, 8) \ - x(bi_depth, 32) \ - x(bi_inodes_32bit, 8) \ - x(bi_casefold, 8) - -/* subset of BCH_INODE_FIELDS */ -#define BCH_INODE_OPTS() \ - x(data_checksum, 8) \ - x(compression, 8) \ - x(project, 32) \ - x(background_compression, 8) \ - x(data_replicas, 8) \ - x(promote_target, 16) \ - x(foreground_target, 16) \ - x(background_target, 16) \ - x(erasure_code, 16) \ - x(nocow, 8) \ - x(inodes_32bit, 8) \ - x(casefold, 8) - -enum inode_opt_id { -#define x(name, ...) 
\ - Inode_opt_##name, - BCH_INODE_OPTS() -#undef x - Inode_opt_nr, -}; - -/* - * BCH_INODE_has_case_insensitive is set if any descendent is case insensitive - - * for overlayfs - */ -#define BCH_INODE_FLAGS() \ - x(sync, 0) \ - x(immutable, 1) \ - x(append, 2) \ - x(nodump, 3) \ - x(noatime, 4) \ - x(i_size_dirty, 5) \ - x(i_sectors_dirty, 6) \ - x(unlinked, 7) \ - x(backptr_untrusted, 8) \ - x(has_child_snapshot, 9) \ - x(has_case_insensitive, 10) - -/* bits 20+ reserved for packed fields below: */ - -enum bch_inode_flags { -#define x(t, n) BCH_INODE_##t = 1U << n, - BCH_INODE_FLAGS() -#undef x -}; - -enum __bch_inode_flags { -#define x(t, n) __BCH_INODE_##t = n, - BCH_INODE_FLAGS() -#undef x -}; - -LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODEv1_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); -LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32); - -LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); -LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); -LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_FIELDS_START, - struct bch_inode_v3, bi_flags, 31, 36); -LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); - -struct bch_inode_alloc_cursor { - struct bch_val v; - __u8 bits; - __u8 pad; - __le32 gen; - __le64 idx; -}; - -#endif /* _BCACHEFS_INODE_FORMAT_H */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c deleted file mode 100644 index 07023667a475..000000000000 --- a/fs/bcachefs/io_misc.c +++ /dev/null @@ -1,570 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * io_misc.c - fallocate, fpunch, truncate: - */ - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "clock.h" -#include "error.h" -#include "extents.h" -#include "extent_update.h" -#include "inode.h" -#include "io_misc.h" -#include "io_write.h" -#include "logged_ops.h" -#include "rebalance.h" -#include "subvolume.h" - -/* Overwrites whatever was present with zeroes: */ -int bch2_extent_fallocate(struct btree_trans *trans, - subvol_inum inum, - struct btree_iter *iter, - u64 sectors, - struct bch_io_opts opts, - s64 *i_sectors_delta, - struct write_point_specifier write_point) -{ - struct bch_fs *c = trans->c; - struct disk_reservation disk_res = { 0 }; - struct closure cl; - struct open_buckets open_buckets = { 0 }; - struct bkey_s_c k; - struct bkey_buf old, new; - unsigned sectors_allocated = 0, new_replicas; - bool unwritten = opts.nocow && - c->sb.version >= bcachefs_metadata_version_unwritten_extents; - int ret; - - bch2_bkey_buf_init(&old); - bch2_bkey_buf_init(&new); - closure_init_stack(&cl); - - k = bch2_btree_iter_peek_slot(trans, iter); - ret = bkey_err(k); - if (ret) - return ret; - - sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); - new_replicas = max(0, (int) opts.data_replicas - - (int) bch2_bkey_nr_ptrs_fully_allocated(k)); - - /* - * Get a disk reservation before (in the nocow case) calling - * into the allocator: - */ - ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); - if (unlikely(ret)) - goto err_noprint; - - bch2_bkey_buf_reassemble(&old, c, k); - - if (!unwritten) { - struct bkey_i_reservation *reservation; - - bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); - reservation = bkey_reservation_init(new.k); - reservation->k.p = 
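/*
 * The LE32_BITMASK()/LE64_BITMASK() definitions above generate get/set
 * helpers for a [lo, hi) bit range of an on-disk word - e.g. INODEv3_MODE
 * keeps the 16-bit file mode in bits 36..51 of bi_flags. A standalone
 * sketch of the underlying shift/mask arithmetic (endian conversion
 * omitted; bits_get()/bits_set() are hypothetical names):
 */

#include <assert.h>
#include <stdint.h>

static uint64_t bits_get(uint64_t word, unsigned lo, unsigned hi)
{
	return (word >> lo) & (((uint64_t) 1 << (hi - lo)) - 1);
}

static uint64_t bits_set(uint64_t word, unsigned lo, unsigned hi, uint64_t v)
{
	uint64_t mask = (((uint64_t) 1 << (hi - lo)) - 1) << lo;

	return (word & ~mask) | ((v << lo) & mask);
}

int main(void)
{
	uint64_t flags = 0;

	flags = bits_set(flags, 36, 52, 0100644);	/* S_IFREG | 0644 */
	assert(bits_get(flags, 36, 52) == 0100644);
	return 0;
}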
iter->pos; - bch2_key_resize(&reservation->k, sectors); - reservation->v.nr_replicas = opts.data_replicas; - } else { - struct bkey_i_extent *e; - struct bch_devs_list devs_have; - struct write_point *wp; - - devs_have.nr = 0; - - bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); - - e = bkey_extent_init(new.k); - e->k.p = iter->pos; - - ret = bch2_alloc_sectors_start_trans(trans, - opts.foreground_target, - false, - write_point, - &devs_have, - opts.data_replicas, - opts.data_replicas, - BCH_WATERMARK_normal, 0, &cl, &wp); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - ret = bch_err_throw(c, transaction_restart_nested); - if (ret) - goto err; - - sectors = min_t(u64, sectors, wp->sectors_free); - sectors_allocated = sectors; - - bch2_key_resize(&e->k, sectors); - - bch2_open_bucket_get(c, wp, &open_buckets); - bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); - bch2_alloc_sectors_done(c, wp); - - extent_for_each_ptr(extent_i_to_s(e), ptr) - ptr->unwritten = true; - } - - ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, - 0, i_sectors_delta, true); -err: - if (!ret && sectors_allocated) - bch2_increment_clock(c, sectors_allocated, WRITE); - if (should_print_err(ret)) { - struct printbuf buf = PRINTBUF; - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9)); - prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } -err_noprint: - bch2_open_buckets_put(c, &open_buckets); - bch2_disk_reservation_put(c, &disk_res); - bch2_bkey_buf_exit(&new, c); - bch2_bkey_buf_exit(&old, c); - - if (closure_nr_remaining(&cl) != 1) { - bch2_trans_unlock_long(trans); - bch2_wait_on_allocator(c, &cl); - } - - return ret; -} - -/* For fsck */ -int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end) -{ - u32 restart_count = trans->restart_count; - struct bch_fs *c = trans->c; - struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct bkey_i delete; - - int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, - start, end, 0, k, - &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - bkey_init(&delete.k); - delete.k.p = iter.pos; - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete); - - bch2_extent_trim_atomic(trans, &iter, &delete) ?: - bch2_trans_update(trans, &iter, &delete, 0); - })); - - bch2_disk_reservation_put(c, &disk_res); - return ret ?: trans_was_restarted(trans, restart_count); -} - -/* - * Returns -BCH_ERR_transacton_restart if we had to drop locks: - */ -int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - subvol_inum inum, u64 end, - s64 *i_sectors_delta) -{ - struct bch_fs *c = trans->c; - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct bpos end_pos = POS(inum.inum, end); - struct bkey_s_c k; - int ret = 0, ret2 = 0; - u32 snapshot; - - while (!ret || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete; - - if (ret) - ret2 = ret; - - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(trans, iter, snapshot); - - /* - * peek_max() doesn't have ideal semantics for extents: - */ - k = bch2_btree_iter_peek_max(trans, iter, 
end_pos); - if (!k.k) - break; - - ret = bkey_err(k); - if (ret) - continue; - - bkey_init(&delete.k); - delete.k.p = iter->pos; - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end_pos, &delete); - - ret = bch2_extent_update(trans, inum, iter, &delete, - &disk_res, 0, i_sectors_delta, false); - bch2_disk_reservation_put(c, &disk_res); - } - - return ret ?: ret2; -} - -int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, - s64 *i_sectors_delta) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, start), - BTREE_ITER_intent); - - ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - - return ret; -} - -/* truncate: */ - -void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k); - - prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); - prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); - prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size)); -} - -static int truncate_set_isize(struct btree_trans *trans, - subvol_inum inum, - u64 new_i_size, - bool warn) -{ - struct btree_iter iter = {}; - struct bch_inode_unpacked inode_u; - int ret; - - ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?: - (inode_u.bi_size = new_i_size, 0) ?: - bch2_inode_write(trans, &iter, &inode_u); - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, - struct bkey_i *op_k, - u64 *i_sectors_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter fpunch_iter; - struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k); - subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; - u64 new_i_size = le64_to_cpu(op->v.new_i_size); - bool warn_errors = i_sectors_delta != NULL; - int ret; - - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - truncate_set_isize(trans, inum, new_i_size, i_sectors_delta != NULL)); - if (ret) - goto err; - - bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents, - POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), - BTREE_ITER_intent); - ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); - bch2_trans_iter_exit(trans, &fpunch_iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; -err: - if (warn_errors) - bch_err_fn(c, ret); - return ret; -} - -int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k) -{ - return __bch2_resume_logged_op_truncate(trans, op_k, NULL); -} - -int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta) -{ - struct bkey_i_logged_op_truncate op; - - bkey_logged_op_truncate_init(&op.k_i); - op.v.subvol = cpu_to_le32(inum.subvol); - op.v.inum = cpu_to_le64(inum.inum); - op.v.new_i_size = cpu_to_le64(new_i_size); - - /* - * Logged ops aren't atomic w.r.t. 
snapshot creation: creating a - * snapshot while they're in progress, then crashing, will result in the - * resume only proceeding in one of the snapshots - */ - down_read(&c->snapshot_create_lock); - struct btree_trans *trans = bch2_trans_get(c); - int ret = bch2_logged_op_start(trans, &op.k_i); - if (ret) - goto out; - ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta); - ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret; -out: - bch2_trans_put(trans); - up_read(&c->snapshot_create_lock); - - return ret; -} - -/* finsert/fcollapse: */ - -void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k); - - prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); - prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); - prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset)); - prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset)); -} - -static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, - u64 offset, s64 len, bool warn) -{ - struct btree_iter iter; - struct bch_inode_unpacked inode_u; - int ret; - - offset <<= 9; - len <<= 9; - - ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn); - if (ret) - return ret; - - if (len > 0) { - if (MAX_LFS_FILESIZE - inode_u.bi_size < len) { - ret = -EFBIG; - goto err; - } - - if (offset >= inode_u.bi_size) { - ret = -EINVAL; - goto err; - } - } - - inode_u.bi_size += len; - inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c); - - ret = bch2_inode_write(trans, &iter, &inode_u); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, - struct bkey_i *op_k, - u64 *i_sectors_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k); - subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; - struct bch_io_opts opts; - u64 dst_offset = le64_to_cpu(op->v.dst_offset); - u64 src_offset = le64_to_cpu(op->v.src_offset); - s64 shift = dst_offset - src_offset; - u64 len = abs(shift); - u64 pos = le64_to_cpu(op->v.pos); - bool insert = shift > 0; - u32 snapshot; - bool warn_errors = i_sectors_delta != NULL; - int ret = 0; - - ret = bch2_inum_opts_get(trans, inum, &opts); - if (ret) - return ret; - - /* - * check for missing subvolume before fpunch, as in resume we don't want - * it to be a fatal error - */ - ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors)); - if (ret) - return ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, 0), - BTREE_ITER_intent); - - switch (op->v.state) { -case LOGGED_OP_FINSERT_start: - op->v.state = LOGGED_OP_FINSERT_shift_extents; - - if (insert) { - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - adjust_i_size(trans, inum, src_offset, len, warn_errors) ?: - bch2_logged_op_update(trans, &op->k_i)); - if (ret) - goto err; - } else { - bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset)); - - ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_logged_op_update(trans, &op->k_i)); - } - - fallthrough; -case LOGGED_OP_FINSERT_shift_extents: - while (1) { - struct 
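/*
 * The surrounding switch on op->v.state is the heart of the logged-op
 * pattern: the state (and the cursor, op->v.pos) is committed after each
 * stage, and every case falls through to the next, so resuming after a
 * crash re-enters the switch at the recorded state and skips the stages
 * already completed. A minimal standalone sketch of that shape - the stages
 * here are placeholders, not the real finsert stages:
 */

#include <stdio.h>

enum op_state { OP_start, OP_shift, OP_finish };

/* in bcachefs this lives in a btree key; here it is just a variable */
static enum op_state persisted_state = OP_start;

static void resume(void)
{
	switch (persisted_state) {
	case OP_start:
		puts("stage 1: set up");
		persisted_state = OP_shift;	/* would be committed to disk */
		/* fallthrough */
	case OP_shift:
		puts("stage 2: shift extents (restartable loop)");
		persisted_state = OP_finish;
		/* fallthrough */
	case OP_finish:
		puts("stage 3: finish and drop the log entry");
	}
}

int main(void) { resume(); return 0; }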
disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete, *copy; - struct bkey_s_c k; - struct bpos src_pos = POS(inum.inum, src_offset); - - bch2_trans_begin(trans); - - ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, - warn_errors); - if (ret) - goto btree_err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot)); - - k = insert - ? bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0)) - : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX)); - if ((ret = bkey_err(k))) - goto btree_err; - - if (!k.k || - k.k->p.inode != inum.inum || - bkey_le(k.k->p, POS(inum.inum, src_offset))) - break; - - copy = bch2_bkey_make_mut_noupdate(trans, k); - if ((ret = PTR_ERR_OR_ZERO(copy))) - goto btree_err; - - if (insert && - bkey_lt(bkey_start_pos(k.k), src_pos)) { - bch2_cut_front(src_pos, copy); - - /* Splitting compressed extent? */ - bch2_disk_reservation_add(c, &disk_res, - copy->k.size * - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)), - BCH_DISK_RESERVATION_NOFAIL); - } - - bkey_init(&delete.k); - delete.k.p = copy->k.p; - delete.k.p.snapshot = snapshot; - delete.k.size = copy->k.size; - - copy->k.p.offset += shift; - copy->k.p.snapshot = snapshot; - - op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); - - ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?: - bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: - bch2_logged_op_update(trans, &op->k_i) ?: - bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc); -btree_err: - bch2_disk_reservation_put(c, &disk_res); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - - pos = le64_to_cpu(op->v.pos); - } - - op->v.state = LOGGED_OP_FINSERT_finish; - - if (!insert) { - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?: - bch2_logged_op_update(trans, &op->k_i)); - } else { - /* We need an inode update to update bi_journal_seq for fsync: */ - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - adjust_i_size(trans, inum, 0, 0, warn_errors) ?: - bch2_logged_op_update(trans, &op->k_i)); - } - - break; -case LOGGED_OP_FINSERT_finish: - break; - } -err: - bch2_trans_iter_exit(trans, &iter); - if (warn_errors) - bch_err_fn(c, ret); - return ret; -} - -int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k) -{ - return __bch2_resume_logged_op_finsert(trans, op_k, NULL); -} - -int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, - u64 offset, u64 len, bool insert, - s64 *i_sectors_delta) -{ - struct bkey_i_logged_op_finsert op; - s64 shift = insert ? len : -len; - - bkey_logged_op_finsert_init(&op.k_i); - op.v.subvol = cpu_to_le32(inum.subvol); - op.v.inum = cpu_to_le64(inum.inum); - op.v.dst_offset = cpu_to_le64(offset + shift); - op.v.src_offset = cpu_to_le64(offset); - op.v.pos = cpu_to_le64(insert ? U64_MAX : offset); - - /* - * Logged ops aren't atomic w.r.t. 
snapshot creation: creating a - * snapshot while they're in progress, then crashing, will result in the - * resume only proceeding in one of the snapshots - */ - down_read(&c->snapshot_create_lock); - struct btree_trans *trans = bch2_trans_get(c); - int ret = bch2_logged_op_start(trans, &op.k_i); - if (ret) - goto out; - ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta); - ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret; -out: - bch2_trans_put(trans); - up_read(&c->snapshot_create_lock); - - return ret; -} diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h deleted file mode 100644 index b93e4d4b3c0c..000000000000 --- a/fs/bcachefs/io_misc.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_MISC_H -#define _BCACHEFS_IO_MISC_H - -int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, - u64, struct bch_io_opts, s64 *, - struct write_point_specifier); - -int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos); -int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - subvol_inum, u64, s64 *); -int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); - -void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) { \ - .val_to_text = bch2_logged_op_truncate_to_text, \ - .min_val_size = 24, \ -}) - -int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *); - -int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *); - -void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \ - .val_to_text = bch2_logged_op_finsert_to_text, \ - .min_val_size = 24, \ -}) - -int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *); - -int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *); - -#endif /* _BCACHEFS_IO_MISC_H */ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c deleted file mode 100644 index e0874ad9a6cf..000000000000 --- a/fs/bcachefs/io_read.c +++ /dev/null @@ -1,1543 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Some low level IO code, and hacks for various block layer limitations - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. 
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "async_objs.h" -#include "btree_update.h" -#include "buckets.h" -#include "checksum.h" -#include "clock.h" -#include "compress.h" -#include "data_update.h" -#include "disk_groups.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "io_read.h" -#include "io_misc.h" -#include "io_write.h" -#include "reflink.h" -#include "subvolume.h" -#include "trace.h" - -#include <linux/moduleparam.h> -#include <linux/random.h> -#include <linux/sched/mm.h> - -#ifdef CONFIG_BCACHEFS_DEBUG -static unsigned bch2_read_corrupt_ratio; -module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); -MODULE_PARM_DESC(read_corrupt_ratio, ""); -#endif - -static bool bch2_poison_extents_on_checksum_error; -module_param_named(poison_extents_on_checksum_error, - bch2_poison_extents_on_checksum_error, bool, 0644); -MODULE_PARM_DESC(poison_extents_on_checksum_error, - "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - const struct bch_devs_mask *devs; - unsigned d, nr = 0, total = 0; - u64 now = local_clock(), last; - s64 congested; - struct bch_dev *ca; - - if (!target) - return false; - - guard(rcu)(); - devs = bch2_target_to_mask(c, target) ?: - &c->rw_devs[BCH_DATA_user]; - - for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { - ca = rcu_dereference(c->devs[d]); - if (!ca) - continue; - - congested = atomic_read(&ca->congested); - last = READ_ONCE(ca->congested_last); - if (time_after64(now, last)) - congested -= (now - last) >> 12; - - total += max(congested, 0LL); - nr++; - } - - return get_random_u32_below(nr * CONGESTED_MAX) < total; -} - -#else - -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - return false; -} - -#endif - -/* Cache promotion on read */ - -static const struct rhashtable_params bch_promote_params = { - .head_offset = offsetof(struct promote_op, hash), - .key_offset = offsetof(struct promote_op, pos), - .key_len = sizeof(struct bpos), - .automatic_shrinking = true, -}; - -static inline bool have_io_error(struct bch_io_failures *failed) -{ - return failed && failed->nr; -} - -static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio) -{ - EBUG_ON(rbio->split); - - return rbio->data_update - ? 
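/*
 * bch2_target_congested() above turns a summed, time-decayed congestion
 * score into a probabilistic answer: it returns true with probability
 * total / (nr_devices * CONGESTED_MAX), so reads back off from a busy
 * target gradually rather than at a hard threshold. A standalone sketch of
 * that admission test, with rand() standing in for get_random_u32_below()
 * and an illustrative CONGESTED_MAX:
 */

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define CONGESTED_MAX	1024	/* illustrative; not the kernel's value */

static bool congested(unsigned total, unsigned nr_devices)
{
	/* true with probability total / (nr_devices * CONGESTED_MAX) */
	return (unsigned) rand() % (nr_devices * CONGESTED_MAX) < total;
}

int main(void)
{
	unsigned hits = 0;

	for (int i = 0; i < 100000; i++)
		hits += congested(512, 2);	/* expect roughly 25% */

	printf("congested %u/100000 times\n", hits);
	return 0;
}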
container_of(rbio, struct data_update, rbio) - : NULL; -} - -static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev) -{ - struct data_update *u = rbio_data_update(orig); - if (!u) - return false; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); - unsigned i = 0; - bkey_for_each_ptr(ptrs, ptr) { - if (ptr->dev == dev && - u->data_opts.rewrite_ptrs & BIT(i)) - return true; - i++; - } - - return false; -} - -static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, - struct bpos pos, - struct bch_io_opts opts, - unsigned flags, - struct bch_io_failures *failed) -{ - if (!have_io_error(failed)) { - BUG_ON(!opts.promote_target); - - if (!(flags & BCH_READ_may_promote)) - return bch_err_throw(c, nopromote_may_not); - - if (bch2_bkey_has_target(c, k, opts.promote_target)) - return bch_err_throw(c, nopromote_already_promoted); - - if (bkey_extent_is_unwritten(k)) - return bch_err_throw(c, nopromote_unwritten); - - if (bch2_target_congested(c, opts.promote_target)) - return bch_err_throw(c, nopromote_congested); - } - - if (rhashtable_lookup_fast(&c->promote_table, &pos, - bch_promote_params)) - return bch_err_throw(c, nopromote_in_flight); - - return 0; -} - -static noinline void promote_free(struct bch_read_bio *rbio) -{ - struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); - struct bch_fs *c = rbio->c; - - int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); - - async_object_list_del(c, promote, op->list_idx); - async_object_list_del(c, rbio, rbio->list_idx); - - bch2_data_update_exit(&op->write); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); - kfree_rcu(op, rcu); -} - -static void promote_done(struct bch_write_op *wop) -{ - struct promote_op *op = container_of(wop, struct promote_op, write.op); - struct bch_fs *c = op->write.rbio.c; - - bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); - promote_free(&op->write.rbio); -} - -static void promote_start_work(struct work_struct *work) -{ - struct promote_op *op = container_of(work, struct promote_op, work); - - bch2_data_update_read_done(&op->write); -} - -static noinline void promote_start(struct bch_read_bio *rbio) -{ - struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); - - trace_and_count(op->write.op.c, io_read_promote, &rbio->bio); - - INIT_WORK(&op->work, promote_start_work); - queue_work(rbio->c->write_ref_wq, &op->work); -} - -static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_s_c k, - struct bpos pos, - struct extent_ptr_decoded *pick, - unsigned sectors, - struct bch_read_bio *orig, - struct bch_io_failures *failed) -{ - struct bch_fs *c = trans->c; - int ret; - - struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; - - if (!have_io_error(failed)) { - update_opts.target = orig->opts.promote_target; - update_opts.extra_replicas = 1; - update_opts.write_flags |= BCH_WRITE_cached; - update_opts.write_flags |= BCH_WRITE_only_specified_devs; - } else { - update_opts.target = orig->opts.foreground_target; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned ptr_bit = 1; - bkey_for_each_ptr(ptrs, ptr) { - if (bch2_dev_io_failures(failed, ptr->dev) && - !ptr_being_rewritten(orig, ptr->dev)) - update_opts.rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - - if (!update_opts.rewrite_ptrs) - return NULL; - } - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote)) - 
return ERR_PTR(-BCH_ERR_nopromote_no_writes); - - struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); - if (!op) { - ret = bch_err_throw(c, nopromote_enomem); - goto err_put; - } - - op->start_time = local_clock(); - op->pos = pos; - - if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, - bch_promote_params)) { - ret = bch_err_throw(c, nopromote_in_flight); - goto err; - } - - ret = async_object_list_add(c, promote, op, &op->list_idx); - if (ret < 0) - goto err_remove_hash; - - ret = bch2_data_update_init(trans, NULL, NULL, &op->write, - writepoint_hashed((unsigned long) current), - &orig->opts, - update_opts, - btree_id, k); - op->write.type = BCH_DATA_UPDATE_promote; - /* - * possible errors: -BCH_ERR_nocow_lock_blocked, - * -BCH_ERR_ENOSPC_disk_reservation: - */ - if (ret) - goto err_remove_list; - - rbio_init_fragment(&op->write.rbio.bio, orig); - op->write.rbio.bounce = true; - op->write.rbio.promote = true; - op->write.op.end_io = promote_done; - - return &op->write.rbio; -err_remove_list: - async_object_list_del(c, promote, op->list_idx); -err_remove_hash: - BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params)); -err: - bio_free_pages(&op->write.op.wbio.bio); - /* We may have added to the rhashtable and thus need rcu freeing: */ - kfree_rcu(op, rcu); -err_put: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); - return ERR_PTR(ret); -} - -noinline -static struct bch_read_bio *promote_alloc(struct btree_trans *trans, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_ptr_decoded *pick, - unsigned flags, - struct bch_read_bio *orig, - bool *bounce, - bool *read_full, - struct bch_io_failures *failed) -{ - /* - * We're in the retry path, but we don't know what to repair yet, and we - * don't want to do a promote here: - */ - if (failed && !failed->nr) - return NULL; - - struct bch_fs *c = trans->c; - /* - * if failed != NULL we're not actually doing a promote, we're - * recovering from an io/checksum error - */ - bool promote_full = (have_io_error(failed) || - *read_full || - READ_ONCE(c->opts.promote_whole_extents)); - /* data might have to be decompressed in the write path: */ - unsigned sectors = promote_full - ? max(pick->crc.compressed_size, pick->crc.live_size) - : bvec_iter_sectors(iter); - struct bpos pos = promote_full - ? bkey_start_pos(k.k) - : POS(k.k->p.inode, iter.bi_sector); - int ret; - - ret = should_promote(c, k, pos, orig->opts, flags, failed); - if (ret) - goto nopromote; - - struct bch_read_bio *promote = - __promote_alloc(trans, - k.k->type == KEY_TYPE_reflink_v - ? 
BTREE_ID_reflink - : BTREE_ID_extents, - k, pos, pick, sectors, orig, failed); - if (!promote) - return NULL; - - ret = PTR_ERR_OR_ZERO(promote); - if (ret) - goto nopromote; - - *bounce = true; - *read_full = promote_full; - - if (have_io_error(failed)) - orig->self_healing = true; - - return promote; -nopromote: - trace_io_read_nopromote(c, ret); - return NULL; -} - -void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op) -{ - if (!op->write.read_done) { - prt_printf(out, "parent read: %px\n", op->write.rbio.parent); - printbuf_indent_add(out, 2); - bch2_read_bio_to_text(out, op->write.rbio.parent); - printbuf_indent_sub(out, 2); - } - - bch2_data_update_to_text(out, &op->write); -} - -/* Read */ - -static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_read_bio *rbio, struct bpos read_pos) -{ - int ret = lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { rbio->subvol, read_pos.inode }, - read_pos.offset << 9)); - if (ret) - return ret; - - if (rbio->data_update) - prt_str(out, "(internal move) "); - - return 0; -} - -static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, - struct bch_read_bio *rbio, struct bpos read_pos) -{ - bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); -} - -enum rbio_context { - RBIO_CONTEXT_NULL, - RBIO_CONTEXT_HIGHPRI, - RBIO_CONTEXT_UNBOUND, -}; - -static inline struct bch_read_bio * -bch2_rbio_parent(struct bch_read_bio *rbio) -{ - return rbio->split ? rbio->parent : rbio; -} - -__always_inline -static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, - enum rbio_context context, - struct workqueue_struct *wq) -{ - if (context <= rbio->context) { - fn(&rbio->work); - } else { - rbio->work.func = fn; - rbio->context = context; - queue_work(wq, &rbio->work); - } -} - -static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) -{ - BUG_ON(rbio->bounce && !rbio->split); - - if (rbio->have_ioref) { - struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); - } - - if (rbio->split) { - struct bch_read_bio *parent = rbio->parent; - - if (unlikely(rbio->promote)) { - if (!rbio->bio.bi_status) - promote_start(rbio); - else - promote_free(rbio); - } else { - async_object_list_del(rbio->c, rbio, rbio->list_idx); - - if (rbio->bounce) - bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - - bio_put(&rbio->bio); - } - - rbio = parent; - } - - return rbio; -} - -/* - * Only called on a top level bch_read_bio to complete an entire read request, - * not a split: - */ -static void bch2_rbio_done(struct bch_read_bio *rbio) -{ - if (rbio->start_time) - bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], - rbio->start_time); -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - if (rbio->list_idx) - async_object_list_del(rbio->c, rbio, rbio->list_idx); -#endif - bio_endio(&rbio->bio); -} - -static void get_rbio_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, - struct bkey_buf *sk) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = lockrestart_do(trans, - bkey_err(k = bch2_bkey_get_iter(trans, &iter, - rbio->data_btree, rbio->data_pos, 0))); - if (ret) - return; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr) - if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) { - bch2_bkey_buf_reassemble(sk, trans->c, k); - break; - } - - bch2_trans_iter_exit(trans, &iter); -} - -static noinline int 
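/*
 * bch2_rbio_punt() above only ever escalates execution context: when the
 * completion already runs in a context at least as permissive as the one
 * requested it calls fn inline; otherwise it bounces to a workqueue and
 * records the new context so a later punt will not bounce again. A
 * standalone sketch of that shape (the deferred call is run immediately
 * here, where a workqueue would run it later):
 */

#include <stdio.h>

enum ctx { CTX_NULL, CTX_HIGHPRI, CTX_UNBOUND };

struct req { enum ctx ctx; };

static void handler(struct req *r) { (void) r; puts("handled"); }

static void punt(struct req *r, void (*fn)(struct req *), enum ctx want)
{
	if (want <= r->ctx) {
		fn(r);		/* already in a good enough context */
	} else {
		r->ctx = want;	/* remember, so we never bounce twice */
		puts("deferring to worker");
		fn(r);
	}
}

int main(void)
{
	struct req r = { .ctx = CTX_NULL };

	punt(&r, handler, CTX_UNBOUND);	/* bounces once */
	punt(&r, handler, CTX_UNBOUND);	/* now runs inline */
	return 0;
}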
maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, - enum btree_id btree, struct bkey_s_c read_k) -{ - if (!bch2_poison_extents_on_checksum_error) - return 0; - - struct bch_fs *c = trans->c; - - struct data_update *u = rbio_data_update(rbio); - if (u) - read_k = bkey_i_to_s_c(u->k.k); - - u64 flags = bch2_bkey_extent_flags(read_k); - if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return 0; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k), - BTREE_ITER_intent); - int ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_and_val_eq(k, read_k)) - goto out; - - struct bkey_i *new = bch2_trans_kmalloc(trans, - bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); - ret = PTR_ERR_OR_ZERO(new) ?: - (bkey_reassemble(new, k), 0) ?: - bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: - bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, 0); - - /* - * Propagate key change back to data update path, in particular so it - * knows the extent has been poisoned and it's safe to change the - * checksum - */ - if (u && !ret) - bch2_bkey_buf_copy(&u->k, c, new); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, - struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, - struct bch_io_failures *failed, - unsigned flags) -{ - struct data_update *u = container_of(rbio, struct data_update, rbio); -retry: - bch2_trans_begin(trans); - - struct btree_iter iter; - struct bkey_s_c k; - int ret = lockrestart_do(trans, - bkey_err(k = bch2_bkey_get_iter(trans, &iter, - u->btree_id, bkey_start_pos(&u->k.k->k), - 0))); - if (ret) - goto err; - - if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { - /* extent we wanted to read no longer exists: */ - rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten); - goto err; - } - - ret = __bch2_read_extent(trans, rbio, bvec_iter, - bkey_start_pos(&u->k.k->k), - u->btree_id, - bkey_i_to_s_c(u->k.k), - 0, failed, flags, -1); -err: - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, BCH_ERR_data_read_retry)) - goto retry; - - if (ret) { - rbio->bio.bi_status = BLK_STS_IOERR; - rbio->ret = ret; - } - - BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); - return ret; -} - -static void bch2_rbio_retry(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bvec_iter iter = rbio->bvec_iter; - unsigned flags = rbio->flags; - subvol_inum inum = { - .subvol = rbio->subvol, - .inum = rbio->read_pos.inode, - }; - struct bch_io_failures failed = { .nr = 0 }; - - struct btree_trans *trans = bch2_trans_get(c); - - struct bkey_buf sk; - bch2_bkey_buf_init(&sk); - bkey_init(&sk.k->k); - - trace_io_read_retry(&rbio->bio); - this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], - bvec_iter_sectors(rbio->bvec_iter)); - - get_rbio_extent(trans, rbio, &sk); - - if (!bkey_deleted(&sk.k->k) && - bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) - bch2_mark_io_failure(&failed, &rbio->pick, - rbio->ret == -BCH_ERR_data_read_retry_csum_err); - - if (!rbio->split) { - rbio->bio.bi_status = 0; - rbio->ret = 0; - } - - unsigned subvol = rbio->subvol; - struct bpos read_pos = rbio->read_pos; - - rbio = bch2_rbio_free(rbio); - - flags |= BCH_READ_in_retry; - flags &= 
~BCH_READ_may_promote; - flags &= ~BCH_READ_last_fragment; - flags |= BCH_READ_must_clone; - - int ret = rbio->data_update - ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) - : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags); - - if (ret) { - rbio->ret = ret; - rbio->bio.bi_status = BLK_STS_IOERR; - } - - if (failed.nr || ret) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, &buf, - (subvol_inum) { subvol, read_pos.inode }, - read_pos.offset << 9)); - if (rbio->data_update) - prt_str(&buf, "(internal move) "); - - prt_str(&buf, "data read error, "); - if (!ret) { - prt_str(&buf, "successful retry"); - if (rbio->self_healing) - prt_str(&buf, ", self healing"); - } else - prt_str(&buf, bch2_err_str(ret)); - prt_newline(&buf); - - - if (!bkey_deleted(&sk.k->k)) { - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k)); - prt_newline(&buf); - } - - bch2_io_failures_to_text(&buf, c, &failed); - - bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - - bch2_rbio_done(rbio); - bch2_bkey_buf_exit(&sk, c); - bch2_trans_put(trans); -} - -static void bch2_rbio_error(struct bch_read_bio *rbio, - int ret, blk_status_t blk_error) -{ - BUG_ON(ret >= 0); - - rbio->ret = ret; - rbio->bio.bi_status = blk_error; - - bch2_rbio_parent(rbio)->saw_error = true; - - if (rbio->flags & BCH_READ_in_retry) - return; - - if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) { - bch2_rbio_punt(rbio, bch2_rbio_retry, - RBIO_CONTEXT_UNBOUND, system_unbound_wq); - } else { - rbio = bch2_rbio_free(rbio); - - rbio->ret = ret; - rbio->bio.bi_status = blk_error; - - bch2_rbio_done(rbio); - } -} - -static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, - struct bch_read_bio *rbio) -{ - struct bch_fs *c = rbio->c; - u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; - struct bch_extent_crc_unpacked new_crc; - struct btree_iter iter; - struct bkey_i *new; - struct bkey_s_c k; - int ret = 0; - - if (crc_is_compressed(rbio->pick.crc)) - return 0; - - k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_slots|BTREE_ITER_intent); - if ((ret = bkey_err(k))) - goto out; - - if (bversion_cmp(k.k->bversion, rbio->version) || - !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) - goto out; - - /* Extent was merged? 
*/ - if (bkey_start_offset(k.k) < data_offset || - k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) - goto out; - - if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, - rbio->pick.crc, NULL, &new_crc, - bkey_start_offset(k.k) - data_offset, k.k->size, - rbio->pick.crc.csum_type)) { - bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); - ret = 0; - goto out; - } - - /* - * going to be temporarily appending another checksum entry: - */ - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - sizeof(struct bch_extent_crc128)); - if ((ret = PTR_ERR_OR_ZERO(new))) - goto out; - - bkey_reassemble(new, k); - - if (!bch2_bkey_narrow_crcs(new, new_crc)) - goto out; - - ret = bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_internal_snapshot_node); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) -{ - bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_rbio_narrow_crcs(trans, rbio)); -} - -static void bch2_read_decompress_err(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct printbuf buf = PRINTBUF; - - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_str(&buf, "decompression error"); - - struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) - bch_err_ratelimited(ca, "%s", buf.buf); - else - bch_err_ratelimited(c, "%s", buf.buf); - - bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR); - printbuf_exit(&buf); -} - -static void bch2_read_decrypt_err(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct printbuf buf = PRINTBUF; - - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_str(&buf, "decrypt error"); - - struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) - bch_err_ratelimited(ca, "%s", buf.buf); - else - bch_err_ratelimited(c, "%s", buf.buf); - - bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR); - printbuf_exit(&buf); -} - -/* Inner part that may run in process context */ -static void __bch2_read_endio(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - struct bch_read_bio *parent = bch2_rbio_parent(rbio); - struct bio *src = &rbio->bio; - struct bio *dst = &parent->bio; - struct bvec_iter dst_iter = rbio->bvec_iter; - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - unsigned nofs_flags; - struct bch_csum csum; - int ret; - - nofs_flags = memalloc_nofs_save(); - - /* Reset iterator for checksumming and copying bounced data: */ - if (rbio->bounce) { - src->bi_iter.bi_size = crc.compressed_size << 9; - src->bi_iter.bi_idx = 0; - src->bi_iter.bi_bvec_done = 0; - } else { - src->bi_iter = rbio->bvec_iter; - } - - bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio); - - csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io; - - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { - rbio->flags |= BCH_READ_must_bounce; - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace, - BLK_STS_IOERR); - goto out; - } - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); - - if (!csum_good) - goto csum_err; - - /* - * XXX - * We need to rework the narrow_crcs path to deliver the read completion - * first, and then punt to a different workqueue, otherwise we're - * holding up reads while doing btree updates which is bad for memory - * reclaim. - */ - if (unlikely(rbio->narrow_crcs)) - bch2_rbio_narrow_crcs(rbio); - - if (likely(!parent->data_update)) { - /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->offset_into_extent; - crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - - if (crc_is_compressed(crc)) { - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && - !c->opts.no_data_io) - goto decompression_err; - } else { - /* don't need to decrypt the entire bio: */ - nonce = nonce_add(nonce, crc.offset << 9); - bio_advance(src, crc.offset << 9); - - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; - - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - if (rbio->bounce) { - struct bvec_iter src_iter = src->bi_iter; - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } - } - } else { - if (rbio->split) - rbio->parent->pick = rbio->pick; - - if (rbio->bounce) { - struct bvec_iter src_iter = src->bi_iter; - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } - } - - if (rbio->promote) { - /* - * Re encrypt data we decrypted, so it's consistent with - * rbio->crc: - */ - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - } - - if (likely(!(rbio->flags & BCH_READ_in_retry))) { - rbio = bch2_rbio_free(rbio); - bch2_rbio_done(rbio); - } -out: - memalloc_nofs_restore(nofs_flags); - return; -csum_err: - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); - goto out; -decompression_err: - bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); - goto out; -decrypt_err: - bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); - goto out; -} - -static void bch2_read_endio(struct bio *bio) -{ 
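/*
 * A note on the retry logic in __bch2_read_endio() above: a checksum
 * mismatch on a read that was not bounced and targets user-mapped pages may
 * be self-inflicted, since userspace can scribble on its own buffers while
 * the read is in flight, so the read is retried through a kernel-private
 * bounce buffer before the error is believed. A standalone sketch of the
 * bounce-then-verify idea - device_read() and csum() are toy stand-ins:
 */

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static void device_read(unsigned char *buf, size_t len)
{
	memset(buf, 0xab, len);		/* pretend to read from disk */
}

static unsigned long csum(const unsigned char *buf, size_t len)
{
	unsigned long c = 0;

	while (len--)
		c = c * 31 + *buf++;
	return c;
}

static bool read_verified(unsigned char *user_buf, size_t len,
			  unsigned long stored)
{
	unsigned char bounce[4096];	/* private: userspace can't touch it */

	device_read(bounce, len);
	if (csum(bounce, len) != stored)
		return false;		/* now a real checksum error */

	memcpy(user_buf, bounce, len);	/* copy out only after verifying */
	return true;
}

int main(void)
{
	unsigned char expect[16], user_buf[16];

	device_read(expect, sizeof(expect));
	printf("verified: %d\n",
	       (int) read_verified(user_buf, sizeof(user_buf),
				   csum(expect, sizeof(expect))));
	return 0;
}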
- struct bch_read_bio *rbio = - container_of(bio, struct bch_read_bio, bio); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - struct workqueue_struct *wq = NULL; - enum rbio_context context = RBIO_CONTEXT_NULL; - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, - rbio->submit_time, !bio->bi_status); - - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - - if (unlikely(bio->bi_status)) { - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); - return; - } - - if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || - (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { - trace_and_count(c, io_read_reuse_race, &rbio->bio); - - if (rbio->flags & BCH_READ_retry_if_stale) - bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN); - else - bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN); - return; - } - - if (rbio->narrow_crcs || - rbio->promote || - crc_is_compressed(rbio->pick.crc) || - bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) - context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; - else if (rbio->pick.crc.csum_type) - context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; - - bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); -} - -static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c k, - struct bch_extent_ptr ptr) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - PTR_BUCKET_POS(ca, &ptr), - BTREE_ITER_cached); - - int gen = bucket_gen_get(ca, iter.pos.offset); - if (gen >= 0) { - prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); - printbuf_indent_add(&buf, 2); - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - prt_printf(&buf, "memory gen: %u", gen); - - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter))); - if (!ret) { - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - } - } else { - prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n", - iter.pos.inode, iter.pos.offset); - printbuf_indent_add(&buf, 2); - - prt_printf(&buf, "first bucket %u nbuckets %llu\n", - ca->mi.first_bucket, ca->mi.nbuckets); - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - } - - bch2_fs_inconsistent(c, "%s", buf.buf); - - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); -} - -int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - struct bvec_iter iter, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, - struct bch_io_failures *failed, unsigned flags, int dev) -{ - struct bch_fs *c = trans->c; - struct extent_ptr_decoded pick; - struct bch_read_bio *rbio = NULL; - bool bounce = false, read_full = false, narrow_crcs = false; - struct bpos data_pos = bkey_start_pos(k.k); - struct data_update *u = rbio_data_update(orig); - int ret = 0; - - if (bkey_extent_is_inline_data(k.k)) { - unsigned bytes = min_t(unsigned, iter.bi_size, - bkey_inline_data_bytes(k.k)); - - swap(iter.bi_size, bytes); - memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); - swap(iter.bi_size, bytes); - bio_advance_iter(&orig->bio, &iter, bytes); - zero_fill_bio_iter(&orig->bio, iter); - this_cpu_add(c->counters[BCH_COUNTER_io_read_inline], - bvec_iter_sectors(iter)); - goto out_read_done; - } - - if 
((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && - !orig->data_update) - return bch_err_throw(c, extent_poisoned); -retry_pick: - ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); - - /* hole or reservation - just zero fill: */ - if (!ret) - goto hole; - - if (unlikely(ret < 0)) { - if (ret == -BCH_ERR_data_read_csum_err) { - int ret2 = maybe_poison_extent(trans, orig, data_btree, k); - if (ret2) { - ret = ret2; - goto err; - } - - trace_and_count(c, io_read_fail_and_poison, &orig->bio); - } - - struct printbuf buf = PRINTBUF; - bch2_read_err_msg_trans(trans, &buf, orig, read_pos); - prt_printf(&buf, "%s\n ", bch2_err_str(ret)); - bch2_bkey_val_to_text(&buf, c, k); - - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - goto err; - } - - if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && - !c->chacha20_key_set) { - struct printbuf buf = PRINTBUF; - bch2_read_err_msg_trans(trans, &buf, orig, read_pos); - prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); - bch2_bkey_val_to_text(&buf, c, k); - - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = bch_err_throw(c, data_read_no_encryption_key); - goto err; - } - - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, - BCH_DEV_READ_REF_io_read); - - /* - * Stale dirty pointers are treated as IO errors, but @failed isn't - * allocated unless we're in the retry path - so if we're not in the - * retry path, don't check here, it'll be caught in bch2_read_endio() - * and we'll end up in the retry path: - */ - if ((flags & BCH_READ_in_retry) && - !pick.ptr.cached && - ca && - unlikely(dev_ptr_stale(ca, &pick.ptr))) { - read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); - bch2_mark_io_failure(failed, &pick, false); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); - goto retry_pick; - } - - if (likely(!u)) { - if (!(flags & BCH_READ_last_fragment) || - bio_flagged(&orig->bio, BIO_CHAIN)) - flags |= BCH_READ_must_clone; - - narrow_crcs = !(flags & BCH_READ_in_retry) && - bch2_can_narrow_extent_crcs(k, pick.crc); - - if (narrow_crcs && (flags & BCH_READ_user_mapped)) - flags |= BCH_READ_must_bounce; - - EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - - if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_none && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick.crc.csum_type) && - (flags & BCH_READ_user_mapped)) || - (flags & BCH_READ_must_bounce)))) { - read_full = true; - bounce = true; - } - } else { - /* - * can happen if we retry, and the extent we were going to read - * has been merged in the meantime: - */ - if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { - if (ca) - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_io_read); - rbio->ret = bch_err_throw(c, data_read_buffer_too_small); - goto out_read_done; - } - - iter.bi_size = pick.crc.compressed_size << 9; - read_full = true; - } - - if (orig->opts.promote_target || have_io_error(failed)) - rbio = promote_alloc(trans, iter, k, &pick, flags, orig, - &bounce, &read_full, failed); - - if (!read_full) { - EBUG_ON(crc_is_compressed(pick.crc)); - EBUG_ON(pick.crc.csum_type && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - bvec_iter_sectors(iter) != pick.crc.live_size || - pick.crc.offset || - offset_into_extent)); - - data_pos.offset += offset_into_extent; - pick.ptr.offset += pick.crc.offset + - offset_into_extent; - offset_into_extent = 0; - 
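When the extent is neither compressed nor checksummed over more than the requested range, the read is narrowed to just the wanted sectors: the caller's offset into the extent is folded into the device pointer and the crc window is reset, which is what the surrounding assignments do. A compilable model of that arithmetic (the struct and helper names are invented for illustration):

#include <assert.h>
#include <stdint.h>

/* Simplified stand-in for struct bch_extent_crc_unpacked; field names
 * mirror the diff, everything else is illustrative. */
struct crc_window {
	uint32_t offset;		/* sectors into the extent */
	uint32_t live_size;		/* sectors the key actually references */
	uint32_t compressed_size;
	uint32_t uncompressed_size;
};

/* Fold the caller's offset into the device pointer and shrink the crc
 * window to exactly the sectors being read (the !read_full case): */
static void narrow_to_request(struct crc_window *crc, uint64_t *ptr_offset,
			      uint32_t *offset_into_extent,
			      uint32_t request_sectors)
{
	*ptr_offset += crc->offset + *offset_into_extent;
	*offset_into_extent = 0;

	crc->offset = 0;
	crc->live_size = request_sectors;
	crc->compressed_size = request_sectors;
	crc->uncompressed_size = request_sectors;
}

int main(void)
{
	struct crc_window crc = { .offset = 8, .live_size = 120,
				  .compressed_size = 128, .uncompressed_size = 128 };
	uint64_t ptr_offset = 1000;
	uint32_t off = 16;

	narrow_to_request(&crc, &ptr_offset, &off, 32);
	assert(ptr_offset == 1000 + 8 + 16 && crc.offset == 0 && crc.live_size == 32);
	return 0;
}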
pick.crc.compressed_size = bvec_iter_sectors(iter); - pick.crc.uncompressed_size = bvec_iter_sectors(iter); - pick.crc.offset = 0; - pick.crc.live_size = bvec_iter_sectors(iter); - } - - if (rbio) { - /* - * promote already allocated bounce rbio: - * promote needs to allocate a bio big enough for uncompressing - * data in the write path, but we're not going to use it all - * here: - */ - EBUG_ON(rbio->bio.bi_iter.bi_size < - pick.crc.compressed_size << 9); - rbio->bio.bi_iter.bi_size = - pick.crc.compressed_size << 9; - } else if (bounce) { - unsigned sectors = pick.crc.compressed_size; - - rbio = rbio_init_fragment(bio_alloc_bioset(NULL, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - 0, - GFP_NOFS, - &c->bio_read_split), - orig); - - bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); - rbio->bounce = true; - } else if (flags & BCH_READ_must_clone) { - /* - * Have to clone if there were any splits, due to error - * reporting issues (if a split errored, and retrying didn't - * work, when it reports the error to its parent (us) we don't - * know if the error was from our bio, and we should retry, or - * from the whole bio, in which case we don't want to retry and - * lose the error) - */ - rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, - &c->bio_read_split), - orig); - rbio->bio.bi_iter = iter; - } else { - rbio = orig; - rbio->bio.bi_iter = iter; - EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); - } - - EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - - rbio->submit_time = local_clock(); - if (!rbio->split) - rbio->end_io = orig->bio.bi_end_io; - rbio->bvec_iter = iter; - rbio->offset_into_extent= offset_into_extent; - rbio->flags = flags; - rbio->have_ioref = ca != NULL; - rbio->narrow_crcs = narrow_crcs; - rbio->ret = 0; - rbio->context = 0; - rbio->pick = pick; - rbio->subvol = orig->subvol; - rbio->read_pos = read_pos; - rbio->data_btree = data_btree; - rbio->data_pos = data_pos; - rbio->version = k.k->bversion; - INIT_WORK(&rbio->work, NULL); - - rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick.ptr.offset; - rbio->bio.bi_end_io = bch2_read_endio; - - async_object_list_add(c, rbio, rbio, &rbio->list_idx); - - if (rbio->bounce) - trace_and_count(c, io_read_bounce, &rbio->bio); - - if (!u) - this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); - else - this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); - bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - - /* - * If it's being moved internally, we don't want to flag it as a cache - * hit: - */ - if (ca && pick.ptr.cached && !u) - bch2_bucket_io_time_reset(trans, pick.ptr.dev, - PTR_BUCKET_NR(ca, &pick.ptr), READ); - - if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { - bio_inc_remaining(&orig->bio); - trace_and_count(c, io_read_split, &orig->bio); - } - - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - if (!(flags & BCH_READ_in_retry)) - bch2_trans_unlock(trans); - else - bch2_trans_unlock_long(trans); - - if (likely(!rbio->pick.do_ec_reconstruct)) { - if (unlikely(!rbio->have_ioref)) { - bch2_rbio_error(rbio, - -BCH_ERR_data_read_retry_device_offline, - BLK_STS_IOERR); - goto out; - } - - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], - bio_sectors(&rbio->bio)); - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); - - if (unlikely(c->opts.no_data_io)) { - if (likely(!(flags & BCH_READ_in_retry))) - bio_endio(&rbio->bio); - } else { - if (likely(!(flags & 
BCH_READ_in_retry))) - submit_bio(&rbio->bio); - else - submit_bio_wait(&rbio->bio); - } - - /* - * We just submitted IO which may block, we expect relock fail - * events and shouldn't count them: - */ - trans->notrace_relock_fail = true; - } else { - /* Attempting reconstruct read: */ - if (bch2_ec_read_extent(trans, rbio, k)) { - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err, - BLK_STS_IOERR); - goto out; - } - - if (likely(!(flags & BCH_READ_in_retry))) - bio_endio(&rbio->bio); - } -out: - if (likely(!(flags & BCH_READ_in_retry))) { - return 0; - } else { - bch2_trans_unlock(trans); - - int ret; - - rbio->context = RBIO_CONTEXT_UNBOUND; - bch2_read_endio(&rbio->bio); - - ret = rbio->ret; - rbio = bch2_rbio_free(rbio); - - if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) - bch2_mark_io_failure(failed, &pick, - ret == -BCH_ERR_data_read_retry_csum_err); - - return ret; - } - -err: - if (flags & BCH_READ_in_retry) - return ret; - - orig->bio.bi_status = BLK_STS_IOERR; - orig->ret = ret; - goto out_read_done; - -hole: - this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], - bvec_iter_sectors(iter)); - /* - * won't normally happen in the data update (bch2_move_extent()) path, - * but if we retry and the extent we wanted to read no longer exists we - * have to signal that: - */ - if (u) - orig->ret = bch_err_throw(c, data_read_key_overwritten); - - zero_fill_bio_iter(&orig->bio, iter); -out_read_done: - if ((flags & BCH_READ_last_fragment) && - !(flags & BCH_READ_in_retry)) - bch2_rbio_done(orig); - return 0; -} - -int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, - struct bkey_buf *prev_read, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - enum btree_id data_btree; - int ret; - - EBUG_ON(rbio->data_update); - - bch2_bkey_buf_init(&sk); - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, bvec_iter.bi_sector), - BTREE_ITER_slots); - - while (1) { - data_btree = BTREE_ID_extents; - - bch2_trans_begin(trans); - - u32 snapshot; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - - bch2_btree_iter_set_pos(trans, &iter, - POS(inum.inum, bvec_iter.bi_sector)); - - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - s64 offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - unsigned sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - ret = bch2_read_indirect_extent(trans, &data_btree, - &offset_into_extent, &sk); - if (ret) - goto err; - - k = bkey_i_to_s_c(sk.k); - - if (unlikely(flags & BCH_READ_in_retry)) { - if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k))) - failed->nr = 0; - bch2_bkey_buf_copy(prev_read, c, sk.k); - } - - /* - * With indirect extents, the amount of data to read is the min - * of the original extent and the indirect extent: - */ - sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - - unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; - swap(bvec_iter.bi_size, bytes); - - if (bvec_iter.bi_size == bytes) - flags |= BCH_READ_last_fragment; - - ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, - data_btree, k, - offset_into_extent, failed, flags, -1); - swap(bvec_iter.bi_size, bytes); - - if (ret) - goto err; - - if (flags & 
BCH_READ_last_fragment) - break; - - bio_advance_iter(&rbio->bio, &bvec_iter, bytes); -err: - if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace) - flags |= BCH_READ_must_bounce; - - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_data_read_retry)) - break; - } - - if (unlikely(ret)) { - if (ret != -BCH_ERR_extent_poisoned) { - struct printbuf buf = PRINTBUF; - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, &buf, inum, - bvec_iter.bi_sector << 9)); - prt_printf(&buf, "data read error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - rbio->bio.bi_status = BLK_STS_IOERR; - rbio->ret = ret; - - if (!(flags & BCH_READ_in_retry)) - bch2_rbio_done(rbio); - } - - bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&sk, c); - return ret; -} - -static const char * const bch2_read_bio_flags[] = { -#define x(n) #n, - BCH_READ_FLAGS() -#undef x - NULL -}; - -void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio) -{ - u64 now = local_clock(); - prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0); - prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0); - - if (!rbio->split) - prt_printf(out, "end_io:\t%ps\n", rbio->end_io); - else - prt_printf(out, "parent:\t%px\n", rbio->parent); - - prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io); - - prt_printf(out, "promote:\t%u\n", rbio->promote); - prt_printf(out, "bounce:\t%u\n", rbio->bounce); - prt_printf(out, "split:\t%u\n", rbio->split); - prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref); - prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs); - prt_printf(out, "context:\t%u\n", rbio->context); - - int ret = READ_ONCE(rbio->ret); - if (ret < 0) - prt_printf(out, "ret:\t%s\n", bch2_err_str(ret)); - else - prt_printf(out, "ret:\t%i\n", ret); - - prt_printf(out, "flags:\t"); - bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags); - prt_newline(out); - - bch2_bio_to_text(out, &rbio->bio); -} - -void bch2_fs_io_read_exit(struct bch_fs *c) -{ - if (c->promote_table.tbl) - rhashtable_destroy(&c->promote_table); - bioset_exit(&c->bio_read_split); - bioset_exit(&c->bio_read); - mempool_exit(&c->bio_bounce_pages); -} - -int bch2_fs_io_read_init(struct bch_fs *c) -{ - if (mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->opts.encoded_extent_max) / - PAGE_SIZE, 0)) - return bch_err_throw(c, ENOMEM_bio_bounce_pages_init); - - if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return bch_err_throw(c, ENOMEM_bio_read_init); - - if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return bch_err_throw(c, ENOMEM_bio_read_split_init); - - if (rhashtable_init(&c->promote_table, &bch_promote_params)) - return bch_err_throw(c, ENOMEM_promote_table_init); - - return 0; -} diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h deleted file mode 100644 index 9c5ddbf861b3..000000000000 --- a/fs/bcachefs/io_read.h +++ /dev/null @@ -1,216 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_READ_H -#define _BCACHEFS_IO_READ_H - -#include "bkey_buf.h" -#include "btree_iter.h" -#include "extents_types.h" -#include "reflink.h" - -struct bch_read_bio { - struct bch_fs *c; - u64 start_time; - u64 submit_time; - - /* - * Reads will often have to be split, and if the extent being read from - * was 
checksummed or compressed we'll also have to allocate bounce - * buffers and copy the data back into the original bio. - * - * If we didn't have to split, we have to save and restore the original - * bi_end_io - @split below indicates which: - */ - union { - struct bch_read_bio *parent; - bio_end_io_t *end_io; - }; - - /* - * Saved copy of bio->bi_iter, from submission time - allows us to - * resubmit on IO error, and also to copy data back to the original bio - * when we're bouncing: - */ - struct bvec_iter bvec_iter; - - unsigned offset_into_extent; - - u16 flags; - union { - struct { - u16 data_update:1, - promote:1, - bounce:1, - split:1, - have_ioref:1, - narrow_crcs:1, - saw_error:1, - self_healing:1, - context:2; - }; - u16 _state; - }; - s16 ret; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - - struct extent_ptr_decoded pick; - - /* - * pos we read from - different from data_pos for indirect extents: - */ - u32 subvol; - struct bpos read_pos; - - /* - * start pos of data we read (may not be pos of data we want) - for - * promote, narrow extents paths: - */ - enum btree_id data_btree; - struct bpos data_pos; - struct bversion version; - - struct bch_io_opts opts; - - struct work_struct work; - - struct bio bio; -}; - -#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio) - -struct bch_devs_mask; -struct cache_promote_op; -struct extent_ptr_decoded; - -static inline int bch2_read_indirect_extent(struct btree_trans *trans, - enum btree_id *data_btree, - s64 *offset_into_extent, - struct bkey_buf *extent) -{ - if (extent->k->k.type != KEY_TYPE_reflink_p) - return 0; - - *data_btree = BTREE_ID_reflink; - - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, - offset_into_extent, - bkey_i_to_s_c_reflink_p(extent->k), - true, 0); - int ret = bkey_err(k); - if (ret) - return ret; - - if (bkey_deleted(k.k)) { - bch2_trans_iter_exit(trans, &iter); - return bch_err_throw(c, missing_indirect_extent); - } - - bch2_bkey_buf_reassemble(extent, c, k); - bch2_trans_iter_exit(trans, &iter); - return 0; -} - -#define BCH_READ_FLAGS() \ - x(retry_if_stale) \ - x(may_promote) \ - x(user_mapped) \ - x(last_fragment) \ - x(must_bounce) \ - x(must_clone) \ - x(in_retry) - -enum __bch_read_flags { -#define x(n) __BCH_READ_##n, - BCH_READ_FLAGS() -#undef x -}; - -enum bch_read_flags { -#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), - BCH_READ_FLAGS() -#undef x -}; - -int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, - struct bvec_iter, struct bpos, enum btree_id, - struct bkey_s_c, unsigned, - struct bch_io_failures *, unsigned, int); - -static inline void bch2_read_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, unsigned flags) -{ - int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags, -1); - /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */ - WARN(ret, "unhandled error from __bch2_read_extent()"); -} - -int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, - struct bch_io_failures *, struct bkey_buf *, unsigned flags); - -static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum) -{ - BUG_ON(rbio->_state); - - rbio->subvol = inum.subvol; - - bch2_trans_run(c, - __bch2_read(trans, rbio, rbio->bio.bi_iter, 
inum, NULL, NULL, - BCH_READ_retry_if_stale| - BCH_READ_may_promote| - BCH_READ_user_mapped)); -} - -static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, - struct bch_read_bio *orig) -{ - struct bch_read_bio *rbio = to_rbio(bio); - - rbio->c = orig->c; - rbio->_state = 0; - rbio->flags = 0; - rbio->ret = 0; - rbio->split = true; - rbio->parent = orig; - rbio->opts = orig->opts; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - rbio->list_idx = 0; -#endif - return rbio; -} - -static inline struct bch_read_bio *rbio_init(struct bio *bio, - struct bch_fs *c, - struct bch_io_opts opts, - bio_end_io_t end_io) -{ - struct bch_read_bio *rbio = to_rbio(bio); - - rbio->start_time = local_clock(); - rbio->c = c; - rbio->_state = 0; - rbio->flags = 0; - rbio->ret = 0; - rbio->opts = opts; - rbio->bio.bi_end_io = end_io; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - rbio->list_idx = 0; -#endif - return rbio; -} - -struct promote_op; -void bch2_promote_op_to_text(struct printbuf *, struct promote_op *); -void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *); - -void bch2_fs_io_read_exit(struct bch_fs *); -int bch2_fs_io_read_init(struct bch_fs *); - -#endif /* _BCACHEFS_IO_READ_H */ diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c deleted file mode 100644 index 88b1eec8eff3..000000000000 --- a/fs/bcachefs/io_write.c +++ /dev/null @@ -1,1780 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. - */ - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "async_objs.h" -#include "bkey_buf.h" -#include "bset.h" -#include "btree_update.h" -#include "buckets.h" -#include "checksum.h" -#include "clock.h" -#include "compress.h" -#include "debug.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extent_update.h" -#include "inode.h" -#include "io_write.h" -#include "journal.h" -#include "keylist.h" -#include "move.h" -#include "nocow_locking.h" -#include "rebalance.h" -#include "subvolume.h" -#include "super.h" -#include "super-io.h" -#include "trace.h" - -#include <linux/blkdev.h> -#include <linux/prefetch.h> -#include <linux/random.h> -#include <linux/sched/mm.h> - -#ifdef CONFIG_BCACHEFS_DEBUG -static unsigned bch2_write_corrupt_ratio; -module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644); -MODULE_PARM_DESC(write_corrupt_ratio, ""); -#endif - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - -static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, - u64 now, int rw) -{ - u64 latency_capable = - ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; - /* ideally we'd be taking into account the device's variance here: */ - u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); - s64 latency_over = io_latency - latency_threshold; - - if (latency_threshold && latency_over > 0) { - /* - * bump up congested by approximately latency_over * 4 / - * latency_threshold - we don't need much accuracy here so don't - * bother with the divide: - */ - if (atomic_read(&ca->congested) < CONGESTED_MAX) - atomic_add(latency_over >> - max_t(int, ilog2(latency_threshold) - 2, 0), - &ca->congested); - - ca->congested_last = now; - } else if (atomic_read(&ca->congested) > 0) { - atomic_dec(&ca->congested); - } -} - -void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) -{ - atomic64_t *latency = &ca->cur_latency[rw]; - u64 now = local_clock(); - u64 io_latency = time_after64(now, submit_time) - ? 
now - submit_time - : 0; - u64 old, new; - - old = atomic64_read(latency); - do { - /* - * If the io latency was reasonably close to the current - * latency, skip doing the update and atomic operation - most of - * the time: - */ - if (abs((int) (old - io_latency)) < (old >> 1) && - now & ~(~0U << 5)) - break; - - new = ewma_add(old, io_latency, 5); - } while (!atomic64_try_cmpxchg(latency, &old, new)); - - bch2_congested_acct(ca, io_latency, now, rw); - - __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); -} - -#endif - -/* Allocate, free from mempool: */ - -void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) -{ - struct bvec_iter_all iter; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, iter) - if (bv->bv_page != ZERO_PAGE(0)) - mempool_free(bv->bv_page, &c->bio_bounce_pages); - bio->bi_vcnt = 0; -} - -static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) -{ - struct page *page; - - if (likely(!*using_mempool)) { - page = alloc_page(GFP_NOFS); - if (unlikely(!page)) { - mutex_lock(&c->bio_bounce_pages_lock); - *using_mempool = true; - goto pool_alloc; - - } - } else { -pool_alloc: - page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); - } - - return page; -} - -void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, - size_t size) -{ - bool using_mempool = false; - - while (size) { - struct page *page = __bio_alloc_page_pool(c, &using_mempool); - unsigned len = min_t(size_t, PAGE_SIZE, size); - - BUG_ON(!bio_add_page(bio, page, len, 0)); - size -= len; - } - - if (using_mempool) - mutex_unlock(&c->bio_bounce_pages_lock); -} - -/* Extent update path: */ - -int bch2_sum_sector_overwrites(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *new, - bool *usage_increasing, - s64 *i_sectors_delta, - s64 *disk_sectors_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c old; - unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); - bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); - int ret = 0; - - *usage_increasing = false; - *i_sectors_delta = 0; - *disk_sectors_delta = 0; - - bch2_trans_copy_iter(trans, &iter, extent_iter); - - for_each_btree_key_max_continue_norestart(trans, iter, - new->k.p, BTREE_ITER_slots, old, ret) { - s64 sectors = min(new->k.p.offset, old.k->p.offset) - - max(bkey_start_offset(&new->k), - bkey_start_offset(old.k)); - - *i_sectors_delta += sectors * - (bkey_extent_is_allocation(&new->k) - - bkey_extent_is_allocation(old.k)); - - *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); - *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot - ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) - : 0; - - if (!*usage_increasing && - (new->k.p.snapshot != old.k->p.snapshot || - new_replicas > bch2_bkey_replicas(c, old) || - (!new_compressed && bch2_bkey_sectors_compressed(old)))) - *usage_increasing = true; - - if (bkey_ge(old.k->p, new->k.p)) - break; - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, - struct btree_iter *extent_iter, - u64 new_i_size, - s64 i_sectors_delta) -{ - /* - * Crazy performance optimization: - * Every extent update needs to also update the inode: the inode trigger - * will set bi->journal_seq to the journal sequence number of this - * transaction - for fsync. 
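bch2_latency_acct() above keeps a per-device latency estimate with a lock-free compare-exchange loop, skipping the atomic update entirely when the new sample is close to the current average. A userspace sketch with C11 atomics (ewma_add() here is one common fixed-point EWMA, re-derived for the example, and the skip heuristic is simplified relative to the diff):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* One common fixed-point EWMA: new = old - old/2^w + val/2^w. */
static uint64_t ewma_add(uint64_t old, uint64_t val, unsigned w)
{
	return old - (old >> w) + (val >> w);
}

/* Lock-free update mirroring the compare-exchange loop in
 * bch2_latency_acct(): retry only if another CPU raced with us. */
static void latency_acct(_Atomic uint64_t *latency, uint64_t io_latency)
{
	uint64_t old = atomic_load_explicit(latency, memory_order_relaxed);
	uint64_t new;

	do {
		/* Skip the RMW entirely when the sample is close to the
		 * current average; most updates then cost nothing: */
		if ((old > io_latency ? old - io_latency : io_latency - old) < old / 2)
			return;
		new = ewma_add(old, io_latency, 5);
	} while (!atomic_compare_exchange_weak(latency, &old, new));
}

int main(void)
{
	_Atomic uint64_t lat = 1000;

	latency_acct(&lat, 100000);
	printf("%llu\n", (unsigned long long)atomic_load(&lat));
	return 0;
}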
- * - * But if that's the only reason we're updating the inode (we're not - * updating bi_size or bi_sectors), then we don't need the inode update - * to be journalled - if we crash, the bi_journal_seq update will be - * lost, but that's fine. - */ - unsigned inode_update_flags = BTREE_UPDATE_nojournal; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, - extent_iter->pos.inode, - extent_iter->snapshot), - BTREE_ITER_intent| - BTREE_ITER_cached); - int ret = bkey_err(k); - if (unlikely(ret)) - return ret; - - /* - * varint_decode_fast(), in the inode .invalid method, reads up to 7 - * bytes past the end of the buffer: - */ - struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8); - ret = PTR_ERR_OR_ZERO(k_mut); - if (unlikely(ret)) - goto err; - - bkey_reassemble(k_mut, k); - - if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) { - k_mut = bch2_inode_to_v3(trans, k_mut); - ret = PTR_ERR_OR_ZERO(k_mut); - if (unlikely(ret)) - goto err; - } - - struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut); - - if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) && - new_i_size > le64_to_cpu(inode->v.bi_size)) { - inode->v.bi_size = cpu_to_le64(new_i_size); - inode_update_flags = 0; - } - - if (i_sectors_delta) { - s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors); - if (unlikely(bi_sectors + i_sectors_delta < 0)) { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0", - extent_iter->pos.inode, bi_sectors, i_sectors_delta); - - bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf); - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - if (i_sectors_delta < 0) - i_sectors_delta = -bi_sectors; - else - i_sectors_delta = 0; - } - - le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); - inode_update_flags = 0; - } - - /* - * extents, dirents and xattrs updates require that an inode update also - * happens - to ensure that if a key exists in one of those btrees with - * a given snapshot ID an inode is also present - so we may have to skip - * the nojournal optimization: - */ - if (inode->k.p.snapshot != iter.snapshot) { - inode->k.p.snapshot = iter.snapshot; - inode_update_flags = 0; - } - - ret = bch2_trans_update(trans, &iter, &inode->k_i, - BTREE_UPDATE_internal_snapshot_node| - inode_update_flags); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_extent_update(struct btree_trans *trans, - subvol_inum inum, - struct btree_iter *iter, - struct bkey_i *k, - struct disk_reservation *disk_res, - u64 new_i_size, - s64 *i_sectors_delta_total, - bool check_enospc) -{ - struct bpos next_pos; - bool usage_increasing; - s64 i_sectors_delta = 0, disk_sectors_delta = 0; - int ret; - - /* - * This traverses us the iterator without changing iter->path->pos to - * search_key() (which is pos + 1 for extents): we want there to be a - * path already traversed at iter->pos because - * bch2_trans_extent_update() will use it to attempt extent merging - */ - ret = __bch2_btree_iter_traverse(trans, iter); - if (ret) - return ret; - - ret = bch2_extent_trim_atomic(trans, iter, k); - if (ret) - return ret; - - next_pos = k->k.p; - - ret = bch2_sum_sector_overwrites(trans, iter, k, - &usage_increasing, - &i_sectors_delta, - &disk_sectors_delta); - if (ret) - return ret; - - if (disk_res && - disk_sectors_delta > (s64) disk_res->sectors) { - ret = 
bch2_disk_reservation_add(trans->c, disk_res, - disk_sectors_delta - disk_res->sectors, - !check_enospc || !usage_increasing - ? BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - return ret; - } - - /* - * Note: - * We always have to do an inode update - even when i_size/i_sectors - * aren't changing - for fsync to work properly; fsync relies on - * inode->bi_journal_seq which is updated by the trigger code: - */ - ret = bch2_extent_update_i_size_sectors(trans, iter, - min(k->k.p.offset << 9, new_i_size), - i_sectors_delta) ?: - bch2_trans_update(trans, iter, k, 0) ?: - bch2_trans_commit(trans, disk_res, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc); - if (unlikely(ret)) - return ret; - - if (i_sectors_delta_total) - *i_sectors_delta_total += i_sectors_delta; - bch2_btree_iter_set_pos(trans, iter, next_pos); - return 0; -} - -static int bch2_write_index_default(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct bkey_buf sk; - struct keylist *keys = &op->insert_keys; - struct bkey_i *k = bch2_keylist_front(keys); - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - subvol_inum inum = { - .subvol = op->subvol, - .inum = k->k.p.inode, - }; - int ret; - - BUG_ON(!inum.subvol); - - bch2_bkey_buf_init(&sk); - - do { - bch2_trans_begin(trans); - - k = bch2_keylist_front(keys); - bch2_bkey_buf_copy(&sk, c, k); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, - &sk.k->k.p.snapshot); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - bkey_start_pos(&sk.k->k), - BTREE_ITER_slots|BTREE_ITER_intent); - - ret = bch2_extent_update(trans, inum, &iter, sk.k, - &op->res, - op->new_i_size, &op->i_sectors_delta, - op->flags & BCH_WRITE_check_enospc); - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - if (bkey_ge(iter.pos, k->k.p)) - bch2_keylist_pop_front(&op->insert_keys); - else - bch2_cut_front(iter.pos, k); - } while (!bch2_keylist_empty(keys)); - - bch2_trans_put(trans); - bch2_bkey_buf_exit(&sk, c); - - return ret; -} - -/* Writes */ - -void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...) -{ - struct printbuf buf = PRINTBUF; - - if (op->subvol) { - bch2_inum_offset_err_msg(op->c, &buf, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - } else { - struct bpos pos = op->pos; - pos.offset = offset; - bch2_inum_snap_offset_err_msg(op->c, &buf, pos); - } - - prt_str(&buf, "write error: "); - - va_list args; - va_start(args, fmt); - prt_vprintf(&buf, fmt, args); - va_end(args); - - if (op->flags & BCH_WRITE_move) { - struct data_update *u = container_of(op, struct data_update, op); - - prt_printf(&buf, "\n from internal move "); - bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k)); - } - - bch_err_ratelimited(op->c, "%s", buf.buf); - printbuf_exit(&buf); -} - -void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, - enum bch_data_type type, - const struct bkey_i *k, - bool nocow) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - struct bch_write_bio *n; - unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE; - unsigned ref_idx = type == BCH_DATA_btree - ? 
BCH_DEV_READ_REF_btree_node_write - : BCH_DEV_WRITE_REF_io_write; - - BUG_ON(c->opts.nochanges); - - const struct bch_extent_ptr *last = NULL; - bkey_for_each_ptr(ptrs, ptr) - last = ptr; - - bkey_for_each_ptr(ptrs, ptr) { - /* - * XXX: btree writes should be using io_ref[WRITE], but we - * aren't retrying failed btree writes yet (due to device - * removal/ro): - */ - struct bch_dev *ca = nocow - ? bch2_dev_have_ref(c, ptr->dev) - : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx); - - if (ptr != last) { - n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); - - n->bio.bi_end_io = wbio->bio.bi_end_io; - n->bio.bi_private = wbio->bio.bi_private; - n->parent = wbio; - n->split = true; - n->bounce = false; - n->put_bio = true; - n->bio.bi_opf = wbio->bio.bi_opf; - bio_inc_remaining(&wbio->bio); - } else { - n = wbio; - n->split = false; - } - - n->c = c; - n->dev = ptr->dev; - n->have_ioref = ca != NULL; - n->nocow = nocow; - n->submit_time = local_clock(); - n->inode_offset = bkey_start_offset(&k->k); - if (nocow) - n->nocow_bucket = PTR_BUCKET_NR(ca, ptr); - n->bio.bi_iter.bi_sector = ptr->offset; - - if (likely(n->have_ioref)) { - this_cpu_add(ca->io_done->sectors[WRITE][type], - bio_sectors(&n->bio)); - - bio_set_dev(&n->bio, ca->disk_sb.bdev); - - if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { - bio_endio(&n->bio); - continue; - } - - submit_bio(&n->bio); - } else { - n->bio.bi_status = BLK_STS_REMOVED; - bio_endio(&n->bio); - } - } -} - -static void __bch2_write(struct bch_write_op *); - -static void bch2_write_done(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_fs *c = op->c; - - EBUG_ON(op->open_buckets.nr); - - bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - bch2_disk_reservation_put(c, &op->res); - - if (!(op->flags & BCH_WRITE_move)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_write); - bch2_keylist_free(&op->insert_keys, op->inline_keys); - - EBUG_ON(cl->parent); - closure_debug_destroy(cl); - async_object_list_del(c, write_op, op->list_idx); - if (op->end_io) - op->end_io(op); -} - -static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct keylist *keys = &op->insert_keys; - struct bkey_i *src, *dst = keys->keys, *n; - - for (src = keys->keys; src != keys->top; src = n) { - n = bkey_next(src); - - if (bkey_extent_is_direct_data(&src->k)) { - bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, - test_bit(ptr->dev, op->failed.d)); - - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return bch_err_throw(c, data_write_io); - } - - if (dst != src) - memmove_u64s_down(dst, src, src->k.u64s); - dst = bkey_next(dst); - } - - keys->top = dst; - return 0; -} - -/** - * __bch2_write_index - after a write, update index to point to new data - * @op: bch_write_op to process - */ -static void __bch2_write_index(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct keylist *keys = &op->insert_keys; - unsigned dev; - int ret = 0; - - if (unlikely(op->flags & BCH_WRITE_io_error)) { - ret = bch2_write_drop_io_error_ptrs(op); - if (ret) - goto err; - } - - if (!bch2_keylist_empty(keys)) { - u64 sectors_start = keylist_sectors(keys); - - ret = !(op->flags & BCH_WRITE_move) - ? 
bch2_write_index_default(op) - : bch2_data_update_index_update(op); - - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); - BUG_ON(keylist_sectors(keys) && !ret); - - op->written += sectors_start - keylist_sectors(keys); - - if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { - struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - - bch2_write_op_error(op, bkey_start_offset(&insert->k), - "btree update error: %s", bch2_err_str(ret)); - } - - if (ret) - goto err; - } -out: - /* If a bucket wasn't written, we can't erasure code it: */ - for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) - bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io); - - bch2_open_buckets_put(c, &op->open_buckets); - return; -err: - keys->top = keys->keys; - op->error = ret; - op->flags |= BCH_WRITE_submitted; - goto out; -} - -static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) -{ - if (state != wp->state) { - struct task_struct *p = current; - u64 now = ktime_get_ns(); - u64 runtime = p->se.sum_exec_runtime + - (now - p->se.exec_start); - - if (state == WRITE_POINT_runnable) - wp->last_runtime = runtime; - else if (wp->state == WRITE_POINT_runnable) - wp->time[WRITE_POINT_running] += runtime - wp->last_runtime; - - if (wp->last_state_change && - time_after64(now, wp->last_state_change)) - wp->time[wp->state] += now - wp->last_state_change; - wp->state = state; - wp->last_state_change = now; - } -} - -static inline void wp_update_state(struct write_point *wp, bool running) -{ - enum write_point_state state; - - state = running ? WRITE_POINT_runnable: - !list_empty(&wp->writes) ? WRITE_POINT_waiting_io - : WRITE_POINT_stopped; - - __wp_update_state(wp, state); -} - -static CLOSURE_CALLBACK(bch2_write_index) -{ - closure_type(op, struct bch_write_op, cl); - struct write_point *wp = op->wp; - struct workqueue_struct *wq = index_update_wq(op); - unsigned long flags; - - if ((op->flags & BCH_WRITE_submitted) && - (op->flags & BCH_WRITE_move)) - bch2_bio_free_pages_pool(op->c, &op->wbio.bio); - - spin_lock_irqsave(&wp->writes_lock, flags); - if (wp->state == WRITE_POINT_waiting_io) - __wp_update_state(wp, WRITE_POINT_waiting_work); - list_add_tail(&op->wp_list, &wp->writes); - spin_unlock_irqrestore(&wp->writes_lock, flags); - - queue_work(wq, &wp->index_update_work); -} - -static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) -{ - op->wp = wp; - - if (wp->state == WRITE_POINT_stopped) { - spin_lock_irq(&wp->writes_lock); - __wp_update_state(wp, WRITE_POINT_waiting_io); - spin_unlock_irq(&wp->writes_lock); - } -} - -void bch2_write_point_do_index_updates(struct work_struct *work) -{ - struct write_point *wp = - container_of(work, struct write_point, index_update_work); - struct bch_write_op *op; - - while (1) { - spin_lock_irq(&wp->writes_lock); - op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list); - wp_update_state(wp, op != NULL); - spin_unlock_irq(&wp->writes_lock); - - if (!op) - break; - - op->flags |= BCH_WRITE_in_worker; - - __bch2_write_index(op); - - if (!(op->flags & BCH_WRITE_submitted)) - __bch2_write(op); - else - bch2_write_done(&op->cl); - } -} - -static void bch2_write_endio(struct bio *bio) -{ - struct closure *cl = bio->bi_private; - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_write_bio *wbio = to_wbio(bio); - struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; - struct bch_fs *c = wbio->c; - struct bch_dev *ca = wbio->have_ioref - ? bch2_dev_have_ref(c, wbio->dev) - : NULL; - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, - wbio->submit_time, !bio->bi_status); - - if (unlikely(bio->bi_status)) { - if (ca) - bch_err_inum_offset_ratelimited(ca, - op->pos.inode, - wbio->inode_offset << 9, - "data write error: %s", - bch2_blk_status_to_str(bio->bi_status)); - else - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - wbio->inode_offset << 9, - "data write error: %s", - bch2_blk_status_to_str(bio->bi_status)); - set_bit(wbio->dev, op->failed.d); - op->flags |= BCH_WRITE_io_error; - } - - if (wbio->nocow) { - bch2_bucket_nocow_unlock(&c->nocow_locks, - POS(ca->dev_idx, wbio->nocow_bucket), - BUCKET_NOCOW_LOCK_UPDATE); - set_bit(wbio->dev, op->devs_need_flush->d); - } - - if (wbio->have_ioref) - enumerated_ref_put(&ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_io_write); - - if (wbio->bounce) - bch2_bio_free_pages_pool(c, bio); - - if (wbio->put_bio) - bio_put(bio); - - if (parent) - bio_endio(&parent->bio); - else - closure_put(cl); -} - -static void init_append_extent(struct bch_write_op *op, - struct write_point *wp, - struct bversion version, - struct bch_extent_crc_unpacked crc) -{ - struct bkey_i_extent *e; - - op->pos.offset += crc.uncompressed_size; - - e = bkey_extent_init(op->insert_keys.top); - e->k.p = op->pos; - e->k.size = crc.uncompressed_size; - e->k.bversion = version; - - if (crc.csum_type || - crc.compression_type || - crc.nonce) - bch2_extent_crc_append(&e->k_i, crc); - - bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, - op->flags & BCH_WRITE_cached); - - if (!(op->flags & BCH_WRITE_move)) - bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i); - - bch2_keylist_push(&op->insert_keys); -} - -static struct bio *bch2_write_bio_alloc(struct bch_fs *c, - struct write_point *wp, - struct bio *src, - bool *page_alloc_failed, - void *buf) -{ - struct bch_write_bio *wbio; - struct bio *bio; - unsigned output_available = - min(wp->sectors_free << 9, src->bi_iter.bi_size); - unsigned pages = DIV_ROUND_UP(output_available + - (buf - ? 
((unsigned long) buf & (PAGE_SIZE - 1)) - : 0), PAGE_SIZE); - - pages = min(pages, BIO_MAX_VECS); - - bio = bio_alloc_bioset(NULL, pages, 0, - GFP_NOFS, &c->bio_write); - wbio = wbio_init(bio); - wbio->put_bio = true; - /* copy WRITE_SYNC flag */ - wbio->bio.bi_opf = src->bi_opf; - - if (buf) { - bch2_bio_map(bio, buf, output_available); - return bio; - } - - wbio->bounce = true; - - /* - * We can't use mempool for more than c->sb.encoded_extent_max - * worth of pages, but we'd like to allocate more if we can: - */ - bch2_bio_alloc_pages_pool(c, bio, - min_t(unsigned, output_available, - c->opts.encoded_extent_max)); - - if (bio->bi_iter.bi_size < output_available) - *page_alloc_failed = - bch2_bio_alloc_pages(bio, - output_available - - bio->bi_iter.bi_size, - GFP_NOFS) != 0; - - return bio; -} - -static int bch2_write_rechecksum(struct bch_fs *c, - struct bch_write_op *op, - unsigned new_csum_type) -{ - struct bio *bio = &op->wbio.bio; - struct bch_extent_crc_unpacked new_crc; - - /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ - - if (bch2_csum_type_is_encryption(op->crc.csum_type) != - bch2_csum_type_is_encryption(new_csum_type)) - new_csum_type = op->crc.csum_type; - - int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, - NULL, &new_crc, - op->crc.offset, op->crc.live_size, - new_csum_type); - if (ret) - return ret; - - bio_advance(bio, op->crc.offset << 9); - bio->bi_iter.bi_size = op->crc.live_size << 9; - op->crc = new_crc; - return 0; -} - -static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) -{ - struct bch_fs *c = op->c; - struct bio *bio = &op->wbio.bio; - struct bch_csum csum; - int ret = 0; - - BUG_ON(bio_sectors(bio) != op->crc.compressed_size); - - /* Can we just write the entire extent as is? 
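bch2_write_bio_alloc() above sizes a bounce bio from whichever is smaller, the space left at the write point or the source bio, rounding up to whole pages and accounting for the misalignment of an optional pre-existing (erasure-coding) buffer. That arithmetic, modeled in plain C (the names and the 512-byte sector constant are assumptions of the sketch):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Model of the sizing logic in bch2_write_bio_alloc(): output is capped
 * by the space left at the write point, and the page count must also
 * cover the misalignment of an (optional) pre-existing buffer. */
static unsigned bounce_pages(unsigned sectors_free, unsigned src_bytes,
			     uintptr_t buf /* 0 if none */)
{
	unsigned output_available = sectors_free * 512u < src_bytes
		? sectors_free * 512u : src_bytes;
	unsigned slop = buf ? (unsigned)(buf & (PAGE_SIZE - 1)) : 0;

	return DIV_ROUND_UP(output_available + slop, PAGE_SIZE);
}

int main(void)
{
	/* 120 sectors free, 1 MiB source, buffer 512 bytes into a page: */
	printf("%u\n", bounce_pages(120, 1 << 20, 0x1000 + 512));
	return 0;
}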
*/ - if (op->crc.uncompressed_size == op->crc.live_size && - op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 && - op->crc.compressed_size <= wp->sectors_free && - (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || - op->incompressible)) { - if (!crc_is_compressed(op->crc) && - op->csum_type != op->crc.csum_type) { - ret = bch2_write_rechecksum(c, op, op->csum_type); - if (ret) - return ret; - } - - return 1; - } - - /* - * If the data is compressed and we couldn't write the entire extent as - * is, we have to decompress it: - */ - if (crc_is_compressed(op->crc)) { - /* Last point we can still verify checksum: */ - struct nonce nonce = extent_nonce(op->version, op->crc); - csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - goto csum_err; - - if (bch2_csum_type_is_encryption(op->crc.csum_type)) { - ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); - if (ret) - return ret; - - op->crc.csum_type = 0; - op->crc.csum = (struct bch_csum) { 0, 0 }; - } - - ret = bch2_bio_uncompress_inplace(op, bio); - if (ret) - return ret; - } - - /* - * No longer have compressed data after this point - data might be - * encrypted: - */ - - /* - * If the data is checksummed and we're only writing a subset, - * rechecksum and adjust bio to point to currently live data: - */ - if (op->crc.live_size != op->crc.uncompressed_size || - op->crc.csum_type != op->csum_type) { - ret = bch2_write_rechecksum(c, op, op->csum_type); - if (ret) - return ret; - } - - /* - * If we want to compress the data, it has to be decrypted: - */ - if (bch2_csum_type_is_encryption(op->crc.csum_type) && - (op->compression_opt || op->crc.csum_type != op->csum_type)) { - struct nonce nonce = extent_nonce(op->version, op->crc); - csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - goto csum_err; - - ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); - if (ret) - return ret; - - op->crc.csum_type = 0; - op->crc.csum = (struct bch_csum) { 0, 0 }; - } - - return 0; -csum_err: - bch2_write_op_error(op, op->pos.offset, - "error verifying existing checksum while moving existing data (memory corruption?)\n" - " expected %0llx:%0llx got %0llx:%0llx type %s", - op->crc.csum.hi, - op->crc.csum.lo, - csum.hi, - csum.lo, - op->crc.csum_type < BCH_CSUM_NR - ? 
__bch2_csum_types[op->crc.csum_type] - : "(unknown)"); - return bch_err_throw(c, data_write_csum); -} - -static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - struct bio **_dst) -{ - struct bch_fs *c = op->c; - struct bio *src = &op->wbio.bio, *dst = src; - struct bvec_iter saved_iter; - void *ec_buf; - unsigned total_output = 0, total_input = 0; - bool bounce = false; - bool page_alloc_failed = false; - int ret, more = 0; - - if (op->incompressible) - op->compression_opt = 0; - - BUG_ON(!bio_sectors(src)); - - ec_buf = bch2_writepoint_ec_buf(c, wp); - - if (unlikely(op->flags & BCH_WRITE_data_encoded)) { - ret = bch2_write_prep_encoded_data(op, wp); - if (ret < 0) - goto err; - if (ret) { - if (ec_buf) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bio_copy_data(dst, src); - bounce = true; - } - init_append_extent(op, wp, op->version, op->crc); - goto do_write; - } - } - - if (ec_buf || - op->compression_opt || - (op->csum_type && - !(op->flags & BCH_WRITE_pages_stable)) || - (bch2_csum_type_is_encryption(op->csum_type) && - !(op->flags & BCH_WRITE_pages_owned))) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bounce = true; - } - -#ifdef CONFIG_BCACHEFS_DEBUG - unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio); - if (!bounce && write_corrupt_ratio) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bounce = true; - } -#endif - saved_iter = dst->bi_iter; - - do { - struct bch_extent_crc_unpacked crc = { 0 }; - struct bversion version = op->version; - size_t dst_len = 0, src_len = 0; - - if (page_alloc_failed && - dst->bi_iter.bi_size < (wp->sectors_free << 9) && - dst->bi_iter.bi_size < c->opts.encoded_extent_max) - break; - - BUG_ON(op->compression_opt && - (op->flags & BCH_WRITE_data_encoded) && - bch2_csum_type_is_encryption(op->crc.csum_type)); - BUG_ON(op->compression_opt && !bounce); - - crc.compression_type = op->incompressible - ? BCH_COMPRESSION_TYPE_incompressible - : op->compression_opt - ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, - op->compression_opt) - : 0; - if (!crc_is_compressed(crc)) { - dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); - - if (op->csum_type) - dst_len = min_t(unsigned, dst_len, - c->opts.encoded_extent_max); - - if (bounce) { - swap(dst->bi_iter.bi_size, dst_len); - bio_copy_data(dst, src); - swap(dst->bi_iter.bi_size, dst_len); - } - - src_len = dst_len; - } - - BUG_ON(!src_len || !dst_len); - - if (bch2_csum_type_is_encryption(op->csum_type)) { - if (bversion_zero(version)) { - version.lo = atomic64_inc_return(&c->key_version); - } else { - crc.nonce = op->nonce; - op->nonce += src_len >> 9; - } - } - - if ((op->flags & BCH_WRITE_data_encoded) && - !crc_is_compressed(crc) && - bch2_csum_type_is_encryption(op->crc.csum_type) == - bch2_csum_type_is_encryption(op->csum_type)) { - u8 compression_type = crc.compression_type; - u16 nonce = crc.nonce; - /* - * Note: when we're using rechecksum(), we need to be - * checksumming @src because it has all the data our - * existing checksum covers - if we bounced (because we - * were trying to compress), @dst will only have the - * part of the data the new checksum will cover. - * - * But normally we want to be checksumming post bounce, - * because part of the reason for bouncing is so the - * data can't be modified (by userspace) while it's in - * flight. 
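The comment above captures the key ordering rule for bounced writes: the checksum must be computed over bytes the kernel owns, after the copy, otherwise a concurrent writer to the user-mapped source can make the data and its checksum disagree on disk. A toy model of checksum-after-bounce (toy_csum() is a stand-in, not bch2_checksum_bio()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy additive checksum, standing in for the real checksum function: */
static uint32_t toy_csum(const uint8_t *p, size_t n)
{
	uint32_t s = 0;

	while (n--)
		s += *p++;
	return s;
}

/* Checksum the stable private copy, never the user-visible source: */
static uint32_t bounce_and_csum(uint8_t *dst, const uint8_t *src, size_t n)
{
	memcpy(dst, src, n);	 /* bounce: data is now stable */
	return toy_csum(dst, n); /* sum what will actually be written */
}

int main(void)
{
	uint8_t user_buf[8] = "payload";
	uint8_t bounce[8];
	uint32_t c = bounce_and_csum(bounce, user_buf, sizeof(user_buf));

	user_buf[0] = 'X';	/* a later scribble no longer matters */
	printf("%u\n", c);
	return 0;
}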
- */ - ret = bch2_rechecksum_bio(c, src, version, op->crc, - &crc, &op->crc, - src_len >> 9, - bio_sectors(src) - (src_len >> 9), - op->csum_type); - if (ret) - goto err; - /* - * rechecksum_bio sets compression_type on crc from op->crc, - * this isn't always correct as sometimes we're changing - * an extent from uncompressed to incompressible. - */ - crc.compression_type = compression_type; - crc.nonce = nonce; - } else { - if ((op->flags & BCH_WRITE_data_encoded) && - (ret = bch2_rechecksum_bio(c, src, version, op->crc, - NULL, &op->crc, - src_len >> 9, - bio_sectors(src) - (src_len >> 9), - op->crc.csum_type))) - goto err; - - crc.compressed_size = dst_len >> 9; - crc.uncompressed_size = src_len >> 9; - crc.live_size = src_len >> 9; - - swap(dst->bi_iter.bi_size, dst_len); - ret = bch2_encrypt_bio(c, op->csum_type, - extent_nonce(version, crc), dst); - if (ret) - goto err; - - crc.csum = bch2_checksum_bio(c, op->csum_type, - extent_nonce(version, crc), dst); - crc.csum_type = op->csum_type; - swap(dst->bi_iter.bi_size, dst_len); - } - - init_append_extent(op, wp, version, crc); - -#ifdef CONFIG_BCACHEFS_DEBUG - if (write_corrupt_ratio) { - swap(dst->bi_iter.bi_size, dst_len); - bch2_maybe_corrupt_bio(dst, write_corrupt_ratio); - swap(dst->bi_iter.bi_size, dst_len); - } -#endif - - if (dst != src) - bio_advance(dst, dst_len); - bio_advance(src, src_len); - total_output += dst_len; - total_input += src_len; - } while (dst->bi_iter.bi_size && - src->bi_iter.bi_size && - wp->sectors_free && - !bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)); - - more = src->bi_iter.bi_size != 0; - - dst->bi_iter = saved_iter; - - if (dst == src && more) { - BUG_ON(total_output != total_input); - - dst = bio_split(src, total_input >> 9, - GFP_NOFS, &c->bio_write); - wbio_init(dst)->put_bio = true; - /* copy WRITE_SYNC flag */ - dst->bi_opf = src->bi_opf; - } - - dst->bi_iter.bi_size = total_output; -do_write: - *_dst = dst; - return more; -err: - if (to_wbio(dst)->bounce) - bch2_bio_free_pages_pool(c, dst); - if (to_wbio(dst)->put_bio) - bio_put(dst); - - return ret; -} - -static bool bch2_extent_is_writeable(struct bch_write_op *op, - struct bkey_s_c k) -{ - struct bch_fs *c = op->c; - struct bkey_s_c_extent e; - struct extent_ptr_decoded p; - const union bch_extent_entry *entry; - unsigned replicas = 0; - - if (k.k->type != KEY_TYPE_extent) - return false; - - e = bkey_s_c_to_extent(k); - - guard(rcu)(); - extent_for_each_ptr_decode(e, p, entry) { - if (crc_is_encoded(p.crc) || p.has_ec) - return false; - - replicas += bch2_extent_ptr_durability(c, &p); - } - - return replicas >= op->opts.data_replicas; -} - -static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *orig, - struct bkey_s_c k, - u64 new_i_size) -{ - if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { - /* trace this */ - return 0; - } - - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bch2_cut_front(bkey_start_pos(&orig->k), new); - bch2_cut_back(orig->k.p, new); - - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr(ptrs, ptr) - ptr->unwritten = 0; - - /* - * Note that we're not calling bch2_subvol_get_snapshot() in this path - - * that was done when we kicked off the write, and here it's important - * that we update the extent that we wrote to - even if a snapshot has - * since been created. 
The write is still outstanding, so we're ok - * w.r.t. snapshot atomicity: - */ - return bch2_extent_update_i_size_sectors(trans, iter, - min(new->k.p.offset << 9, new_i_size), 0) ?: - bch2_trans_update(trans, iter, new, - BTREE_UPDATE_internal_snapshot_node); -} - -static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - - for_each_keylist_key(&op->insert_keys, orig) { - ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, - bkey_start_pos(&orig->k), orig->k.p, - BTREE_ITER_intent, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); - })); - if (ret) - break; - } - - bch2_trans_put(trans); - - if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - bch2_write_op_error(op, bkey_start_offset(&insert->k), - "btree update error: %s", bch2_err_str(ret)); - } - - if (ret) - op->error = ret; -} - -static void __bch2_nocow_write_done(struct bch_write_op *op) -{ - if (unlikely(op->flags & BCH_WRITE_io_error)) { - op->error = bch_err_throw(op->c, data_write_io); - } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) - bch2_nocow_write_convert_unwritten(op); -} - -static CLOSURE_CALLBACK(bch2_nocow_write_done) -{ - closure_type(op, struct bch_write_op, cl); - - __bch2_nocow_write_done(op); - bch2_write_done(cl); -} - -struct bucket_to_lock { - struct bpos b; - unsigned gen; - struct nocow_lock_bucket *l; -}; - -static void bch2_nocow_write(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; - u32 snapshot; - struct bucket_to_lock *stale_at; - int stale, ret; - - if (op->flags & BCH_WRITE_move) - return; - - darray_init(&buckets); - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot); - if (unlikely(ret)) - goto err; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(op->pos.inode, op->pos.offset, snapshot), - BTREE_ITER_slots); - while (1) { - struct bio *bio = &op->wbio.bio; - - buckets.nr = 0; - - ret = bch2_trans_relock(trans); - if (ret) - break; - - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - break; - - /* fall back to normal cow write path? 
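The check that follows decides whether the nocow path must bail back to copy-on-write. bch2_extent_is_writeable(), shown earlier in this file, encodes the rule: overwriting in place is only safe when no pointer is checksummed/compressed (encoded) or erasure coded, and the extent already provides enough durability. A model of that predicate (struct ptr_info is invented for the sketch):

#include <stdbool.h>

/* Illustrative per-pointer summary, mirroring what
 * bch2_extent_is_writeable() derives while walking the extent: */
struct ptr_info {
	bool crc_encoded;	/* checksummed or compressed */
	bool has_ec;		/* participates in an erasure-coded stripe */
	unsigned durability;
};

static bool extent_is_nocow_writeable(const struct ptr_info *ptrs, int n,
				      unsigned want_replicas)
{
	unsigned durability = 0;

	for (int i = 0; i < n; i++) {
		/* Rewriting in place can't re-encode data or repair a
		 * stripe, so any encoded or EC pointer disqualifies: */
		if (ptrs[i].crc_encoded || ptrs[i].has_ec)
			return false;
		durability += ptrs[i].durability;
	}
	return durability >= want_replicas;
}

int main(void)
{
	struct ptr_info ptrs[2] = { { false, false, 1 }, { false, false, 1 } };

	return !extent_is_nocow_writeable(ptrs, 2, 2);
}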
*/ - if (unlikely(k.k->p.snapshot != snapshot || - !bch2_extent_is_writeable(op, k))) - break; - - if (bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - k.k->u64s)) - break; - - /* Get iorefs before dropping btree locks: */ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE, - BCH_DEV_WRITE_REF_io_write); - if (unlikely(!ca)) - goto err_get_ioref; - - struct bpos b = PTR_BUCKET_POS(ca, ptr); - struct nocow_lock_bucket *l = - bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b)); - prefetch(l); - - /* XXX allocating memory with btree locks held - rare */ - darray_push_gfp(&buckets, ((struct bucket_to_lock) { - .b = b, .gen = ptr->gen, .l = l, - }), GFP_KERNEL|__GFP_NOFAIL); - - if (ptr->unwritten) - op->flags |= BCH_WRITE_convert_unwritten; - } - - /* Unlock before taking nocow locks, doing IO: */ - bkey_reassemble(op->insert_keys.top, k); - bch2_trans_unlock(trans); - - bch2_cut_front(op->pos, op->insert_keys.top); - if (op->flags & BCH_WRITE_convert_unwritten) - bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); - - darray_for_each(buckets, i) { - struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode); - - __bch2_bucket_nocow_lock(&c->nocow_locks, i->l, - bucket_to_u64(i->b), - BUCKET_NOCOW_LOCK_UPDATE); - - int gen = bucket_gen_get(ca, i->b.offset); - stale = gen < 0 ? gen : gen_after(gen, i->gen); - if (unlikely(stale)) { - stale_at = i; - goto err_bucket_stale; - } - } - - bio = &op->wbio.bio; - if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { - bio = bio_split(bio, k.k->p.offset - op->pos.offset, - GFP_KERNEL, &c->bio_write); - wbio_init(bio)->put_bio = true; - bio->bi_opf = op->wbio.bio.bi_opf; - } else { - op->flags |= BCH_WRITE_submitted; - } - - op->pos.offset += bio_sectors(bio); - op->written += bio_sectors(bio); - - bio->bi_end_io = bch2_write_endio; - bio->bi_private = &op->cl; - bio->bi_opf |= REQ_OP_WRITE; - closure_get(&op->cl); - - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, - op->insert_keys.top, true); - - bch2_keylist_push(&op->insert_keys); - if (op->flags & BCH_WRITE_submitted) - break; - bch2_btree_iter_advance(trans, &iter); - } -out: - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - darray_exit(&buckets); - - if (ret) { - bch2_write_op_error(op, op->pos.offset, - "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); - op->error = ret; - op->flags |= BCH_WRITE_submitted; - } - - /* fallback to cow write path? 
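The staleness test above, stale = gen < 0 ? gen : gen_after(gen, i->gen), hinges on bucket generations being small counters that wrap around, so "newer" must be computed with signed modular arithmetic. A self-contained sketch of that comparison (gen_after() here is a re-derivation using the usual signed-difference idiom, assuming 8-bit generations):

#include <stdint.h>
#include <stdio.h>

/* Bucket generations wrap; the difference is meaningful only when
 * interpreted as a signed value of the same width: */
static int gen_after(uint8_t a, uint8_t b)
{
	int8_t d = (int8_t)(a - b);

	return d > 0 ? d : 0;
}

int main(void)
{
	/* 1 is two generations after 255 despite the wrap: */
	printf("%d\n", gen_after(1, 255));
	/* A pointer gen matching the in-memory gen is not stale: */
	printf("%d\n", gen_after(42, 42));
	return 0;
}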
*/ - if (!(op->flags & BCH_WRITE_submitted)) { - closure_sync(&op->cl); - __bch2_nocow_write_done(op); - op->insert_keys.top = op->insert_keys.keys; - } else if (op->flags & BCH_WRITE_sync) { - closure_sync(&op->cl); - bch2_nocow_write_done(&op->cl.work); - } else { - /* - * XXX - * needs to run out of process context because ei_quota_lock is - * a mutex - */ - continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); - } - return; -err_get_ioref: - darray_for_each(buckets, i) - enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE], - BCH_DEV_WRITE_REF_io_write); - - /* Fall back to COW path: */ - goto out; -err_bucket_stale: - darray_for_each(buckets, i) { - bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE); - if (i == stale_at) - break; - } - - struct printbuf buf = PRINTBUF; - if (bch2_fs_inconsistent_on(stale < 0, c, - "pointer to invalid bucket in nocow path on device %llu\n %s", - stale_at->b.inode, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch_err_throw(c, data_write_invalid_ptr); - } else { - /* We can retry this: */ - ret = bch_err_throw(c, transaction_restart); - } - printbuf_exit(&buf); - - goto err_get_ioref; -} - -static void __bch2_write(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct write_point *wp = NULL; - struct bio *bio = NULL; - unsigned nofs_flags; - int ret; - - nofs_flags = memalloc_nofs_save(); - - if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { - bch2_nocow_write(op); - if (op->flags & BCH_WRITE_submitted) - goto out_nofs_restore; - } -again: - memset(&op->failed, 0, sizeof(op->failed)); - - do { - struct bkey_i *key_to_write; - unsigned key_to_write_offset = op->insert_keys.top_p - - op->insert_keys.keys_p; - - /* +1 for possible cache device: */ - if (op->open_buckets.nr + op->nr_replicas + 1 > - ARRAY_SIZE(op->open_buckets.v)) - break; - - if (bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)) - break; - - /* - * The copygc thread is now global, which means it's no longer - * freeing up space on specific disks, which means that - * allocations for specific disks may hang arbitrarily long: - */ - ret = bch2_trans_run(c, lockrestart_do(trans, - bch2_alloc_sectors_start_trans(trans, - op->target, - op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), - op->write_point, - &op->devs_have, - op->nr_replicas, - op->nr_replicas_required, - op->watermark, - op->flags, - &op->cl, &wp))); - if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - break; - - goto err; - } - - EBUG_ON(!wp); - - bch2_open_bucket_get(c, wp, &op->open_buckets); - ret = bch2_write_extent(op, wp, &bio); - - bch2_alloc_sectors_done_inlined(c, wp); -err: - if (ret <= 0) { - op->flags |= BCH_WRITE_submitted; - - if (unlikely(ret < 0)) { - if (!(op->flags & BCH_WRITE_alloc_nowait)) - bch2_write_op_error(op, op->pos.offset, - "%s(): %s", __func__, bch2_err_str(ret)); - op->error = ret; - break; - } - } - - bio->bi_end_io = bch2_write_endio; - bio->bi_private = &op->cl; - bio->bi_opf |= REQ_OP_WRITE; - - closure_get(bio->bi_private); - - key_to_write = (void *) (op->insert_keys.keys_p + - key_to_write_offset); - - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, - key_to_write, false); - } while (ret); - - /* - * Sync or no? - * - * If we're running asynchronously, wne may still want to block - * synchronously here if we weren't able to submit all of the IO at - * once, as that signals backpressure to the caller. 
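(As a minimal restatement of the sync-or-async decision the next hunk makes - illustration only; the helper below is hypothetical, but the BCH_WRITE_* flags are the ones defined in io_write_types.h further down this patch:)

/*
 * Illustration: when __bch2_write() blocks rather than queueing the
 * index update, restated as a predicate over the write-op flags.
 */
static inline bool example_write_should_block(unsigned flags)
{
	/* Explicitly synchronous writes always wait for completion: */
	if (flags & BCH_WRITE_sync)
		return true;

	/*
	 * Otherwise, block only when the IO could not all be submitted
	 * and we aren't already in a worker - this is how allocator
	 * backpressure reaches the caller:
	 */
	return !(flags & BCH_WRITE_submitted) &&
	       !(flags & BCH_WRITE_in_worker);
}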
- */ - if ((op->flags & BCH_WRITE_sync) || - (!(op->flags & BCH_WRITE_submitted) && - !(op->flags & BCH_WRITE_in_worker))) { - bch2_wait_on_allocator(c, &op->cl); - - __bch2_write_index(op); - - if (!(op->flags & BCH_WRITE_submitted)) - goto again; - bch2_write_done(&op->cl); - } else { - bch2_write_queue(op, wp); - continue_at(&op->cl, bch2_write_index, NULL); - } -out_nofs_restore: - memalloc_nofs_restore(nofs_flags); -} - -static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) -{ - struct bio *bio = &op->wbio.bio; - struct bvec_iter iter; - struct bkey_i_inline_data *id; - unsigned sectors; - int ret; - - memset(&op->failed, 0, sizeof(op->failed)); - - op->flags |= BCH_WRITE_wrote_data_inline; - op->flags |= BCH_WRITE_submitted; - - bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); - - ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_U64s + DIV_ROUND_UP(data_len, 8)); - if (ret) { - op->error = ret; - goto err; - } - - sectors = bio_sectors(bio); - op->pos.offset += sectors; - - id = bkey_inline_data_init(op->insert_keys.top); - id->k.p = op->pos; - id->k.bversion = op->version; - id->k.size = sectors; - - iter = bio->bi_iter; - iter.bi_size = data_len; - memcpy_from_bio(id->v.data, bio, iter); - - while (data_len & 7) - id->v.data[data_len++] = '\0'; - set_bkey_val_bytes(&id->k, data_len); - bch2_keylist_push(&op->insert_keys); - - __bch2_write_index(op); -err: - bch2_write_done(&op->cl); -} - -/** - * bch2_write() - handle a write to a cache device or flash only volume - * @cl: &bch_write_op->cl - * - * This is the starting point for any data to end up in a cache device; it could - * be from a normal write, or a writeback write, or a write to a flash only - * volume - it's also used by the moving garbage collector to compact data in - * mostly empty buckets. - * - * It first writes the data to the cache, creating a list of keys to be inserted - * (if the data won't fit in a single open bucket, there will be multiple keys); - * after the data is written it calls bch_journal, and after the keys have been - * added to the next journal write they're inserted into the btree. - * - * If op->discard is true, instead of inserting the data it invalidates the - * region of the cache represented by op->bio and op->inode. 
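(A small worked sketch of the size accounting bch2_write_data_inline() above relies on: the value is zero-padded to a u64 boundary, so the key for data_len bytes needs BKEY_U64s + DIV_ROUND_UP(data_len, 8) u64s. The helper is hypothetical; the expression is the one passed to bch2_keylist_realloc() above:)

/*
 * Illustration: key size for an inline-data extent of data_len bytes,
 * matching the `while (data_len & 7)` padding loop above.
 */
static inline unsigned example_inline_data_key_u64s(unsigned data_len)
{
	return BKEY_U64s + DIV_ROUND_UP(data_len, 8);
}

For example, a 10-byte write pads to 16 bytes and takes BKEY_U64s + 2 u64s.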
- */ -CLOSURE_CALLBACK(bch2_write) -{ - closure_type(op, struct bch_write_op, cl); - struct bio *bio = &op->wbio.bio; - struct bch_fs *c = op->c; - unsigned data_len; - - EBUG_ON(op->cl.parent); - BUG_ON(!op->nr_replicas); - BUG_ON(!op->write_point.v); - BUG_ON(bkey_eq(op->pos, POS_MAX)); - - async_object_list_add(c, write_op, op, &op->list_idx); - - if (op->flags & BCH_WRITE_only_specified_devs) - op->flags |= BCH_WRITE_alloc_nowait; - - op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); - op->start_time = local_clock(); - bch2_keylist_init(&op->insert_keys, op->inline_keys); - wbio_init(bio)->put_bio = false; - - if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { - bch2_write_op_error(op, op->pos.offset, "misaligned write"); - op->error = bch_err_throw(c, data_write_misaligned); - goto err; - } - - if (c->opts.nochanges) { - op->error = bch_err_throw(c, erofs_no_writes); - goto err; - } - - if (!(op->flags & BCH_WRITE_move) && - !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) { - op->error = bch_err_throw(c, erofs_no_writes); - goto err; - } - - if (!(op->flags & BCH_WRITE_move)) - this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); - bch2_increment_clock(c, bio_sectors(bio), WRITE); - - data_len = min_t(u64, bio->bi_iter.bi_size, - op->new_i_size - (op->pos.offset << 9)); - - if (c->opts.inline_data && - data_len <= min(block_bytes(c) / 2, 1024U)) { - bch2_write_data_inline(op, data_len); - return; - } - - __bch2_write(op); - return; -err: - bch2_disk_reservation_put(c, &op->res); - - closure_debug_destroy(&op->cl); - async_object_list_del(c, write_op, op->list_idx); - if (op->end_io) - op->end_io(op); -} - -static const char * const bch2_write_flags[] = { -#define x(f) #f, - BCH_WRITE_FLAGS() -#undef x - NULL -}; - -void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - prt_printf(out, "pos:\t"); - bch2_bpos_to_text(out, op->pos); - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "started:\t"); - bch2_pr_time_units(out, local_clock() - op->start_time); - prt_newline(out); - - prt_printf(out, "flags:\t"); - prt_bitflags(out, bch2_write_flags, op->flags); - prt_newline(out); - - prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas); - prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); - - prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); - prt_printf(out, "ret\t%s\n", bch2_err_str(op->error)); - - printbuf_indent_sub(out, 2); -} - -void bch2_fs_io_write_exit(struct bch_fs *c) -{ - bioset_exit(&c->replica_set); - bioset_exit(&c->bio_write); -} - -int bch2_fs_io_write_init(struct bch_fs *c) -{ - if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || - bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) - return bch_err_throw(c, ENOMEM_bio_write_init); - - return 0; -} diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h deleted file mode 100644 index 2c0a8f35ee1f..000000000000 --- a/fs/bcachefs/io_write.h +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_WRITE_H -#define _BCACHEFS_IO_WRITE_H - -#include "checksum.h" -#include "io_write_types.h" - -#define to_wbio(_bio) \ - container_of((_bio), struct bch_write_bio, bio) - -void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); -void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); - -void 
bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - enum bch_data_type, const struct bkey_i *, bool); - -__printf(3, 4) -void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); - -static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) -{ - return op->watermark == BCH_WATERMARK_copygc - ? op->c->copygc_wq - : op->c->btree_update_wq; -} - -int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, - struct bkey_i *, bool *, s64 *, s64 *); -int bch2_extent_update(struct btree_trans *, subvol_inum, - struct btree_iter *, struct bkey_i *, - struct disk_reservation *, u64, s64 *, bool); - -static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct bch_io_opts opts) -{ - op->c = c; - op->end_io = NULL; - op->flags = 0; - op->written = 0; - op->error = 0; - op->csum_type = bch2_data_checksum_type(c, opts); - op->compression_opt = opts.compression; - op->nr_replicas = 0; - op->nr_replicas_required = c->opts.data_replicas_required; - op->watermark = BCH_WATERMARK_normal; - op->incompressible = 0; - op->open_buckets.nr = 0; - op->devs_have.nr = 0; - op->target = 0; - op->opts = opts; - op->subvol = 0; - op->pos = POS_MAX; - op->version = ZERO_VERSION; - op->write_point = (struct write_point_specifier) { 0 }; - op->res = (struct disk_reservation) { 0 }; - op->new_i_size = U64_MAX; - op->i_sectors_delta = 0; - op->devs_need_flush = NULL; -} - -CLOSURE_CALLBACK(bch2_write); -void bch2_write_point_do_index_updates(struct work_struct *); - -static inline struct bch_write_bio *wbio_init(struct bio *bio) -{ - struct bch_write_bio *wbio = to_wbio(bio); - - memset(&wbio->wbio, 0, sizeof(wbio->wbio)); - return wbio; -} - -void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); - -void bch2_fs_io_write_exit(struct bch_fs *); -int bch2_fs_io_write_init(struct bch_fs *); - -#endif /* _BCACHEFS_IO_WRITE_H */ diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h deleted file mode 100644 index 5da4eb8bb6f6..000000000000 --- a/fs/bcachefs/io_write_types.h +++ /dev/null @@ -1,129 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_WRITE_TYPES_H -#define _BCACHEFS_IO_WRITE_TYPES_H - -#include "alloc_types.h" -#include "btree_types.h" -#include "buckets_types.h" -#include "extents_types.h" -#include "keylist_types.h" -#include "opts.h" -#include "super_types.h" - -#include <linux/llist.h> -#include <linux/workqueue.h> - -#define BCH_WRITE_FLAGS() \ - x(alloc_nowait) \ - x(cached) \ - x(data_encoded) \ - x(pages_stable) \ - x(pages_owned) \ - x(only_specified_devs) \ - x(wrote_data_inline) \ - x(check_enospc) \ - x(sync) \ - x(move) \ - x(in_worker) \ - x(submitted) \ - x(io_error) \ - x(convert_unwritten) - -enum __bch_write_flags { -#define x(f) __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -enum bch_write_flags { -#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), - BCH_WRITE_FLAGS() -#undef x -}; - -struct bch_write_bio { - struct_group(wbio, - struct bch_fs *c; - struct bch_write_bio *parent; - - u64 submit_time; - u64 inode_offset; - u64 nocow_bucket; - - struct bch_devs_list failed; - u8 dev; - - unsigned split:1, - bounce:1, - put_bio:1, - have_ioref:1, - nocow:1, - used_mempool:1, - first_btree_write:1; - ); - - struct bio bio; -}; - -struct bch_write_op { - struct closure cl; - struct bch_fs *c; - void (*end_io)(struct bch_write_op *); - u64 start_time; - -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - - unsigned written; 
/* sectors */ - u16 flags; - s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ - - unsigned compression_opt:8; - unsigned csum_type:4; - unsigned nr_replicas:4; - unsigned nr_replicas_required:4; - unsigned watermark:3; - unsigned incompressible:1; - unsigned stripe_waited:1; - - struct bch_devs_list devs_have; - u16 target; - u16 nonce; - struct bch_io_opts opts; - - u32 subvol; - struct bpos pos; - struct bversion version; - - /* For BCH_WRITE_data_encoded: */ - struct bch_extent_crc_unpacked crc; - - struct write_point_specifier write_point; - - struct write_point *wp; - struct list_head wp_list; - - struct disk_reservation res; - - struct open_buckets open_buckets; - - u64 new_i_size; - s64 i_sectors_delta; - - struct bch_devs_mask failed; - - struct keylist insert_keys; - u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; - - /* - * Bitmask of devices that have had nocow writes issued to them since - * last flush: - */ - struct bch_devs_mask *devs_need_flush; - - /* Must be last: */ - struct bch_write_bio wbio; -}; - -#endif /* _BCACHEFS_IO_WRITE_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c deleted file mode 100644 index ddfeb0dafc9d..000000000000 --- a/fs/bcachefs/journal.c +++ /dev/null @@ -1,1832 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bcachefs journalling code, for btree insertions - * - * Copyright 2012 Google, Inc. - */ - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_methods.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "enumerated_ref.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "journal_sb.h" -#include "journal_seq_blacklist.h" -#include "trace.h" - -static inline bool journal_seq_unwritten(struct journal *j, u64 seq) -{ - return seq > j->seq_ondisk; -} - -static bool __journal_entry_is_open(union journal_res_state state) -{ - return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -} - -static inline unsigned nr_unwritten_journal_entries(struct journal *j) -{ - return atomic64_read(&j->seq) - j->seq_ondisk; -} - -static bool journal_entry_is_open(struct journal *j) -{ - return __journal_entry_is_open(j->reservations); -} - -static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) -{ - union journal_res_state s = READ_ONCE(j->reservations); - unsigned i = seq & JOURNAL_BUF_MASK; - struct journal_buf *buf = j->buf + i; - - prt_printf(out, "seq:\t%llu\n", seq); - printbuf_indent_add(out, 2); - - if (!buf->write_started) - prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK)); - - struct closure *cl = &buf->io; - int r = atomic_read(&cl->remaining); - prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK); - - if (buf->data) { - prt_printf(out, "size:\t"); - prt_human_readable_u64(out, vstruct_bytes(buf->data)); - prt_newline(out); - } - - prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies); - - prt_printf(out, "flags:\t"); - if (buf->noflush) - prt_str(out, "noflush "); - if (buf->must_flush) - prt_str(out, "must_flush "); - if (buf->separate_flush) - prt_str(out, "separate_flush "); - if (buf->need_flush_to_write_buffer) - prt_str(out, "need_flush_to_write_buffer "); - if (buf->write_started) - prt_str(out, "write_started "); - if (buf->write_allocated) - prt_str(out, "write_allocated "); - if (buf->write_done) - prt_str(out, "write_done"); - prt_newline(out); - - 
printbuf_indent_sub(out, 2); -} - -static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) -{ - lockdep_assert_held(&j->lock); - out->atomic++; - - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 24); - - for (u64 seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j); - seq++) - bch2_journal_buf_to_text(out, j, seq); - prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); - - --out->atomic; -} - -static inline struct journal_buf * -journal_seq_to_buf(struct journal *j, u64 seq) -{ - struct journal_buf *buf = NULL; - - EBUG_ON(seq > journal_cur_seq(j)); - - if (journal_seq_unwritten(j, seq)) - buf = j->buf + (seq & JOURNAL_BUF_MASK); - return buf; -} - -static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) -{ - for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++) - INIT_LIST_HEAD(&p->unflushed[i]); - for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++) - INIT_LIST_HEAD(&p->flushed[i]); - atomic_set(&p->count, count); - p->devs.nr = 0; -} - -/* - * Detect stuck journal conditions and trigger shutdown. Technically the journal - * can end up stuck for a variety of reasons, such as a blocked I/O, journal - * reservation lockup, etc. Since this is a fatal error with potentially - * unpredictable characteristics, we want to be fairly conservative before we - * decide to shut things down. - * - * Consider the journal stuck when it appears full with no ability to commit - * btree transactions, to discard journal buckets, nor acquire priority - * (reserved watermark) reservation. - */ -static inline bool -journal_error_check_stuck(struct journal *j, int error, unsigned flags) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - bool stuck = false; - struct printbuf buf = PRINTBUF; - - buf.atomic++; - - if (!(error == -BCH_ERR_journal_full || - error == -BCH_ERR_journal_pin_full) || - nr_unwritten_journal_entries(j) || - (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) - return stuck; - - spin_lock(&j->lock); - - if (j->can_discard) { - spin_unlock(&j->lock); - return stuck; - } - - stuck = true; - - /* - * The journal shutdown path will set ->err_seq, but do it here first to - * serialize against concurrent failures and avoid duplicate error - * reports. - */ - if (j->err_seq) { - spin_unlock(&j->lock); - return stuck; - } - j->err_seq = journal_cur_seq(j); - - __bch2_journal_debug_to_text(&buf, j); - spin_unlock(&j->lock); - prt_printf(&buf, bch2_fmt(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)"), - bch2_err_str(error)); - bch2_print_str(c, KERN_ERR, buf.buf); - - printbuf_reset(&buf); - bch2_journal_pins_to_text(&buf, j); - bch_err(c, "Journal pins:\n%s", buf.buf); - printbuf_exit(&buf); - - bch2_fatal_error(c); - dump_stack(); - - return stuck; -} - -void bch2_journal_do_writes(struct journal *j) -{ - for (u64 seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j); - seq++) { - unsigned idx = seq & JOURNAL_BUF_MASK; - struct journal_buf *w = j->buf + idx; - - if (w->write_started && !w->write_allocated) - break; - if (w->write_started) - continue; - - if (!journal_state_seq_count(j, j->reservations, seq)) { - j->seq_write_started = seq; - w->write_started = true; - closure_call(&w->io, bch2_journal_write, j->wq, NULL); - } - - break; - } -} - -/* - * Final processing when the last reference of a journal buffer has been - * dropped. Drop the pin list reference acquired at journal entry open and write - * the buffer, if requested. 
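(Before the buffer-put path below, a hedged restatement of journal_error_check_stuck() above, reduced to its predicate - the helper and its parameter list are illustrative only:)

/*
 * Illustration: the journal is only declared stuck when it is full,
 * nothing is in flight, reservations are already at reclaim priority,
 * and no journal buckets remain to discard.
 */
static bool example_journal_is_stuck(int error, unsigned watermark,
				     unsigned nr_unwritten, bool can_discard)
{
	if (error != -BCH_ERR_journal_full &&
	    error != -BCH_ERR_journal_pin_full)
		return false;

	/* Outstanding writes mean forward progress is still possible: */
	if (nr_unwritten)
		return false;

	if (watermark != BCH_WATERMARK_reclaim)
		return false;

	return !can_discard;
}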
- */ -void bch2_journal_buf_put_final(struct journal *j, u64 seq) -{ - lockdep_assert_held(&j->lock); - - if (__bch2_journal_pin_put(j, seq)) - bch2_journal_reclaim_fast(j); - bch2_journal_do_writes(j); - - /* - * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an - * open journal entry - */ - wake_up(&j->wait); -} - -/* - * Returns true if journal entry is now closed: - * - * We don't close a journal_buf until the next journal_buf is finished writing, - * and can be opened again - this also initializes the next journal_buf: - */ -static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf = journal_cur_buf(j); - union journal_res_state old, new; - unsigned sectors; - - BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && - closed_val != JOURNAL_ENTRY_ERROR_VAL); - - lockdep_assert_held(&j->lock); - - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - new.cur_entry_offset = closed_val; - - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || - old.cur_entry_offset == new.cur_entry_offset) - return; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); - - if (!__journal_entry_is_open(old)) - return; - - if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) - old.cur_entry_offset = j->cur_entry_offset_if_blocked; - - /* Close out old buffer: */ - buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - - if (trace_journal_entry_close_enabled() && trace) { - struct printbuf pbuf = PRINTBUF; - pbuf.atomic++; - - prt_str(&pbuf, "entry size: "); - prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data)); - prt_newline(&pbuf); - bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT); - trace_journal_entry_close(c, pbuf.buf); - printbuf_exit(&pbuf); - } - - sectors = vstruct_blocks_plus(buf->data, c->block_bits, - buf->u64s_reserved) << c->block_bits; - if (unlikely(sectors > buf->sectors)) { - struct printbuf err = PRINTBUF; - err.atomic++; - - prt_printf(&err, "journal entry overran reserved space: %u > %u\n", - sectors, buf->sectors); - prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n", - le32_to_cpu(buf->data->u64s), buf->u64s_reserved, - j->cur_entry_u64s, - c->block_bits); - prt_printf(&err, "fatal error - emergency read only"); - bch2_journal_halt_locked(j); - - bch_err(c, "%s", err.buf); - printbuf_exit(&err); - return; - } - - buf->sectors = sectors; - - /* - * We have to set last_seq here, _before_ opening a new journal entry: - * - * A threads may replace an old pin with a new pin on their current - * journal reservation - the expectation being that the journal will - * contain either what the old pin protected or what the new pin - * protects. - * - * After the old pin is dropped journal_last_seq() won't include the old - * pin, so we can only write the updated last_seq on the entry that - * contains whatever the new pin protects. - * - * Restated, we can _not_ update last_seq for a given entry if there - * could be a newer entry open with reservations/pins that have been - * taken against it. 
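(A concrete illustration of the hazard described above - an editorial example, not part of the original comment: with entry 11 open, a thread may swap its pin on entry 9 for a pin on entry 11; replay then needs either entry 9's contents or entry 11's, so entry 11 may advance last_seq past 9. But if entry 12 were already open and the swap targeted 12 instead, entry 11 must not advance last_seq past 9, since a crash after entry 11 reached disk but before entry 12 did would lose both copies. The comment's conclusion - "Hence" - follows from this.)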
- * - * Hence, we want update/set last_seq on the current journal entry right - * before we open a new one: - */ - buf->last_seq = journal_last_seq(j); - buf->data->last_seq = cpu_to_le64(buf->last_seq); - BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); - - cancel_delayed_work(&j->write_work); - - bch2_journal_space_available(j); - - __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); -} - -void bch2_journal_halt_locked(struct journal *j) -{ - lockdep_assert_held(&j->lock); - - __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); - if (!j->err_seq) - j->err_seq = journal_cur_seq(j); - journal_wake(j); -} - -void bch2_journal_halt(struct journal *j) -{ - spin_lock(&j->lock); - bch2_journal_halt_locked(j); - spin_unlock(&j->lock); -} - -static bool journal_entry_want_write(struct journal *j) -{ - bool ret = !journal_entry_is_open(j) || - journal_cur_seq(j) == journal_last_unwritten_seq(j); - - /* Don't close it yet if we already have a write in flight: */ - if (ret) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - else if (nr_unwritten_journal_entries(j)) { - struct journal_buf *buf = journal_cur_buf(j); - - if (!buf->flush_time) { - buf->flush_time = local_clock() ?: 1; - buf->expires = jiffies; - } - } - - return ret; -} - -bool bch2_journal_entry_close(struct journal *j) -{ - bool ret; - - spin_lock(&j->lock); - ret = journal_entry_want_write(j); - spin_unlock(&j->lock); - - return ret; -} - -/* - * should _only_ called from journal_res_get() - when we actually want a - * journal reservation - journal entry is open means journal is dirty: - */ -static int journal_entry_open(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf = j->buf + - ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); - union journal_res_state old, new; - int u64s; - - lockdep_assert_held(&j->lock); - BUG_ON(journal_entry_is_open(j)); - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - - if (j->blocked) - return bch_err_throw(c, journal_blocked); - - if (j->cur_entry_error) - return j->cur_entry_error; - - int ret = bch2_journal_error(j); - if (unlikely(ret)) - return ret; - - if (!fifo_free(&j->pin)) - return bch_err_throw(c, journal_pin_full); - - if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return bch_err_throw(c, journal_max_in_flight); - - if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) - return bch_err_throw(c, journal_max_open); - - if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) { - bch_err(c, "cannot start: journal seq overflow"); - if (bch2_fs_emergency_read_only_locked(c)) - bch_err(c, "fatal error - emergency read only"); - return bch_err_throw(c, journal_shutdown); - } - - if (!j->free_buf && !buf->data) - return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */ - - BUG_ON(!j->cur_entry_sectors); - - if (!buf->data) { - swap(buf->data, j->free_buf); - swap(buf->buf_size, j->free_buf_size); - } - - buf->expires = - (journal_cur_seq(j) == j->flushed_seq_ondisk - ? 
jiffies - : j->last_flush_write) + - msecs_to_jiffies(c->opts.journal_flush_delay); - - buf->u64s_reserved = j->entry_u64s_reserved; - buf->disk_sectors = j->cur_entry_sectors; - buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); - - u64s = (int) (buf->sectors << 9) / sizeof(u64) - - journal_entry_overhead(j); - u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); - - if (u64s <= (ssize_t) j->early_journal_entries.nr) - return bch_err_throw(c, journal_full); - - if (fifo_empty(&j->pin) && j->reclaim_thread) - wake_up_process(j->reclaim_thread); - - /* - * The fifo_push() needs to happen at the same time as j->seq is - * incremented for journal_last_seq() to be calculated correctly - */ - atomic64_inc(&j->seq); - journal_pin_list_init(fifo_push_ref(&j->pin), 1); - - if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) { - bch_err(c, "attempting to open blacklisted journal seq %llu", - journal_cur_seq(j)); - if (bch2_fs_emergency_read_only_locked(c)) - bch_err(c, "fatal error - emergency read only"); - return bch_err_throw(c, journal_shutdown); - } - - BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); - - BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); - - bkey_extent_init(&buf->key); - buf->noflush = false; - buf->must_flush = false; - buf->separate_flush = false; - buf->flush_time = 0; - buf->need_flush_to_write_buffer = true; - buf->write_started = false; - buf->write_allocated = false; - buf->write_done = false; - - memset(buf->data, 0, sizeof(*buf->data)); - buf->data->seq = cpu_to_le64(journal_cur_seq(j)); - buf->data->u64s = 0; - - if (j->early_journal_entries.nr) { - memcpy(buf->data->_data, j->early_journal_entries.data, - j->early_journal_entries.nr * sizeof(u64)); - le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr); - } - - /* - * Must be set before marking the journal entry as open: - */ - j->cur_entry_u64s = u64s; - - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - - BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); - - new.idx++; - BUG_ON(journal_state_count(new, new.idx)); - BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK)); - - journal_state_inc(&new); - - /* Handle any already added entries */ - new.cur_entry_offset = le32_to_cpu(buf->data->u64s); - } while (!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); - - if (nr_unwritten_journal_entries(j) == 1) - mod_delayed_work(j->wq, - &j->write_work, - msecs_to_jiffies(c->opts.journal_flush_delay)); - journal_wake(j); - - if (j->early_journal_entries.nr) - darray_exit(&j->early_journal_entries); - return 0; -} - -static bool journal_quiesced(struct journal *j) -{ - bool ret = atomic64_read(&j->seq) == j->seq_ondisk; - - if (!ret) - bch2_journal_entry_close(j); - return ret; -} - -static void journal_quiesce(struct journal *j) -{ - wait_event(j->wait, journal_quiesced(j)); -} - -static void journal_write_work(struct work_struct *work) -{ - struct journal *j = container_of(work, struct journal, write_work.work); - - spin_lock(&j->lock); - if (__journal_entry_is_open(j->reservations)) { - long delta = journal_cur_buf(j)->expires - jiffies; - - if (delta > 0) - mod_delayed_work(j->wq, &j->write_work, delta); - else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - } - spin_unlock(&j->lock); -} - -static void journal_buf_prealloc(struct journal *j) -{ - if (j->free_buf && - j->free_buf_size >= j->buf_size_want) - return; - - unsigned buf_size = j->buf_size_want; - - spin_unlock(&j->lock); - 
void *buf = kvmalloc(buf_size, GFP_NOFS); - spin_lock(&j->lock); - - if (buf && - (!j->free_buf || - buf_size > j->free_buf_size)) { - swap(buf, j->free_buf); - swap(buf_size, j->free_buf_size); - } - - if (unlikely(buf)) { - spin_unlock(&j->lock); - /* kvfree can sleep */ - kvfree(buf); - spin_lock(&j->lock); - } -} - -static int __journal_res_get(struct journal *j, struct journal_res *res, - unsigned flags) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf; - bool can_discard; - int ret; -retry: - if (journal_res_get_fast(j, res, flags)) - return 0; - - ret = bch2_journal_error(j); - if (unlikely(ret)) - return ret; - - if (j->blocked) - return bch_err_throw(c, journal_blocked); - - if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - ret = bch_err_throw(c, journal_full); - can_discard = j->can_discard; - goto out; - } - - if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { - ret = bch_err_throw(c, journal_max_in_flight); - goto out; - } - - spin_lock(&j->lock); - - journal_buf_prealloc(j); - - /* - * Recheck after taking the lock, so we don't race with another thread - * that just did journal_entry_open() and call bch2_journal_entry_close() - * unnecessarily - */ - if (journal_res_get_fast(j, res, flags)) { - ret = 0; - goto unlock; - } - - /* - * If we couldn't get a reservation because the current buf filled up, - * and we had room for a bigger entry on disk, signal that we want to - * realloc the journal bufs: - */ - buf = journal_cur_buf(j); - if (journal_entry_is_open(j) && - buf->buf_size >> 9 < buf->disk_sectors && - buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) - j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); - ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open; -unlock: - can_discard = j->can_discard; - spin_unlock(&j->lock); -out: - if (likely(!ret)) - return 0; - if (ret == -BCH_ERR_journal_retry_open) - goto retry; - - if (journal_error_check_stuck(j, ret, flags)) - ret = bch_err_throw(c, journal_stuck); - - if (ret == -BCH_ERR_journal_max_in_flight && - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && - trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_printbuf_make_room(&buf, 4096); - - spin_lock(&j->lock); - prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); - bch2_journal_bufs_to_text(&buf, j); - spin_unlock(&j->lock); - - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - count_event(c, journal_entry_full); - } - - if (ret == -BCH_ERR_journal_max_open && - track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && - trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_printbuf_make_room(&buf, 4096); - - spin_lock(&j->lock); - prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); - bch2_journal_bufs_to_text(&buf, j); - spin_unlock(&j->lock); - - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - count_event(c, journal_entry_full); - } - - /* - * Journal is full - can't rely on reclaim from work item due to - * freezing: - */ - if ((ret == -BCH_ERR_journal_full || - ret == -BCH_ERR_journal_pin_full) && - !(flags & JOURNAL_RES_GET_NONBLOCK)) { - if (can_discard) { - bch2_journal_do_discards(j); - goto retry; - } - - if (mutex_trylock(&j->reclaim_lock)) { - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); - } - } - - return ret; -} - -static unsigned max_dev_latency(struct bch_fs *c) 
-{ - u64 nsecs = 0; - - guard(rcu)(); - for_each_rw_member_rcu(c, ca) - nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); - - return nsecs_to_jiffies(nsecs); -} - -/* - * Essentially the entry function to the journaling code. When bcachefs is doing - * a btree insert, it calls this function to get the current journal write. - * Journal write is the structure used set up journal writes. The calling - * function will then add its keys to the structure, queuing them for the next - * write. - * - * To ensure forward progress, the current task must not be holding any - * btree node write locks. - */ -int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned flags, - struct btree_trans *trans) -{ - int ret; - - if (closure_wait_event_timeout(&j->async_wait, - !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || - (flags & JOURNAL_RES_GET_NONBLOCK), - HZ)) - return ret; - - if (trans) - bch2_trans_unlock_long(trans); - - struct bch_fs *c = container_of(j, struct bch_fs, journal); - int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10); - - remaining_wait = max(0, remaining_wait - HZ); - - if (closure_wait_event_timeout(&j->async_wait, - !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || - (flags & JOURNAL_RES_GET_NONBLOCK), - remaining_wait)) - return ret; - - struct printbuf buf = PRINTBUF; - bch2_journal_debug_to_text(&buf, j); - bch2_print_str(c, KERN_ERR, buf.buf); - prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret)); - printbuf_exit(&buf); - - closure_wait_event(&j->async_wait, - !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || - (flags & JOURNAL_RES_GET_NONBLOCK)); - return ret; -} - -/* journal_entry_res: */ - -void bch2_journal_entry_res_resize(struct journal *j, - struct journal_entry_res *res, - unsigned new_u64s) -{ - union journal_res_state state; - int d = new_u64s - res->u64s; - - spin_lock(&j->lock); - - j->entry_u64s_reserved += d; - if (d <= 0) - goto out; - - j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); - state = READ_ONCE(j->reservations); - - if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && - state.cur_entry_offset > j->cur_entry_u64s) { - j->cur_entry_u64s += d; - /* - * Not enough room in current journal entry, have to flush it: - */ - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - } else { - journal_cur_buf(j)->u64s_reserved += d; - } -out: - spin_unlock(&j->lock); - res->u64s += d; -} - -/* journal flushing: */ - -/** - * bch2_journal_flush_seq_async - wait for a journal entry to be written - * @j: journal object - * @seq: seq to flush - * @parent: closure object to wait with - * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, - * -BCH_ERR_journal_flush_err if @seq will never be flushed - * - * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if - * necessary - */ -int bch2_journal_flush_seq_async(struct journal *j, u64 seq, - struct closure *parent) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf; - int ret = 0; - - if (seq <= j->flushed_seq_ondisk) - return 1; - - spin_lock(&j->lock); - - if (WARN_ONCE(seq > journal_cur_seq(j), - "requested to flush journal seq %llu, but currently at %llu", - seq, journal_cur_seq(j))) - goto out; - - /* Recheck under lock: */ - if (j->err_seq && seq >= j->err_seq) { - ret = bch_err_throw(c, journal_flush_err); - 
goto out; - } - - if (seq <= j->flushed_seq_ondisk) { - ret = 1; - goto out; - } - - /* if seq was written, but not flushed - flush a newer one instead */ - seq = max(seq, journal_last_unwritten_seq(j)); - -recheck_need_open: - if (seq > journal_cur_seq(j)) { - struct journal_res res = { 0 }; - - if (journal_entry_is_open(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - - spin_unlock(&j->lock); - - /* - * We're called from bch2_journal_flush_seq() -> wait_event(); - * but this might block. We won't usually block, so we won't - * livelock: - */ - sched_annotate_sleep(); - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); - if (ret) - return ret; - - seq = res.seq; - buf = journal_seq_to_buf(j, seq); - buf->must_flush = true; - - if (!buf->flush_time) { - buf->flush_time = local_clock() ?: 1; - buf->expires = jiffies; - } - - if (parent && !closure_wait(&buf->wait, parent)) - BUG(); - - bch2_journal_res_put(j, &res); - - spin_lock(&j->lock); - goto want_write; - } - - /* - * if write was kicked off without a flush, or if we promised it - * wouldn't be a flush, flush the next sequence number instead - */ - buf = journal_seq_to_buf(j, seq); - if (buf->noflush) { - seq++; - goto recheck_need_open; - } - - buf->must_flush = true; - j->flushing_seq = max(j->flushing_seq, seq); - - if (parent && !closure_wait(&buf->wait, parent)) - BUG(); -want_write: - if (seq == journal_cur_seq(j)) - journal_entry_want_write(j); -out: - spin_unlock(&j->lock); - return ret; -} - -int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state) -{ - u64 start_time = local_clock(); - int ret, ret2; - - /* - * Don't update time_stats when @seq is already flushed: - */ - if (seq <= j->flushed_seq_ondisk) - return 0; - - ret = wait_event_state(j->wait, - (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)), - task_state); - - if (!ret) - bch2_time_stats_update(j->flush_seq_time, start_time); - - return ret ?: ret2 < 0 ? 
ret2 : 0; -} - -/* - * bch2_journal_flush_async - if there is an open journal entry, or a journal - * still being written, write it and wait for the write to complete - */ -void bch2_journal_flush_async(struct journal *j, struct closure *parent) -{ - bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); -} - -int bch2_journal_flush(struct journal *j) -{ - return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE); -} - -/* - * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the - * range [start, end) - * @seq - */ -bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - u64 unwritten_seq; - bool ret = false; - - if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) - return false; - - if (c->journal.flushed_seq_ondisk >= start) - return false; - - spin_lock(&j->lock); - if (c->journal.flushed_seq_ondisk >= start) - goto out; - - for (unwritten_seq = journal_last_unwritten_seq(j); - unwritten_seq < end; - unwritten_seq++) { - struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); - - /* journal flush already in flight, or flush requseted */ - if (buf->must_flush) - goto out; - - buf->noflush = true; - } - - ret = true; -out: - spin_unlock(&j->lock); - return ret; -} - -static int __bch2_journal_meta(struct journal *j) -{ - struct journal_res res = {}; - int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); - if (ret) - return ret; - - struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK); - buf->must_flush = true; - - if (!buf->flush_time) { - buf->flush_time = local_clock() ?: 1; - buf->expires = jiffies; - } - - bch2_journal_res_put(j, &res); - - return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE); -} - -int bch2_journal_meta(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal)) - return bch_err_throw(c, erofs_no_writes); - - int ret = __bch2_journal_meta(j); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal); - return ret; -} - -/* block/unlock the journal: */ - -void bch2_journal_unblock(struct journal *j) -{ - spin_lock(&j->lock); - if (!--j->blocked && - j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && - j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { - union journal_res_state old, new; - - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - new.cur_entry_offset = j->cur_entry_offset_if_blocked; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); - } - spin_unlock(&j->lock); - - journal_wake(j); -} - -static void __bch2_journal_block(struct journal *j) -{ - if (!j->blocked++) { - union journal_res_state old, new; - - old.v = atomic64_read(&j->reservations.counter); - do { - j->cur_entry_offset_if_blocked = old.cur_entry_offset; - - if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL) - break; - - new.v = old.v; - new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); - - if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL) - journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); - } -} - -void bch2_journal_block(struct journal *j) -{ - spin_lock(&j->lock); - __bch2_journal_block(j); - spin_unlock(&j->lock); - - journal_quiesce(j); -} - -static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, - u64 
max_seq, bool *blocked) -{ - struct journal_buf *ret = NULL; - - /* We're inside wait_event(), but using mutex_lock(: */ - sched_annotate_sleep(); - mutex_lock(&j->buf_lock); - spin_lock(&j->lock); - max_seq = min(max_seq, journal_cur_seq(j)); - - for (u64 seq = journal_last_unwritten_seq(j); - seq <= max_seq; - seq++) { - unsigned idx = seq & JOURNAL_BUF_MASK; - struct journal_buf *buf = j->buf + idx; - - if (buf->need_flush_to_write_buffer) { - union journal_res_state s; - s.v = atomic64_read_acquire(&j->reservations.counter); - - unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s); - - if (open && !*blocked) { - __bch2_journal_block(j); - s.v = atomic64_read_acquire(&j->reservations.counter); - *blocked = true; - } - - ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open - ? ERR_PTR(-EAGAIN) - : buf; - break; - } - } - - spin_unlock(&j->lock); - if (IS_ERR_OR_NULL(ret)) - mutex_unlock(&j->buf_lock); - return ret; -} - -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, - u64 max_seq, bool *blocked) -{ - struct journal_buf *ret; - *blocked = false; - - wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, - max_seq, blocked)) != ERR_PTR(-EAGAIN)); - if (IS_ERR_OR_NULL(ret) && *blocked) - bch2_journal_unblock(j); - - return ret; -} - -/* allocate journal on a device: */ - -static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, - bool new_fs, struct closure *cl) -{ - struct bch_fs *c = ca->fs; - struct journal_device *ja = &ca->journal; - u64 *new_bucket_seq = NULL, *new_buckets = NULL; - struct open_bucket **ob = NULL; - long *bu = NULL; - unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr; - int ret = 0; - - BUG_ON(nr <= ja->nr); - - bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); - ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); - new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); - new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); - if (!bu || !ob || !new_buckets || !new_bucket_seq) { - ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); - goto err_free; - } - - for (nr_got = 0; nr_got < nr_want; nr_got++) { - enum bch_watermark watermark = new_fs - ? 
BCH_WATERMARK_btree - : BCH_WATERMARK_normal; - - ob[nr_got] = bch2_bucket_alloc(c, ca, watermark, - BCH_DATA_journal, cl); - ret = PTR_ERR_OR_ZERO(ob[nr_got]); - if (ret) - break; - - if (!new_fs) { - ret = bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(trans, ca, - ob[nr_got]->bucket, BCH_DATA_journal, - ca->mi.bucket_size, BTREE_TRIGGER_transactional)); - if (ret) { - bch2_open_bucket_put(c, ob[nr_got]); - bch_err_msg(c, ret, "marking new journal buckets"); - break; - } - } - - bu[nr_got] = ob[nr_got]->bucket; - } - - if (!nr_got) - goto err_free; - - /* Don't return an error if we successfully allocated some buckets: */ - ret = 0; - - if (c) { - bch2_journal_flush_all_pins(&c->journal); - bch2_journal_block(&c->journal); - mutex_lock(&c->sb_lock); - } - - memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); - memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); - - BUG_ON(ja->discard_idx > ja->nr); - - pos = ja->discard_idx ?: ja->nr; - - memmove(new_buckets + pos + nr_got, - new_buckets + pos, - sizeof(new_buckets[0]) * (ja->nr - pos)); - memmove(new_bucket_seq + pos + nr_got, - new_bucket_seq + pos, - sizeof(new_bucket_seq[0]) * (ja->nr - pos)); - - for (i = 0; i < nr_got; i++) { - new_buckets[pos + i] = bu[i]; - new_bucket_seq[pos + i] = 0; - } - - nr = ja->nr + nr_got; - - ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr); - if (ret) - goto err_unblock; - - bch2_write_super(c); - - /* Commit: */ - if (c) - spin_lock(&c->journal.lock); - - swap(new_buckets, ja->buckets); - swap(new_bucket_seq, ja->bucket_seq); - ja->nr = nr; - - if (pos <= ja->discard_idx) - ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr; - if (pos <= ja->dirty_idx_ondisk) - ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr; - if (pos <= ja->dirty_idx) - ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr; - if (pos <= ja->cur_idx) - ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr; - - if (c) - spin_unlock(&c->journal.lock); -err_unblock: - if (c) { - bch2_journal_unblock(&c->journal); - mutex_unlock(&c->sb_lock); - } - - if (ret && !new_fs) - for (i = 0; i < nr_got; i++) - bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(trans, ca, - bu[i], BCH_DATA_free, 0, - BTREE_TRIGGER_transactional)); -err_free: - for (i = 0; i < nr_got; i++) - bch2_open_bucket_put(c, ob[i]); - - kfree(new_bucket_seq); - kfree(new_buckets); - kfree(ob); - kfree(bu); - return ret; -} - -static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca, - unsigned nr, bool new_fs) -{ - struct journal_device *ja = &ca->journal; - int ret = 0; - - struct closure cl; - closure_init_stack(&cl); - - /* don't handle reducing nr of buckets yet: */ - if (nr < ja->nr) - return 0; - - while (!ret && ja->nr < nr) { - struct disk_reservation disk_res = { 0, 0, 0 }; - - /* - * note: journal buckets aren't really counted as _sectors_ used yet, so - * we don't need the disk reservation to avoid the BUG_ON() in buckets.c - * when space used goes up without a reservation - but we do need the - * reservation to ensure we'll actually be able to allocate: - * - * XXX: that's not right, disk reservations only ensure a - * filesystem-wide allocation will succeed, this is a device - * specific allocation - we can hang here: - */ - if (!new_fs) { - ret = bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0); - if (ret) - break; - } - - ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl); - - if (ret == -BCH_ERR_bucket_alloc_blocked || - ret == -BCH_ERR_open_buckets_empty) - 
ret = 0; /* wait and retry */ - - bch2_disk_reservation_put(c, &disk_res); - bch2_wait_on_allocator(c, &cl); - } - - return ret; -} - -/* - * Allocate more journal space at runtime - not currently making use if it, but - * the code works: - */ -int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, - unsigned nr) -{ - down_write(&c->state_lock); - int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false); - up_write(&c->state_lock); - - bch_err_fn(c, ret); - return ret; -} - -int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b) -{ - struct bch_fs *c = ca->fs; - struct journal *j = &c->journal; - struct journal_device *ja = &ca->journal; - - guard(mutex)(&c->sb_lock); - unsigned pos; - for (pos = 0; pos < ja->nr; pos++) - if (ja->buckets[pos] == b) - break; - - if (pos == ja->nr) { - bch_err(ca, "journal bucket %llu not found when deleting", b); - return -EINVAL; - } - - u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);; - if (!new_buckets) - return bch_err_throw(c, ENOMEM_set_nr_journal_buckets); - - memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); - memmove(&new_buckets[pos], - &new_buckets[pos + 1], - (ja->nr - 1 - pos) * sizeof(new_buckets[0])); - - int ret = bch2_journal_buckets_to_sb(c, ca, ja->buckets, ja->nr - 1) ?: - bch2_write_super(c); - if (ret) { - kfree(new_buckets); - return ret; - } - - scoped_guard(spinlock, &j->lock) { - if (pos < ja->discard_idx) - --ja->discard_idx; - if (pos < ja->dirty_idx_ondisk) - --ja->dirty_idx_ondisk; - if (pos < ja->dirty_idx) - --ja->dirty_idx; - if (pos < ja->cur_idx) - --ja->cur_idx; - - ja->nr--; - - memmove(&ja->buckets[pos], - &ja->buckets[pos + 1], - (ja->nr - pos) * sizeof(ja->buckets[0])); - - memmove(&ja->bucket_seq[pos], - &ja->bucket_seq[pos + 1], - (ja->nr - pos) * sizeof(ja->bucket_seq[0])); - - bch2_journal_space_available(j); - } - - kfree(new_buckets); - return 0; -} - -int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) -{ - struct bch_fs *c = ca->fs; - - if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal))) - return 0; - - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { - bch_err(c, "cannot allocate journal, filesystem is an unresized image file"); - return bch_err_throw(c, erofs_filesystem_full); - } - - unsigned nr; - int ret; - - if (dynamic_fault("bcachefs:add:journal_alloc")) { - ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); - goto err; - } - - /* 1/128th of the device by default: */ - nr = ca->mi.nbuckets >> 7; - - /* - * clamp journal size to 8192 buckets or 8GB (in sectors), whichever - * is smaller: - */ - nr = clamp_t(unsigned, nr, - BCH_JOURNAL_BUCKETS_MIN, - min(1 << 13, - (1 << 24) / ca->mi.bucket_size)); - - ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs); -err: - bch_err_fn(ca, ret); - return ret; -} - -int bch2_fs_journal_alloc(struct bch_fs *c) -{ - for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) { - if (ca->journal.nr) - continue; - - int ret = bch2_dev_journal_alloc(ca, true); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_fs_journal_alloc); - return ret; - } - } - - return 0; -} - -/* startup/shutdown: */ - -static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) -{ - bool ret = false; - u64 seq; - - spin_lock(&j->lock); - for (seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j) && !ret; - seq++) { - struct journal_buf *buf = journal_seq_to_buf(j, seq); - - if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx)) - ret = true; - } - 
spin_unlock(&j->lock); - - return ret; -} - -void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) -{ - wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); -} - -void bch2_fs_journal_stop(struct journal *j) -{ - if (!test_bit(JOURNAL_running, &j->flags)) - return; - - bch2_journal_reclaim_stop(j); - bch2_journal_flush_all_pins(j); - - wait_event(j->wait, bch2_journal_entry_close(j)); - - /* - * Always write a new journal entry, to make sure the clock hands are up - * to date (and match the superblock) - */ - __bch2_journal_meta(j); - - journal_quiesce(j); - cancel_delayed_work_sync(&j->write_work); - - WARN(!bch2_journal_error(j) && - test_bit(JOURNAL_replay_done, &j->flags) && - j->last_empty_seq != journal_cur_seq(j), - "journal shutdown error: cur seq %llu but last empty seq %llu", - journal_cur_seq(j), j->last_empty_seq); - - if (!bch2_journal_error(j)) - clear_bit(JOURNAL_running, &j->flags); -} - -int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_entry_pin_list *p; - struct journal_replay *i, **_i; - struct genradix_iter iter; - bool had_entries = false; - - /* - * - * XXX pick most recent non blacklisted sequence number - */ - - cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c)); - - if (cur_seq >= JOURNAL_SEQ_MAX) { - bch_err(c, "cannot start: journal seq overflow"); - return -EINVAL; - } - - /* Clean filesystem? */ - if (!last_seq) - last_seq = cur_seq; - - u64 nr = cur_seq - last_seq; - - /* - * Extra fudge factor, in case we crashed when the journal pin fifo was - * nearly or completely full. We'll need to be able to open additional - * journal entries (at least a few) in order for journal replay to get - * going: - */ - nr += nr / 4; - - nr = max(nr, JOURNAL_PIN); - init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return bch_err_throw(c, ENOMEM_journal_pin_fifo); - } - - j->replay_journal_seq = last_seq; - j->replay_journal_seq_end = cur_seq; - j->last_seq_ondisk = last_seq; - j->flushed_seq_ondisk = cur_seq - 1; - j->seq_write_started = cur_seq - 1; - j->seq_ondisk = cur_seq - 1; - j->pin.front = last_seq; - j->pin.back = cur_seq; - atomic64_set(&j->seq, cur_seq - 1); - - u64 seq; - fifo_for_each_entry_ptr(p, &j->pin, seq) - journal_pin_list_init(p, 1); - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - seq = le64_to_cpu(i->j.seq); - BUG_ON(seq >= cur_seq); - - if (seq < last_seq) - continue; - - if (journal_entry_empty(&i->j)) - j->last_empty_seq = le64_to_cpu(i->j.seq); - - p = journal_seq_pin(j, seq); - - p->devs.nr = 0; - darray_for_each(i->ptrs, ptr) - bch2_dev_list_add_dev(&p->devs, ptr->dev); - - had_entries = true; - } - - if (!had_entries) - j->last_empty_seq = cur_seq - 1; /* to match j->seq */ - - spin_lock(&j->lock); - j->last_flush_write = jiffies; - - j->reservations.idx = journal_cur_seq(j); - - c->last_bucket_seq_cleanup = journal_cur_seq(j); - spin_unlock(&j->lock); - - return 0; -} - -void bch2_journal_set_replay_done(struct journal *j) -{ - /* - * journal_space_available must happen before setting JOURNAL_running - * JOURNAL_running must happen before JOURNAL_replay_done - */ - spin_lock(&j->lock); - bch2_journal_space_available(j); - - set_bit(JOURNAL_need_flush_write, &j->flags); - set_bit(JOURNAL_running, &j->flags); - set_bit(JOURNAL_replay_done, 
&j->flags); - spin_unlock(&j->lock); -} - -/* init/exit: */ - -void bch2_dev_journal_exit(struct bch_dev *ca) -{ - struct journal_device *ja = &ca->journal; - - for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - kfree(ja->bio[i]); - ja->bio[i] = NULL; - } - - kfree(ja->buckets); - kfree(ja->bucket_seq); - ja->buckets = NULL; - ja->bucket_seq = NULL; -} - -int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) -{ - struct bch_fs *c = ca->fs; - struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets = - bch2_sb_field_get(sb, journal); - struct bch_sb_field_journal_v2 *journal_buckets_v2 = - bch2_sb_field_get(sb, journal_v2); - - ja->nr = 0; - - if (journal_buckets_v2) { - unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - - for (unsigned i = 0; i < nr; i++) - ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); - } else if (journal_buckets) { - ja->nr = bch2_nr_journal_buckets(journal_buckets); - } - - ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); - if (!ja->bucket_seq) - return bch_err_throw(c, ENOMEM_dev_journal_init); - - unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); - - for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, - nr_bvecs), GFP_KERNEL); - if (!ja->bio[i]) - return bch_err_throw(c, ENOMEM_dev_journal_init); - - ja->bio[i]->ca = ca; - ja->bio[i]->buf_idx = i; - bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); - } - - ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); - if (!ja->buckets) - return bch_err_throw(c, ENOMEM_dev_journal_init); - - if (journal_buckets_v2) { - unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - unsigned dst = 0; - - for (unsigned i = 0; i < nr; i++) - for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) - ja->buckets[dst++] = - le64_to_cpu(journal_buckets_v2->d[i].start) + j; - } else if (journal_buckets) { - for (unsigned i = 0; i < ja->nr; i++) - ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); - } - - return 0; -} - -void bch2_fs_journal_exit(struct journal *j) -{ - if (j->wq) - destroy_workqueue(j->wq); - - darray_exit(&j->early_journal_entries); - - for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) - kvfree(j->buf[i].data); - kvfree(j->free_buf); - free_fifo(&j->pin); -} - -void bch2_fs_journal_init_early(struct journal *j) -{ - static struct lock_class_key res_key; - - mutex_init(&j->buf_lock); - spin_lock_init(&j->lock); - spin_lock_init(&j->err_lock); - init_waitqueue_head(&j->wait); - INIT_DELAYED_WORK(&j->write_work, journal_write_work); - init_waitqueue_head(&j->reclaim_wait); - init_waitqueue_head(&j->pin_flush_wait); - mutex_init(&j->reclaim_lock); - mutex_init(&j->discard_lock); - - lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - - atomic64_set(&j->reservations.counter, - ((union journal_res_state) - { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); -} - -int bch2_fs_journal_init(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; - j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); - if (!j->free_buf) - return bch_err_throw(c, ENOMEM_journal_buf); - - for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) - j->buf[i].idx = i; - - j->wq = alloc_workqueue("bcachefs_journal", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); - if (!j->wq) - return bch_err_throw(c, 
ENOMEM_fs_other_alloc); - return 0; -} - -/* debug: */ - -static const char * const bch2_journal_flags_strs[] = { -#define x(n) #n, - JOURNAL_FLAGS() -#undef x - NULL -}; - -void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - union journal_res_state s; - unsigned long now = jiffies; - u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 28); - out->atomic++; - - guard(rcu)(); - s = READ_ONCE(j->reservations); - - prt_printf(out, "flags:\t"); - prt_bitflags(out, bch2_journal_flags_strs, j->flags); - prt_newline(out); - prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); - prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j)); - prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk); - prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j)); - prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); - prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]); - prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); - prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); - prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); - prt_printf(out, "average write size:\t"); - prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0); - prt_newline(out); - prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0); - prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); - prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); - prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); - prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) - ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); - prt_printf(out, "blocked:\t%u\n", j->blocked); - prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error)); - prt_printf(out, "current entry:\t"); - - switch (s.cur_entry_offset) { - case JOURNAL_ENTRY_ERROR_VAL: - prt_printf(out, "error\n"); - break; - case JOURNAL_ENTRY_CLOSED_VAL: - prt_printf(out, "closed\n"); - break; - case JOURNAL_ENTRY_BLOCKED_VAL: - prt_printf(out, "blocked\n"); - break; - default: - prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); - break; - } - - prt_printf(out, "unwritten entries:\n"); - bch2_journal_bufs_to_text(out, j); - - prt_printf(out, "space:\n"); - printbuf_indent_add(out, 2); - prt_printf(out, "discarded\t%u:%u\n", - j->space[journal_space_discarded].next_entry, - j->space[journal_space_discarded].total); - prt_printf(out, "clean ondisk\t%u:%u\n", - j->space[journal_space_clean_ondisk].next_entry, - j->space[journal_space_clean_ondisk].total); - prt_printf(out, "clean\t%u:%u\n", - j->space[journal_space_clean].next_entry, - j->space[journal_space_clean].total); - prt_printf(out, "total\t%u:%u\n", - j->space[journal_space_total].next_entry, - j->space[journal_space_total].total); - printbuf_indent_sub(out, 2); - - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - if (!ca->mi.durability) - continue; - - struct journal_device *ja = &ca->journal; - - if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) - continue; - - if (!ja->nr) - continue; - - prt_printf(out, "dev %u:\n", ca->dev_idx); - prt_printf(out, "durability %u:\n", ca->mi.durability); - printbuf_indent_add(out, 2); - prt_printf(out, "nr\t%u\n", ja->nr); - prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); - prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); - prt_printf(out, "discard_idx\t%u\n", ja->discard_idx); - prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); - prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); - prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); - printbuf_indent_sub(out, 2); - } - - prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); - - --out->atomic; -} - -void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) -{ - spin_lock(&j->lock); - __bch2_journal_debug_to_text(out, j); - spin_unlock(&j->lock); -} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h deleted file mode 100644 index 977907038d98..000000000000 --- a/fs/bcachefs/journal.h +++ /dev/null @@ -1,465 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_H -#define _BCACHEFS_JOURNAL_H - -/* - * THE JOURNAL: - * - * The primary purpose of the journal is to log updates (insertions) to the - * b-tree, to avoid having to do synchronous updates to the b-tree on disk. - * - * Without the journal, the b-tree is always internally consistent on - * disk - and in fact, in the earliest incarnations bcache didn't have a journal - * but did handle unclean shutdowns by doing all index updates synchronously - * (with coalescing). 
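- * - * (Editorial sketch, not bcachefs code: conceptually an update logs the key - * first and only touches the b-tree in memory, and crash recovery simply - * re-applies the logged keys in sequence order: - * - *	journal_append(j, k);		// the one synchronous disk write - *	btree_insert_in_mem(t, k);	// written back to disk lazily - * - *	for_each_logged_key(j, k)	// replay, in seq order - *		btree_insert_in_mem(t, k); - * - * journal_append(), btree_insert_in_mem() and for_each_logged_key() are - * hypothetical names used for illustration only.)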
- * - * Updates to interior nodes still happen synchronously and without the journal - * (for simplicity) - this may change eventually but updates to interior nodes - * are rare enough it's not a huge priority. - * - * This means the journal is relatively separate from the b-tree; it consists of - * just a list of keys and journal replay consists of just redoing those - * insertions in the same order that they appear in the journal. - * - * PERSISTENCE: - * - * For synchronous updates (where we're waiting on the index update to hit - * disk), the journal entry will be written out immediately (or as soon as - * possible, if the write for the previous journal entry was still in flight). - * - * Synchronous updates are specified by passing a closure (@flush_cl) to - * bch2_btree_insert() or bch_btree_insert_node(), which then passes that parameter - * down to the journalling code. That closure will wait on the journal write to - * complete (via closure_wait()). - * - * If the index update wasn't synchronous, the journal entry will be - * written out after 10 ms have elapsed, by default (the delay_ms field - * in struct journal). - * - * JOURNAL ENTRIES: - * - * A journal entry is variable size (struct jset); it's got a fixed-length - * header and then a variable number of struct jset_entry entries. - * - * Journal entries are identified by monotonically increasing 64 bit sequence - * numbers - jset->seq; other places in the code refer to this sequence number. - * - * A jset_entry entry contains one or more bkeys (which is what gets inserted - * into the b-tree). We need a container to indicate which b-tree the key is - * for; also, the roots of the various b-trees are stored in jset_entry entries - * (one for each b-tree) - this lets us add new b-tree types without changing - * the on disk format. - * - * We also keep some things in the journal header that are logically part of the - * superblock - all the things that are frequently updated. This is for future - * bcache on raw flash support; the superblock (which will become another - * journal) can't be moved or wear leveled, so it contains just enough - * information to find the main journal, and the superblock only has to be - * rewritten when we want to move/wear level the main journal. - * - * JOURNAL LAYOUT ON DISK: - * - * The journal is written to a ringbuffer of buckets (which is kept in the - * superblock); the individual buckets are not necessarily contiguous on disk - * which means that journal entries are not allowed to span buckets, but also - * that we can resize the journal at runtime if desired (unimplemented). - * - * The journal buckets exist in the same pool as all the other buckets that are - * managed by the allocator and garbage collection - garbage collection marks - * the journal buckets as metadata buckets. - * - * OPEN/DIRTY JOURNAL ENTRIES: - * - * Open/dirty journal entries are journal entries that contain b-tree updates - * that have not yet been written out to the b-tree on disk. We have to track - * which journal entries are dirty, and we also have to avoid wrapping around - * the journal and overwriting old but still dirty journal entries with new - * journal entries. - * - * On disk, this is represented with the "last_seq" field of struct jset; - * last_seq is the first sequence number that journal replay has to replay. - * - * To avoid overwriting dirty journal entries on disk, we keep a mapping (in - * journal_device->seq) of, for each journal bucket, the highest sequence number - * any journal entry it contains. 
Then, by comparing that against last_seq we - * can determine whether that journal bucket contains dirty journal entries or - * not. - * - * To track which journal entries are dirty, we maintain a fifo of refcounts - * (where each entry corresponds to a specific sequence number) - when a ref - * goes to 0, that journal entry is no longer dirty. - * - * Journalling of index updates is done at the same time as the b-tree itself is - * being modified (see btree_insert_key()); when we add the key to the journal - * the pending b-tree write takes a ref on the journal entry the key was added - * to. If a pending b-tree write would need to take refs on multiple dirty - * journal entries, it only keeps the ref on the oldest one (since a newer - * journal entry will still be replayed if an older entry was dirty). - * - * JOURNAL FILLING UP: - * - * There are two ways the journal could fill up; either we could run out of - * space to write to, or we could have too many open journal entries and run out - * of room in the fifo of refcounts. Since those refcounts are decremented - * without any locking we can't safely resize that fifo, so we handle it the - * same way. - * - * If the journal fills up, we start flushing dirty btree nodes until we can - * allocate space for a journal write again - preferentially flushing btree - * nodes that are pinning the oldest journal entries first. - */ - -#include <linux/hash.h> - -#include "journal_types.h" - -struct bch_fs; - -static inline void journal_wake(struct journal *j) -{ - wake_up(&j->wait); - closure_wake_up(&j->async_wait); -} - -/* Sequence number of oldest dirty journal entry */ - -static inline u64 journal_last_seq(struct journal *j) -{ - return j->pin.front; -} - -static inline u64 journal_cur_seq(struct journal *j) -{ - return atomic64_read(&j->seq); -} - -static inline u64 journal_last_unwritten_seq(struct journal *j) -{ - return j->seq_ondisk + 1; -} - -static inline struct journal_buf *journal_cur_buf(struct journal *j) -{ - unsigned idx = (journal_cur_seq(j) & - JOURNAL_BUF_MASK & - ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx; - - return j->buf + idx; -} - -static inline int journal_state_count(union journal_res_state s, int idx) -{ - switch (idx) { - case 0: return s.buf0_count; - case 1: return s.buf1_count; - case 2: return s.buf2_count; - case 3: return s.buf3_count; - } - BUG(); -} - -static inline int journal_state_seq_count(struct journal *j, - union journal_res_state s, u64 seq) -{ - if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR) - return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK); - else - return 0; -} - -static inline void journal_state_inc(union journal_res_state *s) -{ - s->buf0_count += s->idx == 0; - s->buf1_count += s->idx == 1; - s->buf2_count += s->idx == 2; - s->buf3_count += s->idx == 3; -} - -/* - * Amount of space that will be taken up by some keys in the journal (i.e. 
- * including the jset header) - */ -static inline unsigned jset_u64s(unsigned u64s) -{ - return u64s + sizeof(struct jset_entry) / sizeof(u64); -} - -static inline int journal_entry_overhead(struct journal *j) -{ - return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; -} - -static inline struct jset_entry * -bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) -{ - struct jset *jset = buf->data; - struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); - - memset(entry, 0, sizeof(*entry)); - entry->u64s = cpu_to_le16(u64s); - - le32_add_cpu(&jset->u64s, jset_u64s(u64s)); - - return entry; -} - -static inline struct jset_entry * -journal_res_entry(struct journal *j, struct journal_res *res) -{ - return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset); -} - -static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, - enum btree_id id, unsigned level, - unsigned u64s) -{ - entry->u64s = cpu_to_le16(u64s); - entry->btree_id = id; - entry->level = level; - entry->type = type; - entry->pad[0] = 0; - entry->pad[1] = 0; - entry->pad[2] = 0; - return jset_u64s(u64s); -} - -static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, - enum btree_id id, unsigned level, - const void *data, unsigned u64s) -{ - unsigned ret = journal_entry_init(entry, type, id, level, u64s); - - memcpy_u64s_small(entry->_data, data, u64s); - return ret; -} - -static inline struct jset_entry * -bch2_journal_add_entry(struct journal *j, struct journal_res *res, - unsigned type, enum btree_id id, - unsigned level, unsigned u64s) -{ - struct jset_entry *entry = journal_res_entry(j, res); - unsigned actual = journal_entry_init(entry, type, id, level, u64s); - - EBUG_ON(!res->ref); - EBUG_ON(actual > res->u64s); - - res->offset += actual; - res->u64s -= actual; - return entry; -} - -static inline bool journal_entry_empty(struct jset *j) -{ - if (j->seq != j->last_seq) - return false; - - vstruct_for_each(j, i) - if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) - return false; - return true; -} - -/* - * Drop reference on a buffer index and return true if the count has hit zero. - */ -static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx) -{ - union journal_res_state s; - - s.v = atomic64_sub_return(((union journal_res_state) { - .buf0_count = idx == 0, - .buf1_count = idx == 1, - .buf2_count = idx == 2, - .buf3_count = idx == 3, - }).v, &j->reservations.counter); - return s; -} - -bool bch2_journal_entry_close(struct journal *); -void bch2_journal_do_writes(struct journal *); -void bch2_journal_buf_put_final(struct journal *, u64); - -static inline void __bch2_journal_buf_put(struct journal *j, u64 seq) -{ - unsigned idx = seq & JOURNAL_STATE_BUF_MASK; - union journal_res_state s; - - s = journal_state_buf_put(j, idx); - if (!journal_state_count(s, idx)) - bch2_journal_buf_put_final(j, seq); -} - -static inline void bch2_journal_buf_put(struct journal *j, u64 seq) -{ - unsigned idx = seq & JOURNAL_STATE_BUF_MASK; - union journal_res_state s; - - s = journal_state_buf_put(j, idx); - if (!journal_state_count(s, idx)) { - spin_lock(&j->lock); - bch2_journal_buf_put_final(j, seq); - spin_unlock(&j->lock); - } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)) - wake_up(&j->wait); -} - -/* - * This function releases the journal write structure so other threads can - * then proceed to add their keys as well. 
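- * (As the body shows, any still-unused reserved space is first padded out - * with empty btree_keys entries so the journal entry stays well formed, and - * the buffer reference is then dropped via bch2_journal_buf_put().)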
- */ -static inline void bch2_journal_res_put(struct journal *j, - struct journal_res *res) -{ - if (!res->ref) - return; - - lock_release(&j->res_map, _THIS_IP_); - - while (res->u64s) - bch2_journal_add_entry(j, res, - BCH_JSET_ENTRY_btree_keys, - 0, 0, 0); - - bch2_journal_buf_put(j, res->seq); - - res->ref = 0; -} - -int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, - unsigned, struct btree_trans *); - -/* First bits for BCH_WATERMARK: */ -enum journal_res_flags { - __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS, - __JOURNAL_RES_GET_CHECK, -}; - -#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK) -#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK) - -static inline int journal_res_get_fast(struct journal *j, - struct journal_res *res, - unsigned flags) -{ - union journal_res_state old, new; - - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - - /* - * Check if there is still room in the current journal - * entry, smp_rmb() guarantees that reads from reservations.counter - * occur before accessing cur_entry_u64s: - */ - smp_rmb(); - if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) - return 0; - - EBUG_ON(!journal_state_count(new, new.idx)); - - if ((flags & BCH_WATERMARK_MASK) < j->watermark) - return 0; - - new.cur_entry_offset += res->u64s; - journal_state_inc(&new); - - /* - * If the refcount would overflow, we have to wait: - * XXX - tracepoint this: - */ - if (!journal_state_count(new, new.idx)) - return 0; - - if (flags & JOURNAL_RES_GET_CHECK) - return 1; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); - - res->ref = true; - res->offset = old.cur_entry_offset; - res->seq = journal_cur_seq(j); - res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK; - return 1; -} - -static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s, unsigned flags, - struct btree_trans *trans) -{ - int ret; - - EBUG_ON(res->ref); - EBUG_ON(!test_bit(JOURNAL_running, &j->flags)); - - res->u64s = u64s; - - if (journal_res_get_fast(j, res, flags)) - goto out; - - ret = bch2_journal_res_get_slowpath(j, res, flags, trans); - if (ret) - return ret; -out: - if (!(flags & JOURNAL_RES_GET_CHECK)) { - lock_acquire_shared(&j->res_map, 0, - (flags & JOURNAL_RES_GET_NONBLOCK) != 0, - NULL, _THIS_IP_); - EBUG_ON(!res->ref); - BUG_ON(!res->seq); - } - return 0; -} - -/* journal_entry_res: */ - -void bch2_journal_entry_res_resize(struct journal *, - struct journal_entry_res *, - unsigned); - -int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); -void bch2_journal_flush_async(struct journal *, struct closure *); - -int bch2_journal_flush_seq(struct journal *, u64, unsigned); -int bch2_journal_flush(struct journal *); -bool bch2_journal_noflush_seq(struct journal *, u64, u64); -int bch2_journal_meta(struct journal *); - -void bch2_journal_halt_locked(struct journal *); -void bch2_journal_halt(struct journal *); - -static inline int bch2_journal_error(struct journal *j) -{ - return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL - ? 
-BCH_ERR_journal_shutdown : 0; -} - -struct bch_dev; - -void bch2_journal_unblock(struct journal *); -void bch2_journal_block(struct journal *); -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); - -void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); -void bch2_journal_debug_to_text(struct printbuf *, struct journal *); - -int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned); -int bch2_dev_journal_bucket_delete(struct bch_dev *, u64); - -int bch2_dev_journal_alloc(struct bch_dev *, bool); -int bch2_fs_journal_alloc(struct bch_fs *); - -void bch2_dev_journal_stop(struct journal *, struct bch_dev *); - -void bch2_fs_journal_stop(struct journal *); -int bch2_fs_journal_start(struct journal *, u64, u64); -void bch2_journal_set_replay_done(struct journal *); - -void bch2_dev_journal_exit(struct bch_dev *); -int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); -void bch2_fs_journal_exit(struct journal *); -void bch2_fs_journal_init_early(struct journal *); -int bch2_fs_journal_init(struct journal *); - -#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c deleted file mode 100644 index 9e028dbcc3d0..000000000000 --- a/fs/bcachefs/journal_io.c +++ /dev/null @@ -1,2242 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "btree_io.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "checksum.h" -#include "disk_groups.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "journal_seq_blacklist.h" -#include "replicas.h" -#include "sb-clean.h" -#include "trace.h" - -#include <linux/ioprio.h> -#include <linux/string_choices.h> -#include <linux/sched/sysctl.h> - -void bch2_journal_pos_from_member_info_set(struct bch_fs *c) -{ - lockdep_assert_held(&c->sb_lock); - - for_each_member_device(c, ca) { - struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - - m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); - m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); - } -} - -void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) -{ - mutex_lock(&c->sb_lock); - for_each_member_device(c, ca) { - struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); - - unsigned idx = le32_to_cpu(m.last_journal_bucket); - if (idx < ca->journal.nr) - ca->journal.cur_idx = idx; - unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); - if (offset <= ca->mi.bucket_size) - ca->journal.sectors_free = ca->mi.bucket_size - offset; - } - mutex_unlock(&c->sb_lock); -} - -static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p) -{ - struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev); - prt_printf(out, "%s %u:%u:%u (sector %llu)", - ca ? 
ca->name : "(invalid dev)", - p->dev, p->bucket, p->bucket_offset, p->sector); - bch2_dev_put(ca); -} - -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) -{ - darray_for_each(j->ptrs, i) { - if (i != j->ptrs.data) - prt_printf(out, " "); - bch2_journal_ptr_to_text(out, c, i); - } -} - -static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j) -{ - for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) { - struct jset_entry_datetime *datetime = - container_of(entry, struct jset_entry_datetime, entry); - bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); - break; - } -} - -static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) -{ - prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); - bch2_journal_datetime_to_text(out, &j->j); - prt_char(out, ' '); - bch2_journal_ptrs_to_text(out, c, j); -} - -static struct nonce journal_nonce(const struct jset *jset) -{ - return (struct nonce) {{ - [0] = 0, - [1] = ((__le32 *) &jset->seq)[0], - [2] = ((__le32 *) &jset->seq)[1], - [3] = BCH_NONCE_JOURNAL, - }}; -} - -static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) -{ - if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { - *csum = (struct bch_csum) {}; - return false; - } - - *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); - return !bch2_crc_cmp(j->csum, *csum); -} - -static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) -{ - return (seq - c->journal_entries_base_seq) & (~0U >> 1); -} - -static void __journal_replay_free(struct bch_fs *c, - struct journal_replay *i) -{ - struct journal_replay **p = - genradix_ptr(&c->journal_entries, - journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); - - BUG_ON(*p != i); - *p = NULL; - kvfree(i); -} - -static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) -{ - if (blacklisted) - i->ignore_blacklisted = true; - else - i->ignore_not_dirty = true; - - if (!c->opts.read_entire_journal) - __journal_replay_free(c, i); -} - -struct journal_list { - struct closure cl; - u64 last_seq; - struct mutex lock; - int ret; -}; - -#define JOURNAL_ENTRY_ADD_OK 0 -#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 - -/* - * Given a journal entry we just read, add it to the list of journal entries to - * be replayed: - */ -static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, - struct journal_ptr entry_ptr, - struct journal_list *jlist, struct jset *j) -{ - struct genradix_iter iter; - struct journal_replay **_i, *i, *dup; - size_t bytes = vstruct_bytes(j); - u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; - struct printbuf buf = PRINTBUF; - int ret = JOURNAL_ENTRY_ADD_OK; - - if (last_seq && c->opts.journal_rewind) - last_seq = min(last_seq, c->opts.journal_rewind); - - if (!c->journal.oldest_seq_found_ondisk || - le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) - c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); - - /* Is this entry older than the range we need? */ - if (!c->opts.read_entire_journal && - le64_to_cpu(j->seq) < jlist->last_seq) - return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; - - /* - * genradixes are indexed by a ulong, not a u64, so we can't index them - * by sequence number directly: Assume instead that they will all fall - * within the range of +-2billion of the filrst one we find. 
- */ - if (!c->journal_entries_base_seq) - c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); - - /* Drop entries we don't need anymore */ - if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { - genradix_for_each_from(&c->journal_entries, iter, _i, - journal_entry_radix_idx(c, jlist->last_seq)) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - if (le64_to_cpu(i->j.seq) >= last_seq) - break; - - journal_replay_free(c, i, false); - } - } - - jlist->last_seq = max(jlist->last_seq, last_seq); - - _i = genradix_ptr_alloc(&c->journal_entries, - journal_entry_radix_idx(c, le64_to_cpu(j->seq)), - GFP_KERNEL); - if (!_i) - return bch_err_throw(c, ENOMEM_journal_entry_add); - - /* - * Duplicate journal entries? If so we want the one that didn't have a - * checksum error: - */ - dup = *_i; - if (dup) { - bool identical = bytes == vstruct_bytes(&dup->j) && - !memcmp(j, &dup->j, bytes); - bool not_identical = !identical && - entry_ptr.csum_good && - dup->csum_good; - - bool same_device = false; - darray_for_each(dup->ptrs, ptr) - if (ptr->dev == ca->dev_idx) - same_device = true; - - ret = darray_push(&dup->ptrs, entry_ptr); - if (ret) - goto out; - - bch2_journal_replay_to_text(&buf, c, dup); - - fsck_err_on(same_device, - c, journal_entry_dup_same_device, - "duplicate journal entry on same device\n%s", - buf.buf); - - fsck_err_on(not_identical, - c, journal_entry_replicas_data_mismatch, - "found duplicate but non identical journal entries\n%s", - buf.buf); - - if (entry_ptr.csum_good && !identical) - goto replace; - - goto out; - } -replace: - i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); - if (!i) - return bch_err_throw(c, ENOMEM_journal_entry_add); - - darray_init(&i->ptrs); - i->csum_good = entry_ptr.csum_good; - i->ignore_blacklisted = false; - i->ignore_not_dirty = false; - unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); - - if (dup) { - /* The first ptr should represent the jset we kept: */ - darray_for_each(dup->ptrs, ptr) - darray_push(&i->ptrs, *ptr); - __journal_replay_free(c, dup); - } else { - darray_push(&i->ptrs, entry_ptr); - } - - *_i = i; -out: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* this fills in a range with empty jset_entries: */ -static void journal_entry_null_range(void *start, void *end) -{ - struct jset_entry *entry; - - for (entry = start; entry != end; entry = vstruct_next(entry)) - memset(entry, 0, sizeof(*entry)); -} - -#define JOURNAL_ENTRY_REREAD 5 -#define JOURNAL_ENTRY_NONE 6 -#define JOURNAL_ENTRY_BAD 7 - -static void journal_entry_err_msg(struct printbuf *out, - u32 version, - struct jset *jset, - struct jset_entry *entry) -{ - prt_str(out, "invalid journal entry, version="); - bch2_version_to_text(out, version); - - if (entry) { - prt_str(out, " type="); - bch2_prt_jset_entry_type(out, entry->type); - } - - if (!jset) { - prt_printf(out, " in superblock"); - } else { - - prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); - - if (entry) - prt_printf(out, " offset=%zi/%u", - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s)); - } - - prt_str(out, ": "); -} - -#define journal_entry_err(c, version, jset, entry, _err, msg, ...) 
\ -({ \ - struct printbuf _buf = PRINTBUF; \ - \ - journal_entry_err_msg(&_buf, version, jset, entry); \ - prt_printf(&_buf, msg, ##__VA_ARGS__); \ - \ - switch (from.flags & BCH_VALIDATE_write) { \ - case READ: \ - mustfix_fsck_err(c, _err, "%s", _buf.buf); \ - break; \ - case WRITE: \ - bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ - if (bch2_fs_inconsistent(c, \ - "corrupt metadata before write: %s\n", _buf.buf)) {\ - ret = bch_err_throw(c, fsck_errors_not_fixed); \ - goto fsck_err; \ - } \ - break; \ - } \ - \ - printbuf_exit(&_buf); \ - true; \ -}) - -#define journal_entry_err_on(cond, ...) \ - ((cond) ? journal_entry_err(__VA_ARGS__) : false) - -#define FSCK_DELETED_KEY 5 - -static int journal_validate_key(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - struct bkey_i *k, - struct bkey_validate_context from, - unsigned version, int big_endian) -{ - enum bch_validate_flags flags = from.flags; - int write = flags & BCH_VALIDATE_write; - void *next = vstruct_next(entry); - int ret = 0; - - if (journal_entry_err_on(!k->k.u64s, - c, version, jset, entry, - journal_entry_bkey_u64s_0, - "k->u64s 0")) { - entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - journal_entry_null_range(vstruct_next(entry), next); - return FSCK_DELETED_KEY; - } - - if (journal_entry_err_on((void *) bkey_next(k) > - (void *) vstruct_next(entry), - c, version, jset, entry, - journal_entry_bkey_past_end, - "extends past end of journal entry")) { - entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - journal_entry_null_range(vstruct_next(entry), next); - return FSCK_DELETED_KEY; - } - - if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, - c, version, jset, entry, - journal_entry_bkey_bad_format, - "bad format %u", k->k.format)) { - le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); - memmove(k, bkey_next(k), next - (void *) bkey_next(k)); - journal_entry_null_range(vstruct_next(entry), next); - return FSCK_DELETED_KEY; - } - - if (!write) - bch2_bkey_compat(from.level, from.btree, version, big_endian, - write, NULL, bkey_to_packed(k)); - - ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); - if (ret == -BCH_ERR_fsck_delete_bkey) { - le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); - memmove(k, bkey_next(k), next - (void *) bkey_next(k)); - journal_entry_null_range(vstruct_next(entry), next); - return FSCK_DELETED_KEY; - } - if (ret) - goto fsck_err; - - if (write) - bch2_bkey_compat(from.level, from.btree, version, big_endian, - write, NULL, bkey_to_packed(k)); -fsck_err: - return ret; -} - -static int journal_entry_btree_keys_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct bkey_i *k = entry->start; - - from.level = entry->level; - from.btree = entry->btree_id; - - while (k != vstruct_last(entry)) { - int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); - if (ret == FSCK_DELETED_KEY) - continue; - else if (ret) - return ret; - - k = bkey_next(k); - } - - return 0; -} - -static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - bool first = true; - - jset_entry_for_each_key(entry, k) { - /* We may be called on entries that haven't been validated: */ - if (!k->k.u64s) - break; - - if (!first) { - prt_newline(out); - bch2_prt_jset_entry_type(out, entry->type); - prt_str(out, ": "); - } - bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); - prt_char(out, ' '); - 
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); - first = false; - } -} - -static int journal_entry_btree_root_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct bkey_i *k = entry->start; - int ret = 0; - - from.root = true; - from.level = entry->level + 1; - from.btree = entry->btree_id; - - if (journal_entry_err_on(!entry->u64s || - le16_to_cpu(entry->u64s) != k->k.u64s, - c, version, jset, entry, - journal_entry_btree_root_bad_size, - "invalid btree root journal entry: wrong number of keys")) { - void *next = vstruct_next(entry); - /* - * we don't want to null out this jset_entry, - * just the contents, so that later we can tell - * we were _supposed_ to have a btree root - */ - entry->u64s = 0; - journal_entry_null_range(vstruct_next(entry), next); - return 0; - } - - ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); - if (ret == FSCK_DELETED_KEY) - ret = 0; -fsck_err: - return ret; -} - -static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - journal_entry_btree_keys_to_text(out, c, entry); -} - -static int journal_entry_prio_ptrs_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - /* obsolete, don't care: */ - return 0; -} - -static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ -} - -static int journal_entry_blacklist_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - int ret = 0; - - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, - c, version, jset, entry, - journal_entry_blacklist_bad_size, - "invalid journal seq blacklist entry: bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - } -fsck_err: - return ret; -} - -static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_blacklist *bl = - container_of(entry, struct jset_entry_blacklist, entry); - - prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); -} - -static int journal_entry_blacklist_v2_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_blacklist_v2 *bl_entry; - int ret = 0; - - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, - c, version, jset, entry, - journal_entry_blacklist_v2_bad_size, - "invalid journal seq blacklist entry: bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - goto out; - } - - bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); - - if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > - le64_to_cpu(bl_entry->end), - c, version, jset, entry, - journal_entry_blacklist_v2_start_past_end, - "invalid journal seq blacklist entry: start > end")) { - journal_entry_null_range(entry, vstruct_next(entry)); - } -out: -fsck_err: - return ret; -} - -static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_blacklist_v2 *bl = - container_of(entry, struct jset_entry_blacklist_v2, entry); - - prt_printf(out, "start=%llu end=%llu", - le64_to_cpu(bl->start), - le64_to_cpu(bl->end)); -} - -static int 
journal_entry_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - int ret = 0; - - if (journal_entry_err_on(bytes < sizeof(*u), - c, version, jset, entry, - journal_entry_usage_bad_size, - "invalid journal entry usage: bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - -fsck_err: - return ret; -} - -static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); - - prt_str(out, "type="); - bch2_prt_fs_usage_type(out, u->entry.btree_id); - prt_printf(out, " v=%llu", le64_to_cpu(u->v)); -} - -static int journal_entry_data_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - struct printbuf err = PRINTBUF; - int ret = 0; - - if (journal_entry_err_on(bytes < sizeof(*u) || - bytes < sizeof(*u) + u->r.nr_devs, - c, version, jset, entry, - journal_entry_data_usage_bad_size, - "invalid journal entry usage: bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - goto out; - } - - if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), - c, version, jset, entry, - journal_entry_data_usage_bad_size, - "invalid journal entry usage: %s", err.buf)) { - journal_entry_null_range(entry, vstruct_next(entry)); - goto out; - } -out: -fsck_err: - printbuf_exit(&err); - return ret; -} - -static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); - - bch2_replicas_entry_to_text(out, &u->r); - prt_printf(out, "=%llu", le64_to_cpu(u->v)); -} - -static int journal_entry_clock_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_clock *clock = - container_of(entry, struct jset_entry_clock, entry); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - int ret = 0; - - if (journal_entry_err_on(bytes != sizeof(*clock), - c, version, jset, entry, - journal_entry_clock_bad_size, - "bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - - if (journal_entry_err_on(clock->rw > 1, - c, version, jset, entry, - journal_entry_clock_bad_rw, - "bad rw")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - -fsck_err: - return ret; -} - -static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_clock *clock = - container_of(entry, struct jset_entry_clock, entry); - - prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time)); -} - -static int journal_entry_dev_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_dev_usage *u = - 
container_of(entry, struct jset_entry_dev_usage, entry); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - unsigned expected = sizeof(*u); - int ret = 0; - - if (journal_entry_err_on(bytes < expected, - c, version, jset, entry, - journal_entry_dev_usage_bad_size, - "bad size (%u < %u)", - bytes, expected)) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - - if (journal_entry_err_on(u->pad, - c, version, jset, entry, - journal_entry_dev_usage_bad_pad, - "bad pad")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - -fsck_err: - return ret; -} - -static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_dev_usage *u = - container_of(entry, struct jset_entry_dev_usage, entry); - unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); - - if (vstruct_bytes(entry) < sizeof(*u)) - return; - - prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); - - printbuf_indent_add(out, 2); - for (i = 0; i < nr_types; i++) { - prt_newline(out); - bch2_prt_data_type(out, i); - prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", - le64_to_cpu(u->d[i].buckets), - le64_to_cpu(u->d[i].sectors), - le64_to_cpu(u->d[i].fragmented)); - } - printbuf_indent_sub(out, 2); -} - -static int journal_entry_log_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - return 0; -} - -static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - - prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d); -} - -static int journal_entry_overwrite_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - from.flags = 0; - return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, from); -} - -static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - journal_entry_btree_keys_to_text(out, c, entry); -} - -static int journal_entry_log_bkey_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - from.flags = 0; - return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, from); -} - -static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - journal_entry_btree_keys_to_text(out, c, entry); -} - -static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, from); -} - -static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - journal_entry_btree_keys_to_text(out, c, entry); -} - -static int journal_entry_datetime_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - unsigned bytes = vstruct_bytes(entry); - unsigned expected = 16; - int ret = 0; - - if (journal_entry_err_on(vstruct_bytes(entry) < expected, - c, version, jset, entry, - 
journal_entry_dev_usage_bad_size, - "bad size (%u < %u)", - bytes, expected)) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } -fsck_err: - return ret; -} - -static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_datetime *datetime = - container_of(entry, struct jset_entry_datetime, entry); - - bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); -} - -struct jset_entry_ops { - int (*validate)(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, - struct bkey_validate_context); - void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); -}; - -static const struct jset_entry_ops bch2_jset_entry_ops[] = { -#define x(f, nr) \ - [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ - .validate = journal_entry_##f##_validate, \ - .to_text = journal_entry_##f##_to_text, \ - }, - BCH_JSET_ENTRY_TYPES() -#undef x -}; - -int bch2_journal_entry_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - return entry->type < BCH_JSET_ENTRY_NR - ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, - version, big_endian, from) - : 0; -} - -void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - bch2_prt_jset_entry_type(out, entry->type); - - if (entry->type < BCH_JSET_ENTRY_NR) { - prt_str(out, ": "); - bch2_jset_entry_ops[entry->type].to_text(out, c, entry); - } -} - -static int jset_validate_entries(struct bch_fs *c, struct jset *jset, - enum bch_validate_flags flags) -{ - struct bkey_validate_context from = { - .flags = flags, - .from = BKEY_VALIDATE_journal, - .journal_seq = le64_to_cpu(jset->seq), - }; - - unsigned version = le32_to_cpu(jset->version); - int ret = 0; - - vstruct_for_each(jset, entry) { - from.journal_offset = (u64 *) entry - jset->_data; - - if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), - c, version, jset, entry, - journal_entry_past_jset_end, - "journal entry extends past end of jset")) { - jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); - break; - } - - ret = bch2_journal_entry_validate(c, jset, entry, version, - JSET_BIG_ENDIAN(jset), from); - if (ret) - break; - } -fsck_err: - return ret; -} - -static int jset_validate(struct bch_fs *c, - struct bch_dev *ca, - struct jset *jset, u64 sector, - enum bch_validate_flags flags) -{ - struct bkey_validate_context from = { - .flags = flags, - .from = BKEY_VALIDATE_journal, - .journal_seq = le64_to_cpu(jset->seq), - }; - int ret = 0; - - if (le64_to_cpu(jset->magic) != jset_magic(c)) - return JOURNAL_ENTRY_NONE; - - unsigned version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), - c, version, jset, NULL, - jset_unsupported_version, - "%s sector %llu seq %llu: incompatible journal entry version %u.%u", - ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), - BCH_VERSION_MAJOR(version), - BCH_VERSION_MINOR(version))) { - /* don't try to continue: */ - return -EINVAL; - } - - if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), - c, version, jset, NULL, - jset_unknown_csum, - "%s sector %llu seq %llu: journal entry with unknown csum type %llu", - ca ? 
ca->name : c->name, - sector, le64_to_cpu(jset->seq), - JSET_CSUM_TYPE(jset))) - ret = JOURNAL_ENTRY_BAD; - - /* last_seq is ignored when JSET_NO_FLUSH is true */ - if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && - le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), - c, version, jset, NULL, - jset_last_seq_newer_than_seq, - "invalid journal entry: last_seq > seq (%llu > %llu)", - le64_to_cpu(jset->last_seq), - le64_to_cpu(jset->seq))) { - jset->last_seq = jset->seq; - return JOURNAL_ENTRY_BAD; - } - - ret = jset_validate_entries(c, jset, flags); -fsck_err: - return ret; -} - -static int jset_validate_early(struct bch_fs *c, - struct bch_dev *ca, - struct jset *jset, u64 sector, - unsigned bucket_sectors_left, - unsigned sectors_read) -{ - struct bkey_validate_context from = { - .from = BKEY_VALIDATE_journal, - .journal_seq = le64_to_cpu(jset->seq), - }; - int ret = 0; - - if (le64_to_cpu(jset->magic) != jset_magic(c)) - return JOURNAL_ENTRY_NONE; - - unsigned version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), - c, version, jset, NULL, - jset_unsupported_version, - "%s sector %llu seq %llu: unknown journal entry version %u.%u", - ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), - BCH_VERSION_MAJOR(version), - BCH_VERSION_MINOR(version))) { - /* don't try to continue: */ - return -EINVAL; - } - - size_t bytes = vstruct_bytes(jset); - if (bytes > (sectors_read << 9) && - sectors_read < bucket_sectors_left) - return JOURNAL_ENTRY_REREAD; - - if (journal_entry_err_on(bytes > bucket_sectors_left << 9, - c, version, jset, NULL, - jset_past_bucket_end, - "%s sector %llu seq %llu: journal entry too big (%zu bytes)", - ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), bytes)) - le32_add_cpu(&jset->u64s, - -((bytes - (bucket_sectors_left << 9)) / 8)); -fsck_err: - return ret; -} - -struct journal_read_buf { - void *data; - size_t size; -}; - -static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b, - size_t new_size) -{ - void *n; - - /* the bios are sized for this many pages, max: */ - if (new_size > JOURNAL_ENTRY_SIZE_MAX) - return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); - - new_size = roundup_pow_of_two(new_size); - n = kvmalloc(new_size, GFP_KERNEL); - if (!n) - return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); - - kvfree(b->data); - b->data = n; - b->size = new_size; - return 0; -} - -static int journal_read_bucket(struct bch_dev *ca, - struct journal_read_buf *buf, - struct journal_list *jlist, - unsigned bucket) -{ - struct bch_fs *c = ca->fs; - struct journal_device *ja = &ca->journal; - struct jset *j = NULL; - unsigned sectors, sectors_read = 0; - u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), - end = offset + ca->mi.bucket_size; - bool saw_bad = false, csum_good; - int ret = 0; - - pr_debug("reading %u", bucket); - - while (offset < end) { - if (!sectors_read) { - struct bio *bio; - unsigned nr_bvecs; -reread: - sectors_read = min_t(unsigned, - end - offset, buf->size >> 9); - nr_bvecs = buf_pages(buf->data, sectors_read << 9); - - bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); - if (!bio) - return bch_err_throw(c, ENOMEM_journal_read_bucket); - bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); - - bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, buf->data, sectors_read << 9); - - u64 submit_time = local_clock(); - ret = submit_bio_wait(bio); - kfree(bio); - - if (!ret && bch2_meta_read_fault("journal")) - ret = bch_err_throw(c, 
EIO_fault_injected); - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, - submit_time, !ret); - - if (ret) { - bch_err_dev_ratelimited(ca, - "journal read error: sector %llu", offset); - /* - * We don't error out of the recovery process - * here, since the relevant journal entry may be - * found on a different device, and missing or - * no journal entries will be handled later - */ - return 0; - } - - j = buf->data; - } - - ret = jset_validate_early(c, ca, j, offset, - end - offset, sectors_read); - switch (ret) { - case 0: - sectors = vstruct_sectors(j, c->block_bits); - break; - case JOURNAL_ENTRY_REREAD: - if (vstruct_bytes(j) > buf->size) { - ret = journal_read_buf_realloc(c, buf, - vstruct_bytes(j)); - if (ret) - return ret; - } - goto reread; - case JOURNAL_ENTRY_NONE: - if (!saw_bad) - return 0; - /* - * On checksum error we don't really trust the size - * field of the journal entry we read, so try reading - * again at next block boundary: - */ - sectors = block_sectors(c); - goto next_block; - default: - return ret; - } - - if (le64_to_cpu(j->seq) > ja->highest_seq_found) { - ja->highest_seq_found = le64_to_cpu(j->seq); - ja->cur_idx = bucket; - ja->sectors_free = ca->mi.bucket_size - - bucket_remainder(ca, offset) - sectors; - } - - /* - * This happens sometimes if we don't have discards on - - * when we've partially overwritten a bucket with new - * journal entries. We don't need the rest of the - * bucket: - */ - if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - return 0; - - ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - - struct bch_csum csum; - csum_good = jset_csum_good(c, j, &csum); - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); - - if (!csum_good) { - /* - * Don't print an error here, we'll print the error - * later if we need this journal entry - */ - saw_bad = true; - } - - ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), - j->encrypted_start, - vstruct_end(j) - (void *) j->encrypted_start); - bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); - - mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, (struct journal_ptr) { - .csum_good = csum_good, - .csum = csum, - .dev = ca->dev_idx, - .bucket = bucket, - .bucket_offset = offset - - bucket_to_sector(ca, ja->buckets[bucket]), - .sector = offset, - }, jlist, j); - mutex_unlock(&jlist->lock); - - switch (ret) { - case JOURNAL_ENTRY_ADD_OK: - break; - case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: - break; - default: - return ret; - } -next_block: - pr_debug("next"); - offset += sectors; - sectors_read -= sectors; - j = ((void *) j) + (sectors << 9); - } - - return 0; -} - -static CLOSURE_CALLBACK(bch2_journal_read_device) -{ - closure_type(ja, struct journal_device, read); - struct bch_dev *ca = container_of(ja, struct bch_dev, journal); - struct bch_fs *c = ca->fs; - struct journal_list *jlist = - container_of(cl->parent, struct journal_list, cl); - struct journal_read_buf buf = { NULL, 0 }; - unsigned i; - int ret = 0; - - if (!ja->nr) - goto out; - - ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE); - if (ret) - goto err; - - pr_debug("%u journal buckets", ja->nr); - - for (i = 0; i < ja->nr; i++) { - ret = journal_read_bucket(ca, &buf, jlist, i); - if (ret) - goto err; - } - - /* - * Set dirty_idx to indicate the entire journal is full and needs to be - * reclaimed - journal reclaim will immediately reclaim whatever isn't - * pinned when it first runs: - */ - ja->discard_idx = ja->dirty_idx_ondisk = - ja->dirty_idx = (ja->cur_idx + 1) % 
ja->nr; -out: - bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); - kvfree(buf.data); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read); - closure_return(cl); - return; -err: - mutex_lock(&jlist->lock); - jlist->ret = ret; - mutex_unlock(&jlist->lock); - goto out; -} - -noinline_for_stack -static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j) -{ - struct printbuf buf = PRINTBUF; - enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j); - bool have_good = false; - - prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq)); - bch2_journal_datetime_to_text(&buf, &j->j); - prt_newline(&buf); - - darray_for_each(j->ptrs, ptr) - if (!ptr->csum_good) { - bch2_journal_ptr_to_text(&buf, c, ptr); - prt_char(&buf, ' '); - bch2_csum_to_text(&buf, csum_type, ptr->csum); - prt_newline(&buf); - } else { - have_good = true; - } - - prt_printf(&buf, "should be "); - bch2_csum_to_text(&buf, csum_type, j->j.csum); - - if (have_good) - prt_printf(&buf, "\n(had good copy on another device)"); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); -} - -noinline_for_stack -static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq) -{ - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct genradix_iter radix_iter; - struct journal_replay *i, **_i, *prev = NULL; - u64 seq = start_seq; - - genradix_for_each(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - BUG_ON(seq > le64_to_cpu(i->j.seq)); - - while (seq < le64_to_cpu(i->j.seq)) { - while (seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (seq == le64_to_cpu(i->j.seq)) - break; - - u64 missing_start = seq; - - while (seq < le64_to_cpu(i->j.seq) && - !bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - u64 missing_end = seq - 1; - - printbuf_reset(&buf); - prt_printf(&buf, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", - missing_start, missing_end, - start_seq, end_seq); - - prt_printf(&buf, "\nprev at "); - if (prev) { - bch2_journal_ptrs_to_text(&buf, c, prev); - prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); - } else - prt_printf(&buf, "(none)"); - - prt_printf(&buf, "\nnext at "); - bch2_journal_ptrs_to_text(&buf, c, i); - prt_printf(&buf, ", continue?"); - - fsck_err(c, journal_entries_missing, "%s", buf.buf); - } - - prev = i; - seq++; - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_journal_read(struct bch_fs *c, - u64 *last_seq, - u64 *blacklist_seq, - u64 *start_seq) -{ - struct journal_list jlist; - struct journal_replay *i, **_i; - struct genradix_iter radix_iter; - struct printbuf buf = PRINTBUF; - bool degraded = false, last_write_torn = false; - u64 seq; - int ret = 0; - - closure_init_stack(&jlist.cl); - mutex_init(&jlist.lock); - jlist.last_seq = 0; - jlist.ret = 0; - - for_each_member_device(c, ca) { - if (!c->opts.fsck && - !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) - continue; - - if ((ca->mi.state == BCH_MEMBER_STATE_rw || - ca->mi.state == BCH_MEMBER_STATE_ro) && - enumerated_ref_tryget(&ca->io_ref[READ], - BCH_DEV_READ_REF_journal_read)) - closure_call(&ca->journal.read, - bch2_journal_read_device, - system_unbound_wq, - &jlist.cl); - else - degraded = true; - } - - while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2)) - ; - - if (jlist.ret) - return jlist.ret; - - *last_seq = 0; - *start_seq = 0; - *blacklist_seq = 0; - - /* - * Find most recent flush entry, and ignore newer non flush entries - - * those entries will be blacklisted: - */ - genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - if (!*start_seq) - *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; - - if (JSET_NO_FLUSH(&i->j)) { - i->ignore_blacklisted = true; - continue; - } - - if (!last_write_torn && !i->csum_good) { - last_write_torn = true; - i->ignore_blacklisted = true; - continue; - } - - struct bkey_validate_context from = { - .from = BKEY_VALIDATE_journal, - .journal_seq = le64_to_cpu(i->j.seq), - }; - if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), - c, le32_to_cpu(i->j.version), &i->j, NULL, - jset_last_seq_newer_than_seq, - "invalid journal entry: last_seq > seq (%llu > %llu)", - le64_to_cpu(i->j.last_seq), - le64_to_cpu(i->j.seq))) - i->j.last_seq = i->j.seq; - - *last_seq = le64_to_cpu(i->j.last_seq); - *blacklist_seq = le64_to_cpu(i->j.seq) + 1; - break; - } - - if (!*start_seq) { - bch_info(c, "journal read done, but no entries found"); - return 0; - } - - if (!*last_seq) { - fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, - "journal read done, but no entries found after dropping non-flushes"); - return 0; - } - - printbuf_reset(&buf); - prt_printf(&buf, "journal read done, replaying entries %llu-%llu", - *last_seq, *blacklist_seq - 1); - - /* - * Drop blacklisted entries and entries older than last_seq (or start of - * journal rewind: - */ - u64 drop_before = *last_seq; - if (c->opts.journal_rewind) { - drop_before = min(drop_before, c->opts.journal_rewind); - prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind); - } - - *last_seq = drop_before; - if (*start_seq != *blacklist_seq) - prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1); - bch_info(c, "%s", buf.buf); - genradix_for_each(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if 
(journal_replay_ignore(i)) - continue; - - seq = le64_to_cpu(i->j.seq); - if (seq < drop_before) { - journal_replay_free(c, i, false); - continue; - } - - if (bch2_journal_seq_is_blacklisted(c, seq, true)) { - fsck_err_on(!JSET_NO_FLUSH(&i->j), c, - jset_seq_blacklisted, - "found blacklisted journal entry %llu", seq); - i->ignore_blacklisted = true; - } - } - - ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1); - if (ret) - goto err; - - genradix_for_each(&c->journal_entries, radix_iter, _i) { - union bch_replicas_padded replicas = { - .e.data_type = BCH_DATA_journal, - .e.nr_devs = 0, - .e.nr_required = 1, - }; - - i = *_i; - if (journal_replay_ignore(i)) - continue; - - /* - * Don't print checksum errors until we know we're going to use - * a given journal entry: - */ - darray_for_each(i->ptrs, ptr) - if (!ptr->csum_good) { - bch2_journal_print_checksum_error(c, i); - break; - } - - ret = jset_validate(c, - bch2_dev_have_ref(c, i->ptrs.data[0].dev), - &i->j, - i->ptrs.data[0].sector, - READ); - if (ret) - goto err; - - darray_for_each(i->ptrs, ptr) - replicas_entry_add_dev(&replicas.e, ptr->dev); - - bch2_replicas_entry_sort(&replicas.e); - - printbuf_reset(&buf); - bch2_replicas_entry_to_text(&buf, &replicas.e); - - if (!degraded && - !bch2_replicas_marked(c, &replicas.e) && - (le64_to_cpu(i->j.seq) == *last_seq || - fsck_err(c, journal_entry_replicas_not_marked, - "superblock not marked as containing replicas for journal entry %llu\n%s", - le64_to_cpu(i->j.seq), buf.buf))) { - ret = bch2_mark_replicas(c, &replicas.e); - if (ret) - goto err; - } - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* journal write: */ - -static void journal_advance_devs_to_next_bucket(struct journal *j, - struct dev_alloc_list *devs, - unsigned sectors, __le64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - guard(rcu)(); - darray_for_each(*devs, i) { - struct bch_dev *ca = rcu_dereference(c->devs[*i]); - if (!ca) - continue; - - struct journal_device *ja = &ca->journal; - - if (sectors > ja->sectors_free && - sectors <= ca->mi.bucket_size && - bch2_journal_dev_buckets_available(j, ja, - journal_space_discarded)) { - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = ca->mi.bucket_size; - - /* - * ja->bucket_seq[ja->cur_idx] must always have - * something sensible: - */ - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); - } - } -} - -static void __journal_write_alloc(struct journal *j, - struct journal_buf *w, - struct dev_alloc_list *devs, - unsigned sectors, - unsigned *replicas, - unsigned replicas_want) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - darray_for_each(*devs, i) { - struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE, - BCH_DEV_WRITE_REF_journal_write); - if (!ca) - continue; - - struct journal_device *ja = &ca->journal; - - /* - * Check that we can use this device, and aren't already using - * it: - */ - if (!ca->mi.durability || - ca->mi.state != BCH_MEMBER_STATE_rw || - !ja->nr || - bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || - sectors > ja->sectors_free) { - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); - continue; - } - - bch2_dev_stripe_increment(ca, &j->wp.stripe); - - bch2_bkey_append_ptr(&w->key, - (struct bch_extent_ptr) { - .offset = bucket_to_sector(ca, - ja->buckets[ja->cur_idx]) + - ca->mi.bucket_size - - ja->sectors_free, - .dev = ca->dev_idx, - }); - - ja->sectors_free -= sectors; - ja->bucket_seq[ja->cur_idx] = 
le64_to_cpu(w->data->seq); - - *replicas += ca->mi.durability; - - if (*replicas >= replicas_want) - break; - } -} - -static int journal_write_alloc(struct journal *j, struct journal_buf *w, - unsigned *replicas) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_devs_mask devs; - struct dev_alloc_list devs_sorted; - unsigned sectors = vstruct_sectors(w->data, c->block_bits); - unsigned target = c->opts.metadata_target ?: - c->opts.foreground_target; - unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas); - unsigned replicas_need = min_t(unsigned, replicas_want, - READ_ONCE(c->opts.metadata_replicas_required)); - bool advance_done = false; - -retry_target: - devs = target_rw_devs(c, BCH_DATA_journal, target); - bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted); -retry_alloc: - __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want); - - if (likely(*replicas >= replicas_want)) - goto done; - - if (!advance_done) { - journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); - advance_done = true; - goto retry_alloc; - } - - if (*replicas < replicas_want && target) { - /* Retry from all devices: */ - target = 0; - advance_done = false; - goto retry_target; - } -done: - BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - -#if 0 - /* - * XXX: we need a way to alert the user when we go degraded for any - * reason - */ - if (*replicas < min(replicas_want, - dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) { - } -#endif - - return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; -} - -static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - /* we aren't holding j->lock: */ - unsigned new_size = READ_ONCE(j->buf_size_want); - void *new_buf; - - if (buf->buf_size >= new_size) - return; - - size_t btree_write_buffer_size = new_size / 64; - - if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) - return; - - new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); - if (!new_buf) - return; - - memcpy(new_buf, buf->data, buf->buf_size); - - spin_lock(&j->lock); - swap(buf->data, new_buf); - swap(buf->buf_size, new_size); - spin_unlock(&j->lock); - - kvfree(new_buf); -} - -static CLOSURE_CALLBACK(journal_write_done) -{ - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - union bch_replicas_padded replicas; - u64 seq = le64_to_cpu(w->data->seq); - int err = 0; - - bch2_time_stats_update(!JSET_NO_FLUSH(w->data) - ? 
j->flush_write_time - : j->noflush_write_time, j->write_start_time); - - if (!w->devs_written.nr) { - err = bch_err_throw(c, journal_write_err); - } else { - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, - w->devs_written); - err = bch2_mark_replicas(c, &replicas.e); - } - - if (err && !bch2_journal_error(j)) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - if (err == -BCH_ERR_journal_write_err) - prt_printf(&buf, "unable to write journal to sufficient devices\n"); - else - prt_printf(&buf, "journal write error marking replicas: %s\n", - bch2_err_str(err)); - - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - - closure_debug_destroy(cl); - - spin_lock(&j->lock); - if (seq >= j->pin.front) - journal_seq_pin(j, seq)->devs = w->devs_written; - if (err && (!j->err_seq || seq < j->err_seq)) - j->err_seq = seq; - w->write_done = true; - - if (!j->free_buf || j->free_buf_size < w->buf_size) { - swap(j->free_buf, w->data); - swap(j->free_buf_size, w->buf_size); - } - - if (w->data) { - void *buf = w->data; - w->data = NULL; - w->buf_size = 0; - - spin_unlock(&j->lock); - kvfree(buf); - spin_lock(&j->lock); - } - - bool completed = false; - bool do_discards = false; - - for (seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j); - seq++) { - w = j->buf + (seq & JOURNAL_BUF_MASK); - if (!w->write_done) - break; - - if (!j->err_seq && !w->noflush) { - j->flushed_seq_ondisk = seq; - j->last_seq_ondisk = w->last_seq; - - closure_wake_up(&c->freelist_wait); - bch2_reset_alloc_cursors(c); - do_discards = true; - } - - j->seq_ondisk = seq; - - /* - * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard - * more buckets: - * - * Must come before signaling write completion, for - * bch2_fs_journal_stop(): - */ - if (j->watermark != BCH_WATERMARK_stripe) - journal_reclaim_kick(&c->journal); - - closure_wake_up(&w->wait); - completed = true; - } - - if (completed) { - bch2_journal_reclaim_fast(j); - bch2_journal_space_available(j); - - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); - - journal_wake(j); - } - - if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && - j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { - struct journal_buf *buf = journal_cur_buf(j); - long delta = buf->expires - jiffies; - - /* - * We don't close a journal entry to write it while there are - * previous entries still in flight - the current journal entry - * might want to be written now: - */ - mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); - } - - /* - * We don't typically trigger journal writes from here - the next journal - * write will be triggered immediately after the previous one is - * allocated, in bch2_journal_write() - but the journal write error path - * is special: - */ - bch2_journal_do_writes(j); - spin_unlock(&j->lock); - - if (do_discards) - bch2_do_discards(c); -} - -static void journal_write_endio(struct bio *bio) -{ - struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); - struct bch_dev *ca = jbio->ca; - struct journal *j = &ca->fs->journal; - struct journal_buf *w = j->buf + jbio->buf_idx; - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, - jbio->submit_time, !bio->bi_status); - - if (bio->bi_status) { - bch_err_dev_ratelimited(ca, - "error writing journal entry %llu: %s", - le64_to_cpu(w->data->seq), - bch2_blk_status_to_str(bio->bi_status)); - - unsigned long flags; - spin_lock_irqsave(&j->err_lock, flags); - 
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); - spin_unlock_irqrestore(&j->err_lock, flags); - } - - closure_put(&w->io); - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); -} - -static CLOSURE_CALLBACK(journal_write_submit) -{ - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned sectors = vstruct_sectors(w->data, c->block_bits); - - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], - sectors); - - struct journal_device *ja = &ca->journal; - struct journal_bio *jbio = ja->bio[w->idx]; - struct bio *bio = &jbio->bio; - - jbio->submit_time = local_clock(); - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = ptr->offset; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); - - BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); - ca->prev_journal_sector = bio->bi_iter.bi_sector; - - if (!JSET_NO_FLUSH(w->data)) - bio->bi_opf |= REQ_FUA; - if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) - bio->bi_opf |= REQ_PREFLUSH; - - bch2_bio_map(bio, w->data, sectors << 9); - - trace_and_count(c, journal_write, bio); - closure_bio_submit(bio, cl); - - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - } - - continue_at(cl, journal_write_done, j->wq); -} - -static CLOSURE_CALLBACK(journal_write_preflush) -{ - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - /* - * Wait for previous journal writes to complete; they won't necessarily - * be flushed if they're still in flight - */ - if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { - spin_lock(&j->lock); - if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { - closure_wait(&j->async_wait, cl); - spin_unlock(&j->lock); - continue_at(cl, journal_write_preflush, j->wq); - return; - } - spin_unlock(&j->lock); - } - - if (w->separate_flush) { - for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) { - enumerated_ref_get(&ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_journal_write); - - struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->bio[w->idx]->bio; - bio_reset(bio, ca->disk_sb.bdev, - REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - closure_bio_submit(bio, cl); - } - - continue_at(cl, journal_write_submit, j->wq); - } else { - /* - * no need to punt to another work item if we're not waiting on - * preflushes - */ - journal_write_submit(&cl->work); - } -} - -static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset_entry *start, *end; - struct jset *jset = w->data; - struct journal_keys_to_wb wb = { NULL }; - unsigned u64s; - unsigned long btree_roots_have = 0; - u64 seq = le64_to_cpu(jset->seq); - int ret; - - /* - * Simple compaction, dropping empty jset_entries (from journal - * reservations that weren't fully used) and merging jset_entries that - * can be. 
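For reference, the flush semantics in journal_write_submit()/journal_write_preflush() above reduce to one small decision: a flush write must reach stable media (REQ_FUA) and, unless separate preflush bios were already issued, must also order against prior writes (REQ_PREFLUSH). A minimal sketch of that rule, assuming the real JSET_NO_FLUSH() macro and block-layer request flags; pick_journal_write_opf() is a hypothetical helper for illustration, not part of the code above:

static blk_opf_t pick_journal_write_opf(struct jset *data, bool separate_flush)
{
	/* all journal writes are synchronous metadata writes: */
	blk_opf_t opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;

	if (!JSET_NO_FLUSH(data)) {
		/* a flush write must be on stable media when it completes: */
		opf |= REQ_FUA;
		/*
		 * ... and must order against previously submitted writes,
		 * unless separate REQ_PREFLUSH bios were issued first:
		 */
		if (!separate_flush)
			opf |= REQ_PREFLUSH;
	}
	return opf;
}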
- * - * If we wanted to be really fancy here, we could sort all the keys in - * the jset and drop keys that were overwritten - probably not worth it: - */ - vstruct_for_each(jset, i) { - unsigned u64s = le16_to_cpu(i->u64s); - - /* Empty entry: */ - if (!u64s) - continue; - - /* - * New btree roots are set by journalling them; when the journal - * entry gets written we have to propagate them to - * c->btree_roots - * - * But, every journal entry we write has to contain all the - * btree roots (at least for now); so after we copy btree roots - * to c->btree_roots we have to get any missing btree roots and - * add them to this journal entry: - */ - switch (i->type) { - case BCH_JSET_ENTRY_btree_root: - bch2_journal_entry_to_btree_root(c, i); - __set_bit(i->btree_id, &btree_roots_have); - break; - case BCH_JSET_ENTRY_write_buffer_keys: - EBUG_ON(!w->need_flush_to_write_buffer); - - if (!wb.wb) - bch2_journal_keys_to_write_buffer_start(c, &wb, seq); - - jset_entry_for_each_key(i, k) { - ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); - if (ret) { - bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", - bch2_err_str(ret)); - bch2_journal_keys_to_write_buffer_end(c, &wb); - return ret; - } - } - i->type = BCH_JSET_ENTRY_btree_keys; - break; - } - } - - if (wb.wb) { - ret = bch2_journal_keys_to_write_buffer_end(c, &wb); - if (ret) { - bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", - bch2_err_str(ret)); - return ret; - } - } - - spin_lock(&c->journal.lock); - w->need_flush_to_write_buffer = false; - spin_unlock(&c->journal.lock); - - start = end = vstruct_last(jset); - - end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); - - struct jset_entry_datetime *d = - container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); - d->entry.type = BCH_JSET_ENTRY_datetime; - d->seconds = cpu_to_le64(ktime_get_real_seconds()); - - bch2_journal_super_entries_add_common(c, &end, seq); - u64s = (u64 *) end - (u64 *) start; - - WARN_ON(u64s > j->entry_u64s_reserved); - - le32_add_cpu(&jset->u64s, u64s); - - unsigned sectors = vstruct_sectors(jset, c->block_bits); - - if (sectors > w->sectors) { - bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", - vstruct_bytes(jset), w->sectors << 9, - u64s, w->u64s_reserved, j->entry_u64s_reserved); - return -EINVAL; - } - - return 0; -} - -static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset *jset = w->data; - u64 seq = le64_to_cpu(jset->seq); - bool validate_before_checksum = false; - int ret = 0; - - jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = cpu_to_le32(c->sb.version); - - SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); - SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); - - if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) - j->last_empty_seq = seq; - - if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) - validate_before_checksum = true; - - if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) - validate_before_checksum = true; - - if (validate_before_checksum && - (ret = jset_validate(c, NULL, jset, 0, WRITE))) - return ret; - - ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), - jset->encrypted_start, - vstruct_end(jset) - (void *) jset->encrypted_start); - if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret))) - return ret; - - 
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), - journal_nonce(jset), jset); - - if (!validate_before_checksum && - (ret = jset_validate(c, NULL, jset, 0, WRITE))) - return ret; - - unsigned sectors = vstruct_sectors(jset, c->block_bits); - unsigned bytes = vstruct_bytes(jset); - memset((void *) jset + bytes, 0, (sectors << 9) - bytes); - return 0; -} - -static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - int error = bch2_journal_error(j); - - /* - * If the journal is in an error state - we did an emergency shutdown - - * we prefer to continue doing journal writes. We just mark them as - * noflush so they'll never be used, but they'll still be visible to the - * list_journal tool - this helps in debugging. - * - * There's a caveat: the first journal write after marking the - * superblock dirty must always be a flush write, because on startup - * from a clean shutdown we didn't necessarily read the journal and the - * new journal write might overwrite whatever was in the journal - * previously - we can't leave the journal without any flush writes in - * it. - * - * So if we're in an error state, and we're still starting up, we don't - * write anything at all. - */ - if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) - return error; - - if (error || - w->noflush || - (!w->must_flush && - time_before(jiffies, j->last_flush_write + - msecs_to_jiffies(c->opts.journal_flush_delay)) && - test_bit(JOURNAL_may_skip_flush, &j->flags))) { - w->noflush = true; - SET_JSET_NO_FLUSH(w->data, true); - w->data->last_seq = 0; - w->last_seq = 0; - - j->nr_noflush_writes++; - } else { - w->must_flush = true; - j->last_flush_write = jiffies; - j->nr_flush_writes++; - clear_bit(JOURNAL_need_flush_write, &j->flags); - } - - return 0; -} - -CLOSURE_CALLBACK(bch2_journal_write) -{ - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - union bch_replicas_padded replicas; - unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]); - int ret; - - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - BUG_ON(!w->write_started); - BUG_ON(w->write_allocated); - BUG_ON(w->write_done); - - j->write_start_time = local_clock(); - - spin_lock(&j->lock); - if (nr_rw_members > 1) - w->separate_flush = true; - - ret = bch2_journal_write_pick_flush(j, w); - spin_unlock(&j->lock); - - if (unlikely(ret)) - goto err; - - mutex_lock(&j->buf_lock); - journal_buf_realloc(j, w); - - ret = bch2_journal_write_prep(j, w); - mutex_unlock(&j->buf_lock); - - if (unlikely(ret)) - goto err; - - unsigned replicas_allocated = 0; - while (1) { - ret = journal_write_alloc(j, w, &replicas_allocated); - if (!ret || !j->can_discard) - break; - - bch2_journal_do_discards(j); - } - - if (unlikely(ret)) - goto err_allocate_write; - - ret = bch2_journal_write_checksum(j, w); - if (unlikely(ret)) - goto err; - - spin_lock(&j->lock); - /* - * write is allocated, no longer need to account for it in - * bch2_journal_space_available(): - */ - w->sectors = 0; - w->write_allocated = true; - j->entry_bytes_written += vstruct_bytes(w->data); - - /* - * journal entry has been compacted and allocated, recalculate space - * available: - */ - bch2_journal_space_available(j); - bch2_journal_do_writes(j); - spin_unlock(&j->lock); - - w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - - /* - * Mark journal replicas before we submit the write 
to guarantee - * recovery will find the journal entries after a crash. - */ - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, - w->devs_written); - ret = bch2_mark_replicas(c, &replicas.e); - if (ret) - goto err; - - if (c->opts.nochanges) - goto no_io; - - if (!JSET_NO_FLUSH(w->data)) - continue_at(cl, journal_write_preflush, j->wq); - else - continue_at(cl, journal_write_submit, j->wq); - return; -err_allocate_write: - if (!bch2_journal_error(j)) { - struct printbuf buf = PRINTBUF; - - bch2_journal_debug_to_text(&buf, j); - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), - le64_to_cpu(w->data->seq), - vstruct_sectors(w->data, c->block_bits), - bch2_err_str(ret)); - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } -err: - bch2_fatal_error(c); -no_io: - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); - } - - continue_at(cl, journal_write_done, j->wq); -} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h deleted file mode 100644 index 6fa82c4050fe..000000000000 --- a/fs/bcachefs/journal_io.h +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_IO_H -#define _BCACHEFS_JOURNAL_IO_H - -#include "darray.h" - -void bch2_journal_pos_from_member_info_set(struct bch_fs *); -void bch2_journal_pos_from_member_info_resume(struct bch_fs *); - -struct journal_ptr { - bool csum_good; - struct bch_csum csum; - u8 dev; - u32 bucket; - u32 bucket_offset; - u64 sector; -}; - -/* - * Only used for holding the journal entries we read in btree_journal_read() - * during cache_registration - */ -struct journal_replay { - DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; - - bool csum_good; - bool ignore_blacklisted; - bool ignore_not_dirty; - /* must be last: */ - struct jset j; -}; - -static inline bool journal_replay_ignore(struct journal_replay *i) -{ - return !i || i->ignore_blacklisted || i->ignore_not_dirty; -} - -static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, - struct jset_entry *entry, unsigned type) -{ - while (entry < vstruct_last(jset)) { - if (entry->type == type) - return entry; - - entry = vstruct_next(entry); - } - - return NULL; -} - -#define for_each_jset_entry_type(entry, jset, type) \ - for (struct jset_entry *entry = (jset)->start; \ - (entry = __jset_entry_type_next(jset, entry, type)); \ - entry = vstruct_next(entry)) - -#define jset_entry_for_each_key(_e, _k) \ - for (struct bkey_i *_k = (_e)->start; \ - _k < vstruct_last(_e); \ - _k = bkey_next(_k)) - -#define for_each_jset_key(k, entry, jset) \ - for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\ - jset_entry_for_each_key(entry, k) - -int bch2_journal_entry_validate(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, - struct bkey_validate_context); -void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, - struct jset_entry *); - -void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, - struct journal_replay *); - -int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); - -CLOSURE_CALLBACK(bch2_journal_write); - -static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -{ - struct jset_entry *entry = *end; - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - - memset(entry, 0, u64s * sizeof(u64)); - /* - * The u64s field counts from the start of data, 
ignoring the shared - * fields. - */ - entry->u64s = cpu_to_le16(u64s - 1); - - *end = vstruct_next(*end); - return entry; -} - -#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c deleted file mode 100644 index 0042d43b8e57..000000000000 --- a/fs/bcachefs/journal_reclaim.c +++ /dev/null @@ -1,1037 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "errcode.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "replicas.h" -#include "sb-members.h" -#include "trace.h" - -#include <linux/kthread.h> -#include <linux/sched/mm.h> - -static bool __should_discard_bucket(struct journal *, struct journal_device *); - -/* Free space calculations: */ - -static unsigned journal_space_from(struct journal_device *ja, - enum journal_space_from from) -{ - switch (from) { - case journal_space_discarded: - return ja->discard_idx; - case journal_space_clean_ondisk: - return ja->dirty_idx_ondisk; - case journal_space_clean: - return ja->dirty_idx; - default: - BUG(); - } -} - -unsigned bch2_journal_dev_buckets_available(struct journal *j, - struct journal_device *ja, - enum journal_space_from from) -{ - if (!ja->nr) - return 0; - - unsigned available = (journal_space_from(ja, from) - - ja->cur_idx - 1 + ja->nr) % ja->nr; - - /* - * Don't use the last bucket unless writing the new last_seq - * will make another bucket available: - */ - if (available && ja->dirty_idx_ondisk == ja->dirty_idx) - --available; - - return available; -} - -void bch2_journal_set_watermark(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - bool low_on_space = j->space[journal_space_clean].total * 4 <= - j->space[journal_space_total].total; - bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4; - bool low_on_wb = bch2_btree_write_buffer_must_wait(c); - unsigned watermark = low_on_space || low_on_pin || low_on_wb - ? 
BCH_WATERMARK_reclaim - : BCH_WATERMARK_stripe; - - if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) || - track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) || - track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) - trace_and_count(c, journal_full, c); - - mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin); - - swap(watermark, j->watermark); - if (watermark > j->watermark) - journal_wake(j); -} - -static struct journal_space -journal_dev_space_available(struct journal *j, struct bch_dev *ca, - enum journal_space_from from) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_device *ja = &ca->journal; - unsigned sectors, buckets, unwritten; - unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c)); - u64 seq; - - if (from == journal_space_total) - return (struct journal_space) { - .next_entry = bucket_size_aligned, - .total = bucket_size_aligned * ja->nr, - }; - - buckets = bch2_journal_dev_buckets_available(j, ja, from); - sectors = round_down(ja->sectors_free, block_sectors(c)); - - /* - * Note that we don't allocate the space for a journal entry - * until we write it out - thus, account for it here: - */ - for (seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j); - seq++) { - unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; - - if (!unwritten) - continue; - - /* entry won't fit on this device, skip: */ - if (unwritten > bucket_size_aligned) - continue; - - if (unwritten >= sectors) { - if (!buckets) { - sectors = 0; - break; - } - - buckets--; - sectors = bucket_size_aligned; - } - - sectors -= unwritten; - } - - if (sectors < ca->mi.bucket_size && buckets) { - buckets--; - sectors = bucket_size_aligned; - } - - return (struct journal_space) { - .next_entry = sectors, - .total = sectors + buckets * bucket_size_aligned, - }; -} - -static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want, - enum journal_space_from from) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned pos, nr_devs = 0; - struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; - unsigned min_bucket_size = U32_MAX; - - BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); - - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - if (!ca->journal.nr || - !ca->mi.durability) - continue; - - min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); - - space = journal_dev_space_available(j, ca, from); - if (!space.next_entry) - continue; - - for (pos = 0; pos < nr_devs; pos++) - if (space.total > dev_space[pos].total) - break; - - array_insert_item(dev_space, nr_devs, pos, space); - } - - if (nr_devs < nr_devs_want) - return (struct journal_space) { 0, 0 }; - - /* - * It's possible for bucket size to be misaligned w.r.t. 
the filesystem - * block size: - */ - min_bucket_size = round_down(min_bucket_size, block_sectors(c)); - - /* - * We sorted largest to smallest, and we want the smallest out of the - * @nr_devs_want largest devices: - */ - space = dev_space[nr_devs_want - 1]; - space.next_entry = min(space.next_entry, min_bucket_size); - return space; -} - -void bch2_journal_space_available(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned clean, clean_ondisk, total; - unsigned max_entry_size = min(j->buf[0].buf_size >> 9, - j->buf[1].buf_size >> 9); - unsigned nr_online = 0, nr_devs_want; - bool can_discard = false; - int ret = 0; - - lockdep_assert_held(&j->lock); - guard(rcu)(); - - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - struct journal_device *ja = &ca->journal; - - if (!ja->nr) - continue; - - while (ja->dirty_idx != ja->cur_idx && - ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) - ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; - - while (ja->dirty_idx_ondisk != ja->dirty_idx && - ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) - ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; - - can_discard |= __should_discard_bucket(j, ja); - - max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); - nr_online++; - } - - j->can_discard = can_discard; - - if (nr_online < metadata_replicas_required(c)) { - if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" - "rw journal devs:", nr_online, metadata_replicas_required(c)); - - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) - prt_printf(&buf, " %s", ca->name); - - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - } - ret = bch_err_throw(c, insufficient_journal_devices); - goto out; - } - - nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); - - for (unsigned i = 0; i < journal_space_nr; i++) - j->space[i] = __journal_space_available(j, nr_devs_want, i); - - clean_ondisk = j->space[journal_space_clean_ondisk].total; - clean = j->space[journal_space_clean].total; - total = j->space[journal_space_total].total; - - if (!j->space[journal_space_discarded].next_entry) - ret = bch_err_throw(c, journal_full); - - if ((j->space[journal_space_clean_ondisk].next_entry < - j->space[journal_space_clean_ondisk].total) && - (clean - clean_ondisk <= total / 8) && - (clean_ondisk * 2 > clean)) - set_bit(JOURNAL_may_skip_flush, &j->flags); - else - clear_bit(JOURNAL_may_skip_flush, &j->flags); - - bch2_journal_set_watermark(j); -out: - j->cur_entry_sectors = !ret - ? 
j->space[journal_space_discarded].next_entry - : 0; - j->cur_entry_error = ret; - - if (!ret) - journal_wake(j); -} - -/* Discards - last part of journal reclaim: */ - -static bool __should_discard_bucket(struct journal *j, struct journal_device *ja) -{ - unsigned min_free = max(4, ja->nr / 8); - - return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < - min_free && - ja->discard_idx != ja->dirty_idx_ondisk; -} - -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) -{ - spin_lock(&j->lock); - bool ret = __should_discard_bucket(j, ja); - spin_unlock(&j->lock); - - return ret; -} - -/* - * Advance ja->discard_idx as long as it points to buckets that are no longer - * dirty, issuing discards if necessary: - */ -void bch2_journal_do_discards(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - mutex_lock(&j->discard_lock); - - for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) { - struct journal_device *ja = &ca->journal; - - while (should_discard_bucket(j, ja)) { - if (!c->opts.nochanges && - bch2_discard_opt_enabled(c, ca) && - bdev_max_discard_sectors(ca->disk_sb.bdev)) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, - ja->buckets[ja->discard_idx]), - ca->mi.bucket_size, GFP_NOFS); - - spin_lock(&j->lock); - ja->discard_idx = (ja->discard_idx + 1) % ja->nr; - - bch2_journal_space_available(j); - spin_unlock(&j->lock); - } - } - - mutex_unlock(&j->discard_lock); -} - -/* - * Journal entry pinning - machinery for holding a reference on a given journal - * entry, holding it open to ensure it gets replayed during recovery: - */ - -void bch2_journal_reclaim_fast(struct journal *j) -{ - bool popped = false; - - lockdep_assert_held(&j->lock); - - /* - * Unpin journal entries whose reference counts reached zero, meaning - * all btree nodes got written out - */ - while (!fifo_empty(&j->pin) && - j->pin.front <= j->seq_ondisk && - !atomic_read(&fifo_peek_front(&j->pin).count)) { - j->pin.front++; - popped = true; - } - - if (popped) { - bch2_journal_space_available(j); - __closure_wake_up(&j->reclaim_flush_wait); - } -} - -bool __bch2_journal_pin_put(struct journal *j, u64 seq) -{ - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - - return atomic_dec_and_test(&pin_list->count); -} - -void bch2_journal_pin_put(struct journal *j, u64 seq) -{ - if (__bch2_journal_pin_put(j, seq)) { - spin_lock(&j->lock); - bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); - } -} - -static inline bool __journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) -{ - struct journal_entry_pin_list *pin_list; - - if (!journal_pin_active(pin)) - return false; - - if (j->flush_in_progress == pin) - j->flush_in_progress_dropped = true; - - pin_list = journal_seq_pin(j, pin->seq); - pin->seq = 0; - list_del_init(&pin->list); - - if (j->reclaim_flush_wait.list.first) - __closure_wake_up(&j->reclaim_flush_wait); - - /* - * Unpinning a journal entry may make journal_next_bucket() succeed, if - * writing a new last_seq will now make another bucket available: - */ - return atomic_dec_and_test(&pin_list->count) && - pin_list == &fifo_peek_front(&j->pin); -} - -void bch2_journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) -{ - spin_lock(&j->lock); - if (__journal_pin_drop(j, pin)) - bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); -} - -static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, - journal_pin_flush_fn fn) -{ - if (fn == 
bch2_btree_node_flush0 || - fn == bch2_btree_node_flush1) { - unsigned idx = fn == bch2_btree_node_flush1; - struct btree *b = container_of(pin, struct btree, writes[idx].journal); - - return JOURNAL_PIN_TYPE_btree0 - b->c.level; - } else if (fn == bch2_btree_key_cache_journal_flush) - return JOURNAL_PIN_TYPE_key_cache; - else - return JOURNAL_PIN_TYPE_other; -} - -static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn, - enum journal_pin_type type) -{ - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - - /* - * flush_fn is how we identify journal pins in debugfs, so must always - * exist, even if it doesn't do anything: - */ - BUG_ON(!flush_fn); - - atomic_inc(&pin_list->count); - pin->seq = seq; - pin->flush = flush_fn; - - if (list_empty(&pin_list->unflushed[type]) && - j->reclaim_flush_wait.list.first) - __closure_wake_up(&j->reclaim_flush_wait); - - list_add(&pin->list, &pin_list->unflushed[type]); -} - -void bch2_journal_pin_copy(struct journal *j, - struct journal_entry_pin *dst, - struct journal_entry_pin *src, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - - u64 seq = READ_ONCE(src->seq); - - if (seq < journal_last_seq(j)) { - /* - * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on - * the src pin - with the pin dropped, the entry to pin might no - * longer exist, but that means there's no longer anything to - * copy and we can bail out here: - */ - spin_unlock(&j->lock); - return; - } - - bool reclaim = __journal_pin_drop(j, dst); - - bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn)); - - if (reclaim) - bch2_journal_reclaim_fast(j); - - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - if (seq == journal_last_seq(j)) - journal_wake(j); - spin_unlock(&j->lock); -} - -void bch2_journal_pin_set(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - - BUG_ON(seq < journal_last_seq(j)); - - bool reclaim = __journal_pin_drop(j, pin); - - bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); - - if (reclaim) - bch2_journal_reclaim_fast(j); - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - if (seq == journal_last_seq(j)) - journal_wake(j); - - spin_unlock(&j->lock); -} - -/** - * bch2_journal_pin_flush: ensure journal pin callback is no longer running - * @j: journal object - * @pin: pin to flush - */ -void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) -{ - BUG_ON(journal_pin_active(pin)); - - wait_event(j->pin_flush_wait, j->flush_in_progress != pin); -} - -/* - * Journal reclaim: flush references to open journal entries to reclaim space in - * the journal - * - * May be done by the journal code in the background as needed to free up space - * for more journal entries, or as part of doing a clean shutdown, or to migrate - * data off of a specific device: - */ - -static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, - u64 seq_to_flush, - unsigned allowed_below_seq, - unsigned allowed_above_seq, - u64 *seq) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *ret = NULL; - - fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { - if (*seq > seq_to_flush && !allowed_above_seq) - break; - - for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) - if (((BIT(i) & 
allowed_below_seq) && *seq <= seq_to_flush) || - (BIT(i) & allowed_above_seq)) { - ret = list_first_entry_or_null(&pin_list->unflushed[i], - struct journal_entry_pin, list); - if (ret) - return ret; - } - } - - return NULL; -} - -/* returns true if we did work */ -static size_t journal_flush_pins(struct journal *j, - u64 seq_to_flush, - unsigned allowed_below_seq, - unsigned allowed_above_seq, - unsigned min_any, - unsigned min_key_cache) -{ - struct journal_entry_pin *pin; - size_t nr_flushed = 0; - journal_pin_flush_fn flush_fn; - u64 seq; - int err; - - lockdep_assert_held(&j->reclaim_lock); - - while (1) { - unsigned allowed_above = allowed_above_seq; - unsigned allowed_below = allowed_below_seq; - - if (min_any) { - allowed_above |= ~0; - allowed_below |= ~0; - } - - if (min_key_cache) { - allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache); - allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache); - } - - cond_resched(); - - j->last_flushed = jiffies; - - spin_lock(&j->lock); - pin = journal_get_next_pin(j, seq_to_flush, - allowed_below, - allowed_above, &seq); - if (pin) { - BUG_ON(j->flush_in_progress); - j->flush_in_progress = pin; - j->flush_in_progress_dropped = false; - flush_fn = pin->flush; - } - spin_unlock(&j->lock); - - if (!pin) - break; - - if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush) - min_key_cache--; - - if (min_any) - min_any--; - - err = flush_fn(j, pin, seq); - - spin_lock(&j->lock); - /* Pin might have been dropped or rearmed: */ - if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); - j->flush_in_progress = NULL; - j->flush_in_progress_dropped = false; - spin_unlock(&j->lock); - - wake_up(&j->pin_flush_wait); - - if (err) - break; - - nr_flushed++; - } - - return nr_flushed; -} - -static u64 journal_seq_to_flush(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - u64 seq_to_flush = 0; - - guard(spinlock)(&j->lock); - guard(rcu)(); - - for_each_rw_member_rcu(c, ca) { - struct journal_device *ja = &ca->journal; - unsigned nr_buckets, bucket_to_flush; - - if (!ja->nr) - continue; - - /* Try to keep the journal at most half full: */ - nr_buckets = ja->nr / 2; - - bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; - seq_to_flush = max(seq_to_flush, - ja->bucket_seq[bucket_to_flush]); - } - - /* Also flush if the pin fifo is more than half full */ - return max_t(s64, seq_to_flush, - (s64) journal_cur_seq(j) - - (j->pin.size >> 1)); -} - -/** - * __bch2_journal_reclaim - free up journal buckets - * @j: journal object - * @direct: direct or background reclaim? - * @kicked: requested to run since we last ran? - * - * Background journal reclaim writes out btree nodes. It should be run - * early enough so that we never completely run out of journal buckets. - * - * High watermarks for triggering background reclaim: - * - FIFO has fewer than 512 entries left - * - fewer than 25% journal buckets free - * - * Background reclaim runs until low watermarks are reached: - * - FIFO has more than 1024 entries left - * - more than 50% journal buckets free - * - * As long as a reclaim can complete in the time it takes to fill up - * 512 journal entries or 25% of all journal buckets, then - * journal_next_bucket() should not stall. 
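The watermark scheme in the comment above amounts to simple hysteresis: reclaim starts at the high watermarks and, once running, keeps going until the low watermarks are reached. A sketch under the stated thresholds; the helper and its parameters are illustrative only, not part of the original code:

static bool journal_reclaim_should_run(u64 fifo_entries_free,
				       u64 buckets_free, u64 buckets_total,
				       bool already_running)
{
	if (!already_running)
		/* high watermarks: fewer than 512 FIFO entries or <25% buckets free */
		return fifo_entries_free < 512 ||
		       buckets_free * 4 < buckets_total;

	/* low watermarks: keep running until >1024 entries and >50% buckets free */
	return fifo_entries_free < 1024 ||
	       buckets_free * 2 < buckets_total;
}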
- */ -static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct btree_cache *bc = &c->btree_cache; - bool kthread = (current->flags & PF_KTHREAD) != 0; - u64 seq_to_flush; - size_t min_nr, min_key_cache, nr_flushed; - unsigned flags; - int ret = 0; - - /* - * We can't invoke memory reclaim while holding the reclaim_lock - - * journal reclaim is required to make progress for memory reclaim - * (cleaning the caches), so we can't get stuck in memory reclaim while - * we're holding the reclaim lock: - */ - lockdep_assert_held(&j->reclaim_lock); - flags = memalloc_noreclaim_save(); - - do { - if (kthread && kthread_should_stop()) - break; - - ret = bch2_journal_error(j); - if (ret) - break; - - /* XXX shove journal discards off to another thread */ - bch2_journal_do_discards(j); - - seq_to_flush = journal_seq_to_flush(j); - min_nr = 0; - - /* - * If it's been longer than j->reclaim_delay_ms since we last flushed, - * make sure to flush at least one journal pin: - */ - if (time_after(jiffies, j->last_flushed + - msecs_to_jiffies(c->opts.journal_reclaim_delay))) - min_nr = 1; - - if (j->watermark != BCH_WATERMARK_stripe) - min_nr = 1; - - size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr; - if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live) - min_nr = 1; - - min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); - - trace_and_count(c, journal_reclaim_start, c, - direct, kicked, - min_nr, min_key_cache, - atomic_long_read(&bc->nr_dirty), btree_cache_live, - atomic_long_read(&c->btree_key_cache.nr_dirty), - atomic_long_read(&c->btree_key_cache.nr_keys)); - - nr_flushed = journal_flush_pins(j, seq_to_flush, - ~0, 0, - min_nr, min_key_cache); - - if (direct) - j->nr_direct_reclaim += nr_flushed; - else - j->nr_background_reclaim += nr_flushed; - trace_and_count(c, journal_reclaim_finish, c, nr_flushed); - - if (nr_flushed) - wake_up(&j->reclaim_wait); - } while ((min_nr || min_key_cache) && nr_flushed && !direct); - - memalloc_noreclaim_restore(flags); - - return ret; -} - -int bch2_journal_reclaim(struct journal *j) -{ - return __bch2_journal_reclaim(j, true, true); -} - -static int bch2_journal_reclaim_thread(void *arg) -{ - struct journal *j = arg; - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned long delay, now; - bool journal_empty; - int ret = 0; - - set_freezable(); - - j->last_flushed = jiffies; - - while (!ret && !kthread_should_stop()) { - bool kicked = j->reclaim_kicked; - - j->reclaim_kicked = false; - - mutex_lock(&j->reclaim_lock); - ret = __bch2_journal_reclaim(j, false, kicked); - mutex_unlock(&j->reclaim_lock); - - now = jiffies; - delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); - j->next_reclaim = j->last_flushed + delay; - - if (!time_in_range(j->next_reclaim, now, now + delay)) - j->next_reclaim = now + delay; - - while (1) { - set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); - if (kthread_should_stop()) - break; - if (j->reclaim_kicked) - break; - - spin_lock(&j->lock); - journal_empty = fifo_empty(&j->pin); - spin_unlock(&j->lock); - - long timeout = j->next_reclaim - jiffies; - - if (journal_empty) - schedule(); - else if (timeout > 0) - schedule_timeout(timeout); - else - break; - } - __set_current_state(TASK_RUNNING); - } - - return 0; -} - -void bch2_journal_reclaim_stop(struct journal *j) -{ - struct task_struct *p = j->reclaim_thread; - - j->reclaim_thread = NULL; - - if (p) { - kthread_stop(p); - 
put_task_struct(p); - } -} - -int bch2_journal_reclaim_start(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct task_struct *p; - int ret; - - if (j->reclaim_thread) - return 0; - - p = kthread_create(bch2_journal_reclaim_thread, j, - "bch-reclaim/%s", c->name); - ret = PTR_ERR_OR_ZERO(p); - bch_err_msg(c, ret, "creating journal reclaim thread"); - if (ret) - return ret; - - get_task_struct(p); - j->reclaim_thread = p; - wake_up_process(p); - return 0; -} - -static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, - unsigned types) -{ - struct journal_entry_pin_list *pin_list; - u64 seq; - - spin_lock(&j->lock); - fifo_for_each_entry_ptr(pin_list, &j->pin, seq) { - if (seq > seq_to_flush) - break; - - for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) - if ((BIT(i) & types) && - (!list_empty(&pin_list->unflushed[i]) || - !list_empty(&pin_list->flushed[i]))) { - spin_unlock(&j->lock); - return true; - } - } - spin_unlock(&j->lock); - - return false; -} - -static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush, - unsigned types) -{ - return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) || - journal_pins_still_flushing(j, seq_to_flush, types); -} - -static int journal_flush_done(struct journal *j, u64 seq_to_flush, - bool *did_work) -{ - int ret = 0; - - ret = bch2_journal_error(j); - if (ret) - return ret; - - mutex_lock(&j->reclaim_lock); - - for (int type = JOURNAL_PIN_TYPE_NR - 1; - type >= 0; - --type) - if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) { - *did_work = true; - goto unlock; - } - - if (seq_to_flush > journal_cur_seq(j)) - bch2_journal_entry_close(j); - - spin_lock(&j->lock); - /* - * If journal replay hasn't completed, the unreplayed journal entries - * hold refs on their corresponding sequence numbers - */ - ret = !test_bit(JOURNAL_replay_done, &j->flags) || - journal_last_seq(j) > seq_to_flush || - !fifo_used(&j->pin); - - spin_unlock(&j->lock); -unlock: - mutex_unlock(&j->reclaim_lock); - - return ret; -} - -bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) -{ - /* time_stats this */ - bool did_work = false; - - if (!test_bit(JOURNAL_running, &j->flags)) - return false; - - closure_wait_event(&j->reclaim_flush_wait, - journal_flush_done(j, seq_to_flush, &did_work)); - - return did_work; -} - -int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_entry_pin_list *p; - u64 iter, seq = 0; - int ret = 0; - - spin_lock(&j->lock); - fifo_for_each_entry_ptr(p, &j->pin, iter) - if (dev_idx >= 0 - ? bch2_dev_list_has_dev(p->devs, dev_idx) - : p->devs.nr < c->opts.metadata_replicas) - seq = iter; - spin_unlock(&j->lock); - - bch2_journal_flush_pins(j, seq); - - ret = bch2_journal_error(j); - if (ret) - return ret; - - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); - - /* - * Now that we've populated replicas_gc, write to the journal to mark - * active journal devices. This handles the case where the journal might - * be empty. Otherwise we could clear all journal replicas and - * temporarily put the fs into an unrecoverable state. Journal recovery - * expects to find devices marked for journal data on unclean mount. 
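The pin FIFO walked above is a power-of-two ring indexed directly by journal sequence number, which is what lets journal_seq_pin() map a sequence to its slot with a single mask. A standalone sketch of the same idea; the struct and helper names here are illustrative:

struct pin_ring {
	u64 front, back;			/* live window: [front, back) */
	u64 mask;				/* size - 1; size is a power of two */
	struct journal_entry_pin_list *data;
};

static struct journal_entry_pin_list *pin_slot(struct pin_ring *r, u64 seq)
{
	/* only sequences inside the live window have a slot: */
	BUG_ON(seq < r->front || seq >= r->back);
	return &r->data[seq & r->mask];
}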
- */ - ret = bch2_journal_meta(&c->journal); - if (ret) - goto err; - - seq = 0; - spin_lock(&j->lock); - while (!ret) { - union bch_replicas_padded replicas; - - seq = max(seq, journal_last_seq(j)); - if (seq >= j->pin.back) - break; - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, - journal_seq_pin(j, seq)->devs); - seq++; - - if (replicas.e.nr_devs) { - spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, &replicas.e); - spin_lock(&j->lock); - } - } - spin_unlock(&j->lock); -err: - ret = bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); - - return ret; -} - -bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *pin; - - spin_lock(&j->lock); - if (!test_bit(JOURNAL_running, &j->flags)) { - spin_unlock(&j->lock); - return true; - } - - *seq = max(*seq, j->pin.front); - - if (*seq >= j->pin.back) { - spin_unlock(&j->lock); - return true; - } - - out->atomic++; - - pin_list = journal_seq_pin(j, *seq); - - prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); - printbuf_indent_add(out, 2); - - prt_printf(out, "unflushed:\n"); - for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++) - list_for_each_entry(pin, &pin_list->unflushed[i], list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - prt_printf(out, "flushed:\n"); - for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++) - list_for_each_entry(pin, &pin_list->flushed[i], list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - printbuf_indent_sub(out, 2); - - --out->atomic; - spin_unlock(&j->lock); - - return false; -} - -void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) -{ - u64 seq = 0; - - while (!bch2_journal_seq_pins_to_text(out, j, &seq)) - seq++; -} diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h deleted file mode 100644 index 0a73d7134e1c..000000000000 --- a/fs/bcachefs/journal_reclaim.h +++ /dev/null @@ -1,84 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_RECLAIM_H -#define _BCACHEFS_JOURNAL_RECLAIM_H - -#define JOURNAL_PIN (32 * 1024) - -static inline void journal_reclaim_kick(struct journal *j) -{ - struct task_struct *p = READ_ONCE(j->reclaim_thread); - - j->reclaim_kicked = true; - if (p) - wake_up_process(p); -} - -unsigned bch2_journal_dev_buckets_available(struct journal *, - struct journal_device *, - enum journal_space_from); -void bch2_journal_set_watermark(struct journal *); -void bch2_journal_space_available(struct journal *); - -static inline bool journal_pin_active(struct journal_entry_pin *pin) -{ - return pin->seq != 0; -} - -static inline struct journal_entry_pin_list * -journal_seq_pin(struct journal *j, u64 seq) -{ - EBUG_ON(seq < j->pin.front || seq >= j->pin.back); - - return &j->pin.data[seq & j->pin.mask]; -} - -void bch2_journal_reclaim_fast(struct journal *); -bool __bch2_journal_pin_put(struct journal *, u64); -void bch2_journal_pin_put(struct journal *, u64); -void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); - -void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *, - journal_pin_flush_fn); - -static inline void bch2_journal_pin_add(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) - bch2_journal_pin_set(j, seq, pin, flush_fn); -} - -void bch2_journal_pin_copy(struct journal *, - struct journal_entry_pin 
*, - struct journal_entry_pin *, - journal_pin_flush_fn); - -static inline void bch2_journal_pin_update(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - if (unlikely(!journal_pin_active(pin) || pin->seq < seq)) - bch2_journal_pin_set(j, seq, pin, flush_fn); -} - -void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); - -void bch2_journal_do_discards(struct journal *); -int bch2_journal_reclaim(struct journal *); - -void bch2_journal_reclaim_stop(struct journal *); -int bch2_journal_reclaim_start(struct journal *); - -bool bch2_journal_flush_pins(struct journal *, u64); - -static inline bool bch2_journal_flush_all_pins(struct journal *j) -{ - return bch2_journal_flush_pins(j, U64_MAX); -} - -int bch2_journal_flush_device_pins(struct journal *, int); - -void bch2_journal_pins_to_text(struct printbuf *, struct journal *); -bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); - -#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c deleted file mode 100644 index 0cb9b93f13e7..000000000000 --- a/fs/bcachefs/journal_sb.c +++ /dev/null @@ -1,232 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "journal_sb.h" -#include "darray.h" - -#include <linux/sort.h> - -/* BCH_SB_FIELD_journal: */ - -static int u64_cmp(const void *_l, const void *_r) -{ - const u64 *l = _l; - const u64 *r = _r; - - return cmp_int(*l, *r); -} - -static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_journal *journal = field_to_type(f, journal); - struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); - int ret = -BCH_ERR_invalid_sb_journal; - unsigned nr; - unsigned i; - u64 *b; - - nr = bch2_nr_journal_buckets(journal); - if (!nr) - return 0; - - b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL); - if (!b) - return -BCH_ERR_ENOMEM_sb_journal_validate; - - for (i = 0; i < nr; i++) - b[i] = le64_to_cpu(journal->buckets[i]); - - sort(b, nr, sizeof(u64), u64_cmp, NULL); - - if (!b[0]) { - prt_printf(err, "journal bucket at sector 0"); - goto err; - } - - if (b[0] < le16_to_cpu(m.first_bucket)) { - prt_printf(err, "journal bucket %llu before first bucket %u", - b[0], le16_to_cpu(m.first_bucket)); - goto err; - } - - if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) { - prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1], le64_to_cpu(m.nbuckets)); - goto err; - } - - for (i = 0; i + 1 < nr; i++) - if (b[i] == b[i + 1]) { - prt_printf(err, "duplicate journal buckets %llu", b[i]); - goto err; - } - - ret = 0; -err: - kfree(b); - return ret; -} - -static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_journal *journal = field_to_type(f, journal); - unsigned i, nr = bch2_nr_journal_buckets(journal); - - prt_printf(out, "Buckets: "); - for (i = 0; i < nr; i++) - prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i])); - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_journal = { - .validate = bch2_sb_journal_validate, - .to_text = bch2_sb_journal_to_text, -}; - -struct u64_range { - u64 start; - u64 end; -}; - -static int u64_range_cmp(const void *_l, const void *_r) -{ - const struct u64_range *l = _l; - const struct u64_range *r = _r; - - return cmp_int(l->start, r->start); -} - -static int bch2_sb_journal_v2_validate(struct bch_sb 
*sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); - struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); - int ret = -BCH_ERR_invalid_sb_journal; - u64 sum = 0; - unsigned nr; - unsigned i; - struct u64_range *b; - - nr = bch2_sb_field_journal_v2_nr_entries(journal); - if (!nr) - return 0; - - b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL); - if (!b) - return -BCH_ERR_ENOMEM_sb_journal_v2_validate; - - for (i = 0; i < nr; i++) { - b[i].start = le64_to_cpu(journal->d[i].start); - b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); - - if (b[i].end <= b[i].start) { - prt_printf(err, "journal buckets entry with bad nr: %llu+%llu", - le64_to_cpu(journal->d[i].start), - le64_to_cpu(journal->d[i].nr)); - goto err; - } - - sum += le64_to_cpu(journal->d[i].nr); - } - - sort(b, nr, sizeof(*b), u64_range_cmp, NULL); - - if (!b[0].start) { - prt_printf(err, "journal bucket at sector 0"); - goto err; - } - - if (b[0].start < le16_to_cpu(m.first_bucket)) { - prt_printf(err, "journal bucket %llu before first bucket %u", - b[0].start, le16_to_cpu(m.first_bucket)); - goto err; - } - - if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) { - prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1].end - 1, le64_to_cpu(m.nbuckets)); - goto err; - } - - for (i = 0; i + 1 < nr; i++) { - if (b[i].end > b[i + 1].start) { - prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", - b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); - goto err; - } - } - - if (sum > UINT_MAX) { - prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX); - goto err; - } - - ret = 0; -err: - kfree(b); - return ret; -} - -static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); - unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); - - prt_printf(out, "Buckets: "); - for (i = 0; i < nr; i++) - prt_printf(out, " %llu-%llu", - le64_to_cpu(journal->d[i].start), - le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { - .validate = bch2_sb_journal_v2_validate, - .to_text = bch2_sb_journal_v2_to_text, -}; - -int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, - u64 *buckets, unsigned nr) -{ - struct bch_sb_field_journal_v2 *j; - unsigned i, dst = 0, nr_compacted = 1; - - if (c) - lockdep_assert_held(&c->sb_lock); - - if (!nr) { - bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); - bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); - return 0; - } - - for (i = 0; i + 1 < nr; i++) - if (buckets[i] + 1 != buckets[i + 1]) - nr_compacted++; - - j = bch2_sb_field_resize(&ca->disk_sb, journal_v2, - (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); - if (!j) - return bch_err_throw(c, ENOSPC_sb_journal); - - bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); - - j->d[dst].start = cpu_to_le64(buckets[0]); - j->d[dst].nr = cpu_to_le64(1); - - for (i = 1; i < nr; i++) { - if (buckets[i] == buckets[i - 1] + 1) { - le64_add_cpu(&j->d[dst].nr, 1); - } else { - dst++; - j->d[dst].start = cpu_to_le64(buckets[i]); - j->d[dst].nr = cpu_to_le64(1); - } - } - - BUG_ON(dst + 1 != nr_compacted); - return 0; -} diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h deleted file mode 100644 index 
ba40a7e8d90a..000000000000 --- a/fs/bcachefs/journal_sb.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#include "super-io.h" -#include "vstructs.h" - -static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) -{ - return j - ? (__le64 *) vstruct_end(&j->field) - j->buckets - : 0; -} - -static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) -{ - if (!j) - return 0; - - return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; -} - -extern const struct bch_sb_field_ops bch_sb_field_ops_journal; -extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; - -int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c deleted file mode 100644 index af4fe416d9ec..000000000000 --- a/fs/bcachefs/journal_seq_blacklist.c +++ /dev/null @@ -1,264 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "eytzinger.h" -#include "journal.h" -#include "journal_seq_blacklist.h" -#include "super-io.h" - -/* - * journal_seq_blacklist machinery: - * - * To guarantee order of btree updates after a crash, we need to detect when a - * btree node entry (bset) is newer than the newest journal entry that was - * successfully written, and ignore it - effectively ignoring any btree updates - * that didn't make it into the journal. - * - * If we didn't do this, we might have two btree nodes, a and b, both with - * updates that weren't written to the journal yet: if b was updated after a, - * but b was flushed and not a - oops; on recovery we'll find that the updates - * to b happened, but not the updates to a that happened before it. - * - * Ignoring bsets that are newer than the newest journal entry is always safe, - * because everything they contain will also have been journalled - and must - * still be present in the journal on disk until a journal entry has been - * written _after_ that bset was written. - * - * To accomplish this, bsets record the newest journal sequence number they - * contain updates for; then, on startup, the btree code queries the journal - * code to ask "Is this sequence number newer than the newest journal entry? If - * so, ignore it." - * - * When this happens, we must blacklist that journal sequence number: the - * journal must not write any entries with that sequence number, and it must - * record that it was blacklisted so that a) on recovery we don't think we have - * missing journal entries and b) so that the btree code continues to ignore - * that bset, until that btree node is rewritten. 
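
The range-coalescing step this comment leads into is implemented just below in the removed bch2_journal_seq_blacklist_add(). As a minimal, self-contained sketch of the same idea, assuming a plain seq_range array in place of the superblock field (the type, the fixed capacity, and main() are illustrative, not bcachefs API):

```c
/*
 * Sketch only: coalesce a new blacklisted range [start, end] into a
 * sorted array of ranges, the way the removed
 * bch2_journal_seq_blacklist_add() merges entries in the superblock
 * field. The seq_range type and caller-sized array are assumptions.
 */
#include <stdio.h>
#include <string.h>

struct seq_range { unsigned long long start, end; };

static unsigned blacklist_add(struct seq_range *r, unsigned nr,
			      unsigned long long start,
			      unsigned long long end)
{
	unsigned i = 0;

	while (i < nr) {
		if (end < r[i].start)
			break;		/* strictly before entry i: insert here */
		if (start > r[i].end) {
			i++;		/* strictly after entry i: keep scanning */
			continue;
		}
		/* contiguous or overlapping: absorb entry i and delete it */
		if (r[i].start < start)
			start = r[i].start;
		if (r[i].end > end)
			end = r[i].end;
		memmove(&r[i], &r[i + 1], (--nr - i) * sizeof(*r));
	}

	memmove(&r[i + 1], &r[i], (nr - i) * sizeof(*r));
	r[i] = (struct seq_range) { start, end };
	return nr + 1;
}

int main(void)
{
	struct seq_range r[8] = { { 10, 20 }, { 30, 40 } };
	unsigned nr = blacklist_add(r, 2, 18, 32);	/* bridges both ranges */

	for (unsigned i = 0; i < nr; i++)
		printf("%llu-%llu\n", r[i].start, r[i].end);	/* prints 10-40 */
	return 0;
}
```

Merging on contiguity as well as overlap keeps the list canonical, which matches the strict-ordering checks in the removed validate hook further down in this file.
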
- */ - -static unsigned sb_blacklist_u64s(unsigned nr) -{ - struct bch_sb_field_journal_seq_blacklist *bl; - - return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); -} - -int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) -{ - struct bch_sb_field_journal_seq_blacklist *bl; - unsigned i = 0, nr; - int ret = 0; - - mutex_lock(&c->sb_lock); - bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); - nr = blacklist_nr_entries(bl); - - while (i < nr) { - struct journal_seq_blacklist_entry *e = - bl->start + i; - - if (end < le64_to_cpu(e->start)) - break; - - if (start > le64_to_cpu(e->end)) { - i++; - continue; - } - - /* - * Entry is contiguous or overlapping with new entry: merge it - * with new entry, and delete: - */ - - start = min(start, le64_to_cpu(e->start)); - end = max(end, le64_to_cpu(e->end)); - array_remove_item(bl->start, nr, i); - } - - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - sb_blacklist_u64s(nr + 1)); - if (!bl) { - ret = bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist); - goto out; - } - - array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) { - .start = cpu_to_le64(start), - .end = cpu_to_le64(end), - })); - c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); - - ret = bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); - - return ret ?: bch2_blacklist_table_initialize(c); -} - -static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) -{ - const struct journal_seq_blacklist_table_entry *l = _l; - const struct journal_seq_blacklist_table_entry *r = _r; - - return cmp_int(l->start, r->start); -} - -bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, - bool dirty) -{ - struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; - struct journal_seq_blacklist_table_entry search = { .start = seq }; - int idx; - - if (!t) - return false; - - idx = eytzinger0_find_le(t->entries, t->nr, - sizeof(t->entries[0]), - journal_seq_blacklist_table_cmp, - &search); - if (idx < 0) - return false; - - BUG_ON(t->entries[idx].start > seq); - - if (seq >= t->entries[idx].end) - return false; - - if (dirty) - t->entries[idx].dirty = true; - return true; -} - -u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c) -{ - struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; - - if (!t || !t->nr) - return 0; - - return t->entries[eytzinger0_last(t->nr)].end - 1; -} - -int bch2_blacklist_table_initialize(struct bch_fs *c) -{ - struct bch_sb_field_journal_seq_blacklist *bl = - bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); - struct journal_seq_blacklist_table *t; - unsigned i, nr = blacklist_nr_entries(bl); - - if (!bl) - return 0; - - t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); - if (!t) - return bch_err_throw(c, ENOMEM_blacklist_table_init); - - t->nr = nr; - - for (i = 0; i < nr; i++) { - t->entries[i].start = le64_to_cpu(bl->start[i].start); - t->entries[i].end = le64_to_cpu(bl->start[i].end); - } - - eytzinger0_sort(t->entries, - t->nr, - sizeof(t->entries[0]), - journal_seq_blacklist_table_cmp, - NULL); - - kfree(c->journal_seq_blacklist_table); - c->journal_seq_blacklist_table = t; - return 0; -} - -static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_journal_seq_blacklist *bl = - field_to_type(f, journal_seq_blacklist); - unsigned i, nr = blacklist_nr_entries(bl); - - for (i 
= 0; i < nr; i++) { - struct journal_seq_blacklist_entry *e = bl->start + i; - - if (le64_to_cpu(e->start) >= - le64_to_cpu(e->end)) { - prt_printf(err, "entry %u start >= end (%llu >= %llu)", - i, le64_to_cpu(e->start), le64_to_cpu(e->end)); - return -BCH_ERR_invalid_sb_journal_seq_blacklist; - } - - if (i + 1 < nr && - le64_to_cpu(e[0].end) > - le64_to_cpu(e[1].start)) { - prt_printf(err, "entry %u out of order with next entry (%llu > %llu)", - i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); - return -BCH_ERR_invalid_sb_journal_seq_blacklist; - } - } - - return 0; -} - -static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_journal_seq_blacklist *bl = - field_to_type(f, journal_seq_blacklist); - struct journal_seq_blacklist_entry *i; - unsigned nr = blacklist_nr_entries(bl); - - for (i = bl->start; i < bl->start + nr; i++) { - if (i != bl->start) - prt_printf(out, " "); - - prt_printf(out, "%llu-%llu", - le64_to_cpu(i->start), - le64_to_cpu(i->end)); - } - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { - .validate = bch2_sb_journal_seq_blacklist_validate, - .to_text = bch2_sb_journal_seq_blacklist_to_text -}; - -bool bch2_blacklist_entries_gc(struct bch_fs *c) -{ - struct journal_seq_blacklist_entry *src, *dst; - - struct bch_sb_field_journal_seq_blacklist *bl = - bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); - if (!bl) - return false; - - unsigned nr = blacklist_nr_entries(bl); - dst = bl->start; - - struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; - BUG_ON(nr != t->nr); - - src = bl->start; - eytzinger0_for_each(i, nr) { - BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); - BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); - - if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) - *dst++ = *src; - src++; - } - - unsigned new_nr = dst - bl->start; - if (new_nr == nr) - return false; - - bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr); - - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - new_nr ? sb_blacklist_u64s(new_nr) : 0); - BUG_ON(new_nr && !bl); - return true; -} diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h deleted file mode 100644 index f06942ccfcdd..000000000000 --- a/fs/bcachefs/journal_seq_blacklist.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H - -static inline unsigned -blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) -{ - return bl - ? 
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / - sizeof(struct journal_seq_blacklist_entry)) - : 0; -} - -bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); -u64 bch2_journal_last_blacklisted_seq(struct bch_fs *); -int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); -int bch2_blacklist_table_initialize(struct bch_fs *); - -extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; - -bool bch2_blacklist_entries_gc(struct bch_fs *); - -#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h deleted file mode 100644 index 2566b12dbc04..000000000000 --- a/fs/bcachefs/journal_seq_blacklist_format.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H -#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H - -struct journal_seq_blacklist_entry { - __le64 start; - __le64 end; -}; - -struct bch_sb_field_journal_seq_blacklist { - struct bch_sb_field field; - struct journal_seq_blacklist_entry start[]; -}; - -#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h deleted file mode 100644 index 51104bbb99da..000000000000 --- a/fs/bcachefs/journal_types.h +++ /dev/null @@ -1,342 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_TYPES_H -#define _BCACHEFS_JOURNAL_TYPES_H - -#include <linux/cache.h> -#include <linux/workqueue.h> - -#include "alloc_types.h" -#include "super_types.h" -#include "fifo.h" - -/* btree write buffer steals 8 bits for its own purposes: */ -#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) - -#define JOURNAL_STATE_BUF_BITS 2 -#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS) -#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1) - -#define JOURNAL_BUF_BITS 4 -#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) -#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) - -/* - * We put JOURNAL_BUF_NR of these in struct journal; we used them for writes to - * the journal that are being staged or in flight. - */ -struct journal_buf { - struct closure io; - struct jset *data; - - __BKEY_PADDED(key, BCH_REPLICAS_MAX); - struct bch_devs_list devs_written; - - struct closure_waitlist wait; - u64 last_seq; /* copy of data->last_seq */ - long expires; - u64 flush_time; - - unsigned buf_size; /* size in bytes of @data */ - unsigned sectors; /* maximum size for current entry */ - unsigned disk_sectors; /* maximum size entry could have been, if - buf_size was bigger */ - unsigned u64s_reserved; - bool noflush:1; /* write has already been kicked off, and was noflush */ - bool must_flush:1; /* something wants a flush */ - bool separate_flush:1; - bool need_flush_to_write_buffer:1; - bool write_started:1; - bool write_allocated:1; - bool write_done:1; - u8 idx; -}; - -/* - * Something that makes a journal entry dirty - i.e. 
a btree node that has to be - * flushed: - */ - -enum journal_pin_type { - JOURNAL_PIN_TYPE_btree3, - JOURNAL_PIN_TYPE_btree2, - JOURNAL_PIN_TYPE_btree1, - JOURNAL_PIN_TYPE_btree0, - JOURNAL_PIN_TYPE_key_cache, - JOURNAL_PIN_TYPE_other, - JOURNAL_PIN_TYPE_NR, -}; - -struct journal_entry_pin_list { - struct list_head unflushed[JOURNAL_PIN_TYPE_NR]; - struct list_head flushed[JOURNAL_PIN_TYPE_NR]; - atomic_t count; - struct bch_devs_list devs; -}; - -struct journal; -struct journal_entry_pin; -typedef int (*journal_pin_flush_fn)(struct journal *j, - struct journal_entry_pin *, u64); - -struct journal_entry_pin { - struct list_head list; - journal_pin_flush_fn flush; - u64 seq; -}; - -struct journal_res { - bool ref; - u16 u64s; - u32 offset; - u64 seq; -}; - -union journal_res_state { - struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - - struct { - u64 cur_entry_offset:22, - idx:2, - buf0_count:10, - buf1_count:10, - buf2_count:10, - buf3_count:10; - }; -}; - -/* bytes: */ -#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ -#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */ - -/* - * We stash some journal state as sentinal values in cur_entry_offset: - * note - cur_entry_offset is in units of u64s - */ -#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1) - -#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) -#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) -#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) - -struct journal_space { - /* Units of 512 bytes sectors: */ - unsigned next_entry; /* How big the next journal entry can be */ - unsigned total; -}; - -enum journal_space_from { - journal_space_discarded, - journal_space_clean_ondisk, - journal_space_clean, - journal_space_total, - journal_space_nr, -}; - -#define JOURNAL_FLAGS() \ - x(replay_done) \ - x(running) \ - x(may_skip_flush) \ - x(need_flush_write) \ - x(space_low) - -enum journal_flags { -#define x(n) JOURNAL_##n, - JOURNAL_FLAGS() -#undef x -}; - -struct journal_bio { - struct bch_dev *ca; - unsigned buf_idx; - u64 submit_time; - - struct bio bio; -}; - -/* Embedded in struct bch_fs */ -struct journal { - /* Fastpath stuff up front: */ - struct { - - union journal_res_state reservations; - enum bch_watermark watermark; - - } __aligned(SMP_CACHE_BYTES); - - unsigned long flags; - - /* Max size of current journal entry */ - unsigned cur_entry_u64s; - unsigned cur_entry_sectors; - - /* Reserved space in journal entry to be used just prior to write */ - unsigned entry_u64s_reserved; - - - /* - * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if - * insufficient devices: - */ - int cur_entry_error; - unsigned cur_entry_offset_if_blocked; - - unsigned buf_size_want; - /* - * We may queue up some things to be journalled (log messages) before - * the journal has actually started - stash them here: - */ - darray_u64 early_journal_entries; - - /* - * Protects journal_buf->data, when accessing without a jorunal - * reservation: for synchronization between the btree write buffer code - * and the journal write path: - */ - struct mutex buf_lock; - /* - * Two journal entries -- one is currently open for new entries, the - * other is possibly being written out. 
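
Packing the reservation state into a single 64-bit union like this is what lets the journal fast path take a reservation with one atomic compare-and-swap instead of a lock. Below is a minimal userspace sketch of that pattern, assuming a simplified layout with only the 22-bit offset and 2-bit buffer index (the helper names and main() are illustrative, not the kernel code):

```c
/*
 * Sketch only: a reader snapshots offset and buffer index with one
 * atomic load, and claims space with one CAS. The four per-buffer pin
 * counts of the removed union are omitted for brevity.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define OFFSET_BITS	22
#define OFFSET_MASK	((1u << OFFSET_BITS) - 1)

/* pack offset (22 bits) and buffer idx (2 bits) into one word */
static uint64_t pack(uint32_t offset, uint32_t idx)
{
	return (uint64_t) offset | ((uint64_t) idx << OFFSET_BITS);
}

/* claim u64s worth of space in the open entry with a single CAS */
static int journal_res_get(_Atomic uint64_t *state, uint32_t u64s,
			   uint32_t max_offset)
{
	uint64_t old = atomic_load(state), next;

	do {
		uint32_t offset = old & OFFSET_MASK;
		uint32_t idx    = (old >> OFFSET_BITS) & 3;

		if (offset + u64s > max_offset)
			return -1;	/* entry full, or a sentinel state */
		next = pack(offset + u64s, idx);
	} while (!atomic_compare_exchange_weak(state, &old, next));

	return 0;
}

int main(void)
{
	_Atomic uint64_t state = 0;

	printf("%d\n", journal_res_get(&state, 16, OFFSET_MASK)); /* 0 */
	return 0;
}
```

This also suggests one reason the BLOCKED/CLOSED/ERROR sentinels are encoded as offsets near JOURNAL_ENTRY_OFFSET_MAX: an offset at or past the real entry size fails the same bounds comparison, so a closed or errored journal rejects reservations without an extra branch.
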
- */ - struct journal_buf buf[JOURNAL_BUF_NR]; - void *free_buf; - unsigned free_buf_size; - - spinlock_t lock; - - /* if nonzero, we may not open a new journal entry: */ - unsigned blocked; - - /* Used when waiting because the journal was full */ - wait_queue_head_t wait; - struct closure_waitlist async_wait; - struct closure_waitlist reclaim_flush_wait; - - struct delayed_work write_work; - struct workqueue_struct *wq; - - /* Sequence number of most recent journal entry (last entry in @pin) */ - atomic64_t seq; - - u64 seq_write_started; - /* seq, last_seq from the most recent journal entry successfully written */ - u64 seq_ondisk; - u64 flushed_seq_ondisk; - u64 flushing_seq; - u64 last_seq_ondisk; - u64 err_seq; - u64 last_empty_seq; - u64 oldest_seq_found_ondisk; - - /* - * FIFO of journal entries whose btree updates have not yet been - * written out. - * - * Each entry is a reference count. The position in the FIFO is the - * entry's sequence number relative to @seq. - * - * The journal entry itself holds a reference count, put when the - * journal entry is written out. Each btree node modified by the journal - * entry also holds a reference count, put when the btree node is - * written. - * - * When a reference count reaches zero, the journal entry is no longer - * needed. When all journal entries in the oldest journal bucket are no - * longer needed, the bucket can be discarded and reused. - */ - struct { - u64 front, back, size, mask; - struct journal_entry_pin_list *data; - } pin; - - struct journal_space space[journal_space_nr]; - - u64 replay_journal_seq; - u64 replay_journal_seq_end; - - struct write_point wp; - spinlock_t err_lock; - - struct mutex reclaim_lock; - /* - * Used for waiting until journal reclaim has freed up space in the - * journal: - */ - wait_queue_head_t reclaim_wait; - struct task_struct *reclaim_thread; - bool reclaim_kicked; - unsigned long next_reclaim; - u64 nr_direct_reclaim; - u64 nr_background_reclaim; - - unsigned long last_flushed; - struct journal_entry_pin *flush_in_progress; - bool flush_in_progress_dropped; - wait_queue_head_t pin_flush_wait; - - /* protects advancing ja->discard_idx: */ - struct mutex discard_lock; - bool can_discard; - - unsigned long last_flush_write; - - u64 write_start_time; - - u64 nr_flush_writes; - u64 nr_noflush_writes; - u64 entry_bytes_written; - - struct bch2_time_stats *flush_write_time; - struct bch2_time_stats *noflush_write_time; - struct bch2_time_stats *flush_seq_time; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map res_map; -#endif -} __aligned(SMP_CACHE_BYTES); - -/* - * Embedded in struct bch_dev. First three fields refer to the array of journal - * buckets, in bch_sb. - */ -struct journal_device { - /* - * For each journal bucket, contains the max sequence number of the - * journal writes it contains - so we know when a bucket can be reused. 
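
To make the bucket_seq[] bookkeeping concrete: a bucket is safe to discard once the newest journal entry it holds is older than last_seq_ondisk, meaning nothing in it is still pinned. A simplified sketch of that test and the discard_idx walk, under assumed types (this compresses the removed reclaim logic considerably and is not the kernel code):

```c
#include <stdbool.h>
#include <stdint.h>

struct jdev {
	uint64_t *bucket_seq;	/* max journal seq written to each bucket */
	unsigned nr;		/* number of journal buckets (a ring) */
	unsigned discard_idx;	/* next bucket to discard */
	unsigned dirty_idx;	/* oldest bucket still holding pinned entries */
};

static bool bucket_reusable(const struct jdev *ja, unsigned i,
			    uint64_t last_seq_ondisk)
{
	/* everything in the bucket predates the newest flushed entry */
	return ja->bucket_seq[i] < last_seq_ondisk;
}

/* advance discard_idx over buckets whose contents are no longer needed */
static unsigned advance_discards(struct jdev *ja, uint64_t last_seq_ondisk)
{
	unsigned freed = 0;

	while (ja->discard_idx != ja->dirty_idx &&
	       bucket_reusable(ja, ja->discard_idx, last_seq_ondisk)) {
		ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
		freed++;
	}
	return freed;
}

int main(void)
{
	uint64_t seqs[4] = { 5, 9, 12, 0 };
	struct jdev ja = { .bucket_seq = seqs, .nr = 4,
			   .discard_idx = 0, .dirty_idx = 2 };

	/* with last_seq_ondisk = 10, buckets 0 and 1 can be reused */
	return advance_discards(&ja, 10) == 2 ? 0 : 1;
}
```
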
- */ - u64 *bucket_seq; - - unsigned sectors_free; - - /* - * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: - */ - unsigned discard_idx; /* Next bucket to discard */ - unsigned dirty_idx_ondisk; - unsigned dirty_idx; - unsigned cur_idx; /* Journal bucket we're currently writing to */ - unsigned nr; - - u64 *buckets; - - /* Bio for journal reads/writes to this device */ - struct journal_bio *bio[JOURNAL_BUF_NR]; - - /* for bch_journal_read_device */ - struct closure read; - u64 highest_seq_found; -}; - -/* - * journal_entry_res - reserve space in every journal entry: - */ -struct journal_entry_res { - unsigned u64s; -}; - -#endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c deleted file mode 100644 index 1b828bddd11b..000000000000 --- a/fs/bcachefs/keylist.c +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey.h" -#include "keylist.h" - -int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, - size_t nr_inline_u64s, size_t new_u64s) -{ - size_t oldsize = bch2_keylist_u64s(l); - size_t newsize = oldsize + new_u64s; - u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p; - u64 *new_keys; - - newsize = roundup_pow_of_two(newsize); - - if (newsize <= nr_inline_u64s || - (old_buf && roundup_pow_of_two(oldsize) == newsize)) - return 0; - - new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS); - if (!new_keys) - return -ENOMEM; - - if (!old_buf) - memcpy_u64s(new_keys, inline_u64s, oldsize); - - l->keys_p = new_keys; - l->top_p = new_keys + oldsize; - - return 0; -} - -void bch2_keylist_pop_front(struct keylist *l) -{ - l->top_p -= bch2_keylist_front(l)->k.u64s; - - memmove_u64s_down(l->keys, - bkey_next(l->keys), - bch2_keylist_u64s(l)); -} - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_verify_keylist_sorted(struct keylist *l) -{ - for_each_keylist_key(l, k) - BUG_ON(bkey_next(k) != l->top && - bpos_ge(k->k.p, bkey_next(k)->k.p)); -} -#endif diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h deleted file mode 100644 index e687e0e9aede..000000000000 --- a/fs/bcachefs/keylist.h +++ /dev/null @@ -1,72 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_KEYLIST_H -#define _BCACHEFS_KEYLIST_H - -#include "keylist_types.h" - -int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); -void bch2_keylist_pop_front(struct keylist *); - -static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) -{ - l->top_p = l->keys_p = inline_keys; -} - -static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) -{ - if (l->keys_p != inline_keys) - kfree(l->keys_p); -} - -static inline void bch2_keylist_push(struct keylist *l) -{ - l->top = bkey_next(l->top); -} - -static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) -{ - bkey_copy(l->top, k); - bch2_keylist_push(l); -} - -static inline bool bch2_keylist_empty(struct keylist *l) -{ - return l->top == l->keys; -} - -static inline size_t bch2_keylist_u64s(struct keylist *l) -{ - return l->top_p - l->keys_p; -} - -static inline size_t bch2_keylist_bytes(struct keylist *l) -{ - return bch2_keylist_u64s(l) * sizeof(u64); -} - -static inline struct bkey_i *bch2_keylist_front(struct keylist *l) -{ - return l->keys; -} - -#define for_each_keylist_key(_keylist, _k) \ - for (struct bkey_i *_k = (_keylist)->keys; \ - _k != (_keylist)->top; \ - _k = bkey_next(_k)) - -static inline u64 keylist_sectors(struct keylist *keys) -{ - u64 ret = 0; - - 
for_each_keylist_key(keys, k) - ret += k->k.size; - return ret; -} - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_verify_keylist_sorted(struct keylist *); -#else -static inline void bch2_verify_keylist_sorted(struct keylist *l) {} -#endif - -#endif /* _BCACHEFS_KEYLIST_H */ diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h deleted file mode 100644 index 4b3ff7d8a875..000000000000 --- a/fs/bcachefs/keylist_types.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_KEYLIST_TYPES_H -#define _BCACHEFS_KEYLIST_TYPES_H - -struct keylist { - union { - struct bkey_i *keys; - u64 *keys_p; - }; - union { - struct bkey_i *top; - u64 *top_p; - }; -}; - -#endif /* _BCACHEFS_KEYLIST_TYPES_H */ diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c deleted file mode 100644 index 75f27ec26f85..000000000000 --- a/fs/bcachefs/logged_ops.c +++ /dev/null @@ -1,119 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "error.h" -#include "io_misc.h" -#include "logged_ops.h" -#include "super.h" - -struct bch_logged_op_fn { - u8 type; - int (*resume)(struct btree_trans *, struct bkey_i *); -}; - -static const struct bch_logged_op_fn logged_op_fns[] = { -#define x(n) { \ - .type = KEY_TYPE_logged_op_##n, \ - .resume = bch2_resume_logged_op_##n, \ -}, - BCH_LOGGED_OPS() -#undef x -}; - -static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type) -{ - for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++) - if (logged_op_fns[i].type == type) - return logged_op_fns + i; - return NULL; -} - -static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - u32 restart_count = trans->restart_count; - struct printbuf buf = PRINTBUF; - int ret = 0; - - fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags), - trans, logged_op_but_clean, - "filesystem marked as clean but have logged op\n%s", - (bch2_bkey_val_to_text(&buf, c, k), - buf.buf)); - - struct bkey_buf sk; - bch2_bkey_buf_init(&sk); - bch2_bkey_buf_reassemble(&sk, c, k); - - const struct bch_logged_op_fn *fn = logged_op_fn(sk.k->k.type); - if (fn) - fn->resume(trans, sk.k); - - ret = bch2_logged_op_finish(trans, sk.k); - - bch2_bkey_buf_exit(&sk, c); -fsck_err: - printbuf_exit(&buf); - return ret ?: trans_was_restarted(trans, restart_count); -} - -int bch2_resume_logged_ops(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, - BTREE_ID_logged_ops, - POS(LOGGED_OPS_INUM_logged_ops, 0), - POS(LOGGED_OPS_INUM_logged_ops, U64_MAX), - BTREE_ITER_prefetch, k, - resume_logged_op(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) -{ - struct btree_iter iter; - int ret = bch2_bkey_get_empty_slot(trans, &iter, - BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX)); - if (ret) - return ret; - - k->k.p = iter.pos; - - ret = bch2_trans_update(trans, &iter, k, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) -{ - return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_logged_op_start(trans, k)); -} - -int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) -{ - int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0)); - /* - * This 
needs to be a fatal error because we've left an unfinished - * operation in the logged ops btree. - * - * We should only ever see an error here if the filesystem has already - * been shut down, but make sure of that here: - */ - if (ret) { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - bch2_fs_fatal_error(c, "deleting logged operation %s: %s", - buf.buf, bch2_err_str(ret)); - printbuf_exit(&buf); - } - - return ret; -} diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h deleted file mode 100644 index 30ae9ef737dd..000000000000 --- a/fs/bcachefs/logged_ops.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_LOGGED_OPS_H -#define _BCACHEFS_LOGGED_OPS_H - -#include "bkey.h" - -#define BCH_LOGGED_OPS() \ - x(truncate) \ - x(finsert) - -static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op) -{ - return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0); -} - -int bch2_resume_logged_ops(struct bch_fs *); -int bch2_logged_op_start(struct btree_trans *, struct bkey_i *); -int bch2_logged_op_finish(struct btree_trans *, struct bkey_i *); - -#endif /* _BCACHEFS_LOGGED_OPS_H */ diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h deleted file mode 100644 index cfb67c95d4c8..000000000000 --- a/fs/bcachefs/logged_ops_format.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H -#define _BCACHEFS_LOGGED_OPS_FORMAT_H - -enum logged_ops_inums { - LOGGED_OPS_INUM_logged_ops, - LOGGED_OPS_INUM_inode_cursors, -}; - -struct bch_logged_op_truncate { - struct bch_val v; - __le32 subvol; - __le32 pad; - __le64 inum; - __le64 new_i_size; -}; - -enum logged_op_finsert_state { - LOGGED_OP_FINSERT_start, - LOGGED_OP_FINSERT_shift_extents, - LOGGED_OP_FINSERT_finish, -}; - -struct bch_logged_op_finsert { - struct bch_val v; - __u8 state; - __u8 pad[3]; - __le32 subvol; - __le64 inum; - __le64 dst_offset; - __le64 src_offset; - __le64 pos; -}; - -#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c deleted file mode 100644 index 57b5b3263b08..000000000000 --- a/fs/bcachefs/lru.c +++ /dev/null @@ -1,223 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "bkey_buf.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "ec.h" -#include "error.h" -#include "lru.h" -#include "recovery.h" - -/* KEY_TYPE_lru is obsolete: */ -int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(!lru_pos_time(k.k->p), - c, lru_entry_at_time_0, - "lru entry at time=0"); -fsck_err: - return ret; -} - -void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - const struct bch_lru *lru = bkey_s_c_to_lru(k).v; - - prt_printf(out, "idx %llu", le64_to_cpu(lru->idx)); -} - -void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru) -{ - prt_printf(out, "%llu:%llu -> %llu:%llu", - lru_pos_id(lru), - lru_pos_time(lru), - u64_to_bucket(lru.offset).inode, - u64_to_bucket(lru.offset).offset); -} - -static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, - u64 dev_bucket, u64 time, bool set) -{ - return time - ? 
bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, - lru_pos(lru_id, dev_bucket, time), set) - : 0; -} - -int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) -{ - return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted); -} - -int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) -{ - return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); -} - -int __bch2_lru_change(struct btree_trans *trans, - u16 lru_id, u64 dev_bucket, - u64 old_time, u64 new_time) -{ - if (old_time == new_time) - return 0; - - return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?: - bch2_lru_set(trans, lru_id, dev_bucket, new_time); -} - -static const char * const bch2_lru_types[] = { -#define x(n) #n, - BCH_LRU_TYPES() -#undef x - NULL -}; - -int bch2_lru_check_set(struct btree_trans *trans, - u16 lru_id, - u64 dev_bucket, - u64 time, - struct bkey_s_c referring_k, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter lru_iter; - struct bkey_s_c lru_k = - bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(lru_id, dev_bucket, time), 0); - int ret = bkey_err(lru_k); - if (ret) - return ret; - - if (lru_k.k->type != KEY_TYPE_set) { - ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed); - if (ret) - goto err; - - if (fsck_err(trans, alloc_key_to_missing_lru_entry, - "missing %s lru entry\n%s", - bch2_lru_types[lru_type(lru_k)], - (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { - ret = bch2_lru_set(trans, lru_id, dev_bucket, time); - if (ret) - goto err; - } - } -err: -fsck_err: - bch2_trans_iter_exit(trans, &lru_iter); - printbuf_exit(&buf); - return ret; -} - -static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k) -{ - enum bch_lru_type type = lru_type(lru_k); - - switch (type) { - case BCH_LRU_read: - case BCH_LRU_fragmentation: - return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset)); - case BCH_LRU_stripes: - return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset)); - default: - BUG(); - } -} - -static u64 bkey_lru_type_idx(struct bch_fs *c, - enum bch_lru_type type, - struct bkey_s_c k) -{ - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - - switch (type) { - case BCH_LRU_read: - a = bch2_alloc_to_v4(k, &a_convert); - return alloc_lru_idx_read(*a); - case BCH_LRU_fragmentation: { - a = bch2_alloc_to_v4(k, &a_convert); - - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); - return ca - ? alloc_lru_idx_fragmentation(*a, ca) - : 0; - } - case BCH_LRU_stripes: - return k.k->type == KEY_TYPE_stripe - ? 
stripe_lru_pos(bkey_s_c_to_stripe(k).v) - : 0; - default: - BUG(); - } -} - -static int bch2_check_lru_key(struct btree_trans *trans, - struct btree_iter *lru_iter, - struct bkey_s_c lru_k, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - struct bbpos bp = lru_pos_to_bp(lru_k); - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0); - int ret = bkey_err(k); - if (ret) - goto err; - - enum bch_lru_type type = lru_type(lru_k); - u64 idx = bkey_lru_type_idx(c, type, k); - - if (lru_pos_time(lru_k.k->p) != idx) { - ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed); - if (ret) - goto err; - - if (fsck_err(trans, lru_entry_bad, - "incorrect lru entry: lru %s time %llu\n" - "%s\n" - "for %s", - bch2_lru_types[type], - lru_pos_time(lru_k.k->p), - (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), - (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); - } -err: -fsck_err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - -int bch2_check_lrus(struct bch_fs *c) -{ - struct bkey_buf last_flushed; - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_lru_key(trans, &iter, k, &last_flushed))); - - bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); - return ret; - -} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h deleted file mode 100644 index 8abd0aa2083a..000000000000 --- a/fs/bcachefs/lru.h +++ /dev/null @@ -1,70 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_LRU_H -#define _BCACHEFS_LRU_H - -static inline u64 lru_pos_id(struct bpos pos) -{ - return pos.inode >> LRU_TIME_BITS; -} - -static inline u64 lru_pos_time(struct bpos pos) -{ - return pos.inode & ~(~0ULL << LRU_TIME_BITS); -} - -static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) -{ - struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket); - - EBUG_ON(time > LRU_TIME_MAX); - EBUG_ON(lru_pos_id(pos) != lru_id); - EBUG_ON(lru_pos_time(pos) != time); - EBUG_ON(pos.offset != dev_bucket); - - return pos; -} - -static inline enum bch_lru_type lru_type(struct bkey_s_c l) -{ - u16 lru_id = l.k->p.inode >> 48; - - switch (lru_id) { - case BCH_LRU_BUCKET_FRAGMENTATION: - return BCH_LRU_fragmentation; - case BCH_LRU_STRIPE_FRAGMENTATION: - return BCH_LRU_stripes; - default: - return BCH_LRU_read; - } -} - -int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); -void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -void bch2_lru_pos_to_text(struct printbuf *, struct bpos); - -#define bch2_bkey_ops_lru ((struct bkey_ops) { \ - .key_validate = bch2_lru_validate, \ - .val_to_text = bch2_lru_to_text, \ - .min_val_size = 8, \ -}) - -int bch2_lru_del(struct btree_trans *, u16, u64, u64); -int bch2_lru_set(struct btree_trans *, u16, u64, u64); -int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); - -static inline int bch2_lru_change(struct btree_trans *trans, - u16 lru_id, u64 dev_bucket, - u64 old_time, u64 new_time) -{ - return old_time != new_time - ? 
__bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time) - : 0; -} - -struct bkey_buf; -int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *); - -int bch2_check_lrus(struct bch_fs *); - -#endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h deleted file mode 100644 index b7392ad8e41f..000000000000 --- a/fs/bcachefs/lru_format.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_LRU_FORMAT_H -#define _BCACHEFS_LRU_FORMAT_H - -struct bch_lru { - struct bch_val v; - __le64 idx; -} __packed __aligned(8); - -#define BCH_LRU_TYPES() \ - x(read) \ - x(fragmentation) \ - x(stripes) - -enum bch_lru_type { -#define x(n) BCH_LRU_##n, - BCH_LRU_TYPES() -#undef x -}; - -#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1) -#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2) - -#define LRU_TIME_BITS 48 -#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) - -#endif /* _BCACHEFS_LRU_FORMAT_H */ diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c deleted file mode 100644 index 0ea9f30803a2..000000000000 --- a/fs/bcachefs/mean_and_variance.c +++ /dev/null @@ -1,173 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Functions for incremental mean and variance. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published by - * the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * Copyright © 2022 Daniel B. Hill - * - * Author: Daniel B. Hill <daniel@gluo.nz> - * - * Description: - * - * This is includes some incremental algorithms for mean and variance calculation - * - * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf - * - * Create a struct and if it's the weighted variant set the w field (weight = 2^k). - * - * Use mean_and_variance[_weighted]_update() on the struct to update it's state. - * - * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance, some computation - * is deferred to these functions for performance reasons. - * - * see lib/math/mean_and_variance_test.c for examples of usage. - * - * DO NOT access the mean and variance fields of the weighted variants directly. - * DO NOT change the weight after calling update. - */ - -#include <linux/bug.h> -#include <linux/compiler.h> -#include <linux/export.h> -#include <linux/limits.h> -#include <linux/math.h> -#include <linux/math64.h> -#include <linux/module.h> - -#include "mean_and_variance.h" - -u128_u u128_div(u128_u n, u64 d) -{ - u128_u r; - u64 rem; - u64 hi = u128_hi(n); - u64 lo = u128_lo(n); - u64 h = hi & ((u64) U32_MAX << 32); - u64 l = (hi & (u64) U32_MAX) << 32; - - r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64); - r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32)); - r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem))); - return r; -} -EXPORT_SYMBOL_GPL(u128_div); - -/** - * mean_and_variance_get_mean() - get mean from @s - * @s: mean and variance number of samples and their sums - */ -s64 mean_and_variance_get_mean(struct mean_and_variance s) -{ - return s.n ? 
div64_u64(s.sum, s.n) : 0; -} -EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); - -/** - * mean_and_variance_get_variance() - get variance from @s1 - * @s1: mean and variance number of samples and sums - * - * see linked pdf equation 12. - */ -u64 mean_and_variance_get_variance(struct mean_and_variance s1) -{ - if (s1.n) { - u128_u s2 = u128_div(s1.sum_squares, s1.n); - u64 s3 = abs(mean_and_variance_get_mean(s1)); - - return u128_lo(u128_sub(s2, u128_square(s3))); - } else { - return 0; - } -} -EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); - -/** - * mean_and_variance_get_stddev() - get standard deviation from @s - * @s: mean and variance number of samples and their sums - */ -u32 mean_and_variance_get_stddev(struct mean_and_variance s) -{ - return int_sqrt64(mean_and_variance_get_variance(s)); -} -EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); - -/** - * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() - * @s: mean and variance number of samples and their sums - * @x: new value to include in the &mean_and_variance_weighted - * @initted: caller must track whether this is the first use or not - * @weight: ewma weight - * - * see linked pdf: function derived from equations 140-143 where alpha = 2^w. - * values are stored bitshifted for performance and added precision. - */ -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, - s64 x, bool initted, u8 weight) -{ - // previous weighted variance. - u8 w = weight; - u64 var_w0 = s->variance; - // new value weighted. - s64 x_w = x << w; - s64 diff_w = x_w - s->mean; - s64 diff = fast_divpow2(diff_w, w); - // new mean weighted. - s64 u_w1 = s->mean + diff; - - if (!initted) { - s->mean = x_w; - s->variance = 0; - } else { - s->mean = u_w1; - s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; - } -} -EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); - -/** - * mean_and_variance_weighted_get_mean() - get mean from @s - * @s: mean and variance number of samples and their sums - * @weight: ewma weight - */ -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, - u8 weight) -{ - return fast_divpow2(s.mean, weight); -} -EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); - -/** - * mean_and_variance_weighted_get_variance() -- get variance from @s - * @s: mean and variance number of samples and their sums - * @weight: ewma weight - */ -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, - u8 weight) -{ - // always positive don't need fast divpow2 - return s.variance >> weight; -} -EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); - -/** - * mean_and_variance_weighted_get_stddev() - get standard deviation from @s - * @s: mean and variance number of samples and their sums - * @weight: ewma weight - */ -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, - u8 weight) -{ - return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight)); -} -EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); - -MODULE_AUTHOR("Daniel B. 
Hill"); -MODULE_LICENSE("GPL"); diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h deleted file mode 100644 index 47e4a3c3d26e..000000000000 --- a/fs/bcachefs/mean_and_variance.h +++ /dev/null @@ -1,203 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef MEAN_AND_VARIANCE_H_ -#define MEAN_AND_VARIANCE_H_ - -#include <linux/types.h> -#include <linux/limits.h> -#include <linux/math.h> -#include <linux/math64.h> - -#define SQRT_U64_MAX 4294967295ULL - -/* - * u128_u: u128 user mode, because not all architectures support a real int128 - * type - * - * We don't use this version in userspace, because in userspace we link with - * Rust and rustc has issues with u128. - */ - -#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC) - -typedef struct { - unsigned __int128 v; -} __aligned(16) u128_u; - -static inline u128_u u64_to_u128(u64 a) -{ - return (u128_u) { .v = a }; -} - -static inline u64 u128_lo(u128_u a) -{ - return a.v; -} - -static inline u64 u128_hi(u128_u a) -{ - return a.v >> 64; -} - -static inline u128_u u128_add(u128_u a, u128_u b) -{ - a.v += b.v; - return a; -} - -static inline u128_u u128_sub(u128_u a, u128_u b) -{ - a.v -= b.v; - return a; -} - -static inline u128_u u128_shl(u128_u a, s8 shift) -{ - a.v <<= shift; - return a; -} - -static inline u128_u u128_square(u64 a) -{ - u128_u b = u64_to_u128(a); - - b.v *= b.v; - return b; -} - -#else - -typedef struct { - u64 hi, lo; -} __aligned(16) u128_u; - -/* conversions */ - -static inline u128_u u64_to_u128(u64 a) -{ - return (u128_u) { .lo = a }; -} - -static inline u64 u128_lo(u128_u a) -{ - return a.lo; -} - -static inline u64 u128_hi(u128_u a) -{ - return a.hi; -} - -/* arithmetic */ - -static inline u128_u u128_add(u128_u a, u128_u b) -{ - u128_u c; - - c.lo = a.lo + b.lo; - c.hi = a.hi + b.hi + (c.lo < a.lo); - return c; -} - -static inline u128_u u128_sub(u128_u a, u128_u b) -{ - u128_u c; - - c.lo = a.lo - b.lo; - c.hi = a.hi - b.hi - (c.lo > a.lo); - return c; -} - -static inline u128_u u128_shl(u128_u i, s8 shift) -{ - u128_u r; - - r.lo = i.lo << (shift & 63); - if (shift < 64) - r.hi = (i.hi << (shift & 63)) | (i.lo >> (-shift & 63)); - else { - r.hi = i.lo << (-shift & 63); - r.lo = 0; - } - return r; -} - -static inline u128_u u128_square(u64 i) -{ - u128_u r; - u64 h = i >> 32, l = i & U32_MAX; - - r = u128_shl(u64_to_u128(h*h), 64); - r = u128_add(r, u128_shl(u64_to_u128(h*l), 32)); - r = u128_add(r, u128_shl(u64_to_u128(l*h), 32)); - r = u128_add(r, u64_to_u128(l*l)); - return r; -} - -#endif - -static inline u128_u u64s_to_u128(u64 hi, u64 lo) -{ - u128_u c = u64_to_u128(hi); - - c = u128_shl(c, 64); - c = u128_add(c, u64_to_u128(lo)); - return c; -} - -u128_u u128_div(u128_u n, u64 d); - -struct mean_and_variance { - s64 n; - s64 sum; - u128_u sum_squares; -}; - -/* expontentially weighted variant */ -struct mean_and_variance_weighted { - s64 mean; - u64 variance; -}; - -/** - * fast_divpow2() - fast approximation for n / (1 << d) - * @n: numerator - * @d: the power of 2 denominator. - * - * note: this rounds towards 0. - */ -static inline s64 fast_divpow2(s64 n, u8 d) -{ - return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; -} - -/** - * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1 - * and return it. - * @s1: the mean_and_variance to update. - * @v1: the new sample. - * - * see linked pdf equation 12. 
- */ -static inline void -mean_and_variance_update(struct mean_and_variance *s, s64 v) -{ - s->n++; - s->sum += v; - s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v))); -} - -s64 mean_and_variance_get_mean(struct mean_and_variance s); -u64 mean_and_variance_get_variance(struct mean_and_variance s1); -u32 mean_and_variance_get_stddev(struct mean_and_variance s); - -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, - s64 v, bool initted, u8 weight); - -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, - u8 weight); -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, - u8 weight); -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, - u8 weight); - -#endif // MEAN_AND_VAIRANCE_H_ diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c deleted file mode 100644 index e9d9c0212e44..000000000000 --- a/fs/bcachefs/mean_and_variance_test.c +++ /dev/null @@ -1,221 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <kunit/test.h> - -#include "mean_and_variance.h" - -#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX) - -static void mean_and_variance_basic_test(struct kunit *test) -{ - struct mean_and_variance s = {}; - - mean_and_variance_update(&s, 2); - mean_and_variance_update(&s, 2); - - KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0); - KUNIT_EXPECT_EQ(test, s.n, 2); - - mean_and_variance_update(&s, 4); - mean_and_variance_update(&s, 4); - - KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3); - KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1); - KUNIT_EXPECT_EQ(test, s.n, 4); -} - -/* - * Test values computed using a spreadsheet from the psuedocode at the bottom: - * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf - */ - -static void mean_and_variance_weighted_test(struct kunit *test) -{ - struct mean_and_variance_weighted s = { }; - - mean_and_variance_weighted_update(&s, 10, false, 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - - mean_and_variance_weighted_update(&s, 20, true, 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - - mean_and_variance_weighted_update(&s, 30, true, 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); - - s = (struct mean_and_variance_weighted) { }; - - mean_and_variance_weighted_update(&s, -10, false, 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - - mean_and_variance_weighted_update(&s, -20, true, 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - - mean_and_variance_weighted_update(&s, -30, true, 2); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); -} - -static void mean_and_variance_weighted_advanced_test(struct kunit *test) -{ - struct mean_and_variance_weighted s = { }; - bool initted = false; - s64 i; - - for (i = 10; i <= 100; i += 10) { - mean_and_variance_weighted_update(&s, i, initted, 8); - initted = 
true; - } - - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); - - s = (struct mean_and_variance_weighted) { }; - initted = false; - - for (i = -10; i >= -100; i -= 10) { - mean_and_variance_weighted_update(&s, i, initted, 8); - initted = true; - } - - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); -} - -static void do_mean_and_variance_test(struct kunit *test, - s64 initial_value, - s64 initial_n, - s64 n, - unsigned weight, - s64 *data, - s64 *mean, - s64 *stddev, - s64 *weighted_mean, - s64 *weighted_stddev) -{ - struct mean_and_variance mv = {}; - struct mean_and_variance_weighted vw = { }; - - for (unsigned i = 0; i < initial_n; i++) { - mean_and_variance_update(&mv, initial_value); - mean_and_variance_weighted_update(&vw, initial_value, false, weight); - - KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value); - KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0); - } - - for (unsigned i = 0; i < n; i++) { - mean_and_variance_update(&mv, data[i]); - mean_and_variance_weighted_update(&vw, data[i], true, weight); - - KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]); - } - - KUNIT_EXPECT_EQ(test, mv.n, initial_n + n); -} - -/* Test behaviour with a single outlier, then back to steady state: */ -static void mean_and_variance_test_1(struct kunit *test) -{ - s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; - s64 mean[] = { 22, 21, 20, 19, 18, 17, 16 }; - s64 stddev[] = { 32, 29, 28, 27, 26, 25, 24 }; - s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; - s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - -/* Test behaviour where we switch from one steady state to another: */ -static void mean_and_variance_test_2(struct kunit *test) -{ - s64 d[] = { 100, 100, 100, 100, 100 }; - s64 mean[] = { 22, 32, 40, 46, 50 }; - s64 stddev[] = { 32, 39, 42, 44, 45 }; - s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; - s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - -static void mean_and_variance_fast_divpow2(struct kunit *test) -{ - s64 i; - u8 d; - - for (i = 0; i < 100; i++) { - d = 0; - KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d)); - KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d)); - for (d = 1; d < 32; d++) { - KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)), - div_u64(i, 1 << d), "%lld %u", i, d); - KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)), - div_u64(i, 1 << d), "%lld %u", -i, d); - } - } -} - -static void mean_and_variance_u128_basic_test(struct kunit *test) -{ - u128_u a = u64s_to_u128(0, U64_MAX); - u128_u a1 = u64s_to_u128(0, 1); - u128_u b = u64s_to_u128(1, 0); - u128_u c = u64s_to_u128(0, 1LLU << 63); - u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX); - - 
KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a, a1)), 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0); - KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0); - - KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX); - KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0); - - KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0); - - KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1); - - KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63); - - KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX); - - KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31); -} - -static struct kunit_case mean_and_variance_test_cases[] = { - KUNIT_CASE(mean_and_variance_fast_divpow2), - KUNIT_CASE(mean_and_variance_u128_basic_test), - KUNIT_CASE(mean_and_variance_basic_test), - KUNIT_CASE(mean_and_variance_weighted_test), - KUNIT_CASE(mean_and_variance_weighted_advanced_test), - KUNIT_CASE(mean_and_variance_test_1), - KUNIT_CASE(mean_and_variance_test_2), - {} -}; - -static struct kunit_suite mean_and_variance_test_suite = { - .name = "mean and variance tests", - .test_cases = mean_and_variance_test_cases -}; - -kunit_test_suite(mean_and_variance_test_suite); - -MODULE_AUTHOR("Daniel B. Hill"); -MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests"); -MODULE_LICENSE("GPL"); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c deleted file mode 100644 index f296cce95338..000000000000 --- a/fs/bcachefs/migrate.c +++ /dev/null @@ -1,277 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Code for moving data off a device. - */ - -#include "bcachefs.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "ec.h" -#include "errcode.h" -#include "extents.h" -#include "io_write.h" -#include "journal.h" -#include "keylist.h" -#include "migrate.h" -#include "move.h" -#include "progress.h" -#include "replicas.h" -#include "super-io.h" - -static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, - unsigned dev_idx, unsigned flags, bool metadata) -{ - unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; - unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; - unsigned degraded = metadata ? 
BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
-	unsigned nr_good;
-
-	bch2_bkey_drop_device(k, dev_idx);
-
-	nr_good = bch2_bkey_durability(c, k.s_c);
-	if ((!nr_good && !(flags & lost)) ||
-	    (nr_good < replicas && !(flags & degraded)))
-		return bch_err_throw(c, remove_would_lose_data);
-
-	return 0;
-}
-
-static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter,
-			   struct btree *b, unsigned dev_idx, unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_buf k;
-
-	bch2_bkey_buf_init(&k);
-	bch2_bkey_buf_copy(&k, c, &b->key);
-
-	int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?:
-		bch2_btree_node_update_key(trans, iter, b, k.k, 0, false);
-
-	bch_err_fn(c, ret);
-	bch2_bkey_buf_exit(&k, c);
-	return ret;
-}
-
-static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
-				     struct btree_iter *iter,
-				     struct bkey_s_c k,
-				     unsigned dev_idx,
-				     unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_i *n;
-	int ret;
-
-	if (!bch2_bkey_has_device_c(k, dev_idx))
-		return 0;
-
-	n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node);
-	ret = PTR_ERR_OR_ZERO(n);
-	if (ret)
-		return ret;
-
-	ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false);
-	if (ret)
-		return ret;
-
-	/*
-	 * If the new extent no longer has any pointers, bch2_extent_normalize()
-	 * will do the appropriate thing with it (turning it into a
-	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
-	 */
-	bch2_extent_normalize(c, bkey_i_to_s(n));
-
-	/*
-	 * Since we're not inserting through an extent iterator
-	 * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
-	 * we aren't using the extent overwrite path to delete, we're
-	 * just using the normal key deletion path:
-	 */
-	if (bkey_deleted(&n->k))
-		n->k.size = 0;
-	return 0;
-}
-
-static int bch2_dev_btree_drop_key(struct btree_trans *trans,
-				   struct bkey_s_c_backpointer bp,
-				   unsigned dev_idx,
-				   struct bkey_buf *last_flushed,
-				   unsigned flags)
-{
-	struct btree_iter iter;
-	struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed);
-	int ret = PTR_ERR_OR_ZERO(b);
-	if (ret)
-		return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ?
-			0 : ret;
-
-	ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
-
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static int bch2_dev_usrdata_drop(struct bch_fs *c,
-				 struct progress_indicator_state *progress,
-				 unsigned dev_idx, unsigned flags)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-	enum btree_id id;
-	int ret = 0;
-
-	for (id = 0; id < BTREE_ID_NR; id++) {
-		if (!btree_type_has_ptrs(id))
-			continue;
-
-		ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
-				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-				NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-			bch2_progress_update_iter(trans, progress, &iter, "dropping user data");
-			bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
-		}));
-		if (ret)
-			break;
-	}
-
-	bch2_trans_put(trans);
-
-	return ret;
-}
-
-static int bch2_dev_metadata_drop(struct bch_fs *c,
-				  struct progress_indicator_state *progress,
-				  unsigned dev_idx, unsigned flags)
-{
-	struct btree_trans *trans;
-	struct btree_iter iter;
-	struct closure cl;
-	struct btree *b;
-	struct bkey_buf k;
-	unsigned id;
-	int ret;
-
-	/* don't handle this yet: */
-	if (flags & BCH_FORCE_IF_METADATA_LOST)
-		return bch_err_throw(c, remove_with_metadata_missing_unimplemented);
-
-	trans = bch2_trans_get(c);
-	bch2_bkey_buf_init(&k);
-	closure_init_stack(&cl);
-
-	for (id = 0; id < BTREE_ID_NR; id++) {
-		bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
-					  BTREE_ITER_prefetch);
-retry:
-		ret = 0;
-		while (bch2_trans_begin(trans),
-		       (b = bch2_btree_iter_peek_node(trans, &iter)) &&
-		       !(ret = PTR_ERR_OR_ZERO(b))) {
-			bch2_progress_update_iter(trans, progress, &iter, "dropping metadata");
-
-			if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
-				goto next;
-
-			ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
-			if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-				ret = 0;
-				continue;
-			}
-
-			if (ret)
-				break;
-next:
-			bch2_btree_iter_next_node(trans, &iter);
-		}
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			goto retry;
-
-		bch2_trans_iter_exit(trans, &iter);
-
-		if (ret)
-			goto err;
-	}
-
-	bch2_btree_interior_updates_flush(c);
-	ret = 0;
-err:
-	bch2_bkey_buf_exit(&k, c);
-	bch2_trans_put(trans);
-
-	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
-	return ret;
-}
-
-static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx,
-			struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed,
-			unsigned flags)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent,
-						     last_flushed);
-	int ret = bkey_err(k);
-	if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
-		return 0;
-	if (ret)
-		return ret;
-
-	if (!k.k || !bch2_bkey_has_device_c(k, dev_idx))
-		goto out;
-
-	/*
-	 * XXX: pass flags arg to invalidate_stripe_to_dev and handle it
-	 * properly
-	 */
-
-	if (bkey_is_btree_ptr(k.k))
-		ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags);
-	else if (k.k->type == KEY_TYPE_stripe)
-		ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags);
-	else
-		ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
-out:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags)
-{
-	struct btree_trans *trans = bch2_trans_get(c);
-
-	struct bkey_buf last_flushed;
-	bch2_bkey_buf_init(&last_flushed);
-	bkey_init(&last_flushed.k->k);
-
-	int ret = bch2_btree_write_buffer_flush_sync(trans) ?:
-		for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
-				POS(dev_idx, 0),
-				POS(dev_idx, U64_MAX), 0, k,
-				NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-			if (k.k->type != KEY_TYPE_backpointer)
-				continue;
-
-			data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k),
-				     &last_flushed, flags);
-
-		}));
-
-	bch2_bkey_buf_exit(&last_flushed, trans->c);
-	bch2_trans_put(trans);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags)
-{
-	struct progress_indicator_state progress;
-	bch2_progress_init(&progress, c,
-			   BIT_ULL(BTREE_ID_extents)|
-			   BIT_ULL(BTREE_ID_reflink));
-
-	return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?:
-		bch2_dev_metadata_drop(c, &progress, dev_idx, flags);
-}
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
deleted file mode 100644
index 30018140711b..000000000000
--- a/fs/bcachefs/migrate.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MIGRATE_H
-#define _BCACHEFS_MIGRATE_H
-
-int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned);
-int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned);
-
-#endif /* _BCACHEFS_MIGRATE_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
deleted file mode 100644
index eec591e947bd..000000000000
--- a/fs/bcachefs/move.c
+++ /dev/null
@@ -1,1494 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "compress.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "errcode.h"
-#include "error.h"
-#include "inode.h"
-#include "io_read.h"
-#include "io_write.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "move.h"
-#include "rebalance.h"
-#include "reflink.h"
-#include "replicas.h"
-#include "snapshot.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/ioprio.h>
-#include <linux/kthread.h>
-
-const char * const bch2_data_ops_strs[] = {
-#define x(t, n, ...)
[n] = #t, - BCH_DATA_OPS() -#undef x - NULL -}; - -struct evacuate_bucket_arg { - struct bpos bucket; - int gen; - struct data_update_opts data_opts; -}; - -static bool evacuate_bucket_pred(struct bch_fs *, void *, - enum btree_id, struct bkey_s_c, - struct bch_io_opts *, - struct data_update_opts *); - -static noinline void -trace_io_move2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_io_move(c, buf.buf); - printbuf_exit(&buf); -} - -static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) -{ - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - trace_io_move_read(c, buf.buf); - printbuf_exit(&buf); -} - -static noinline void -trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts, - move_pred_fn pred, void *_arg, bool p) -{ - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "%ps: %u", pred, p); - - if (pred == evacuate_bucket_pred) { - struct evacuate_bucket_arg *arg = _arg; - prt_printf(&buf, " gen=%u", arg->gen); - } - - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_io_move_pred(c, buf.buf); - printbuf_exit(&buf); -} - -static noinline void -trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen) -{ - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "bucket: "); - bch2_bpos_to_text(&buf, bucket); - prt_printf(&buf, " gen: %i\n", gen); - - trace_io_move_evacuate_bucket(c, buf.buf); - printbuf_exit(&buf); -} - -struct moving_io { - struct list_head read_list; - struct list_head io_list; - struct move_bucket *b; - struct closure cl; - bool read_completed; - - unsigned read_sectors; - unsigned write_sectors; - - struct data_update write; -}; - -static void move_free(struct moving_io *io) -{ - struct moving_context *ctxt = io->write.ctxt; - - if (io->b) - atomic_dec(&io->b->count); - - mutex_lock(&ctxt->lock); - list_del(&io->io_list); - wake_up(&ctxt->wait); - mutex_unlock(&ctxt->lock); - - if (!io->write.data_opts.scrub) { - bch2_data_update_exit(&io->write); - } else { - bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio); - kfree(io->write.bvecs); - } - kfree(io); -} - -static void move_write_done(struct bch_write_op *op) -{ - struct moving_io *io = container_of(op, struct moving_io, write.op); - struct bch_fs *c = op->c; - struct moving_context *ctxt = io->write.ctxt; - - if (op->error) { - if (trace_io_move_write_fail_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_write_op_to_text(&buf, op); - trace_io_move_write_fail(c, buf.buf); - printbuf_exit(&buf); - } - this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]); - - ctxt->write_error = true; - } - - atomic_sub(io->write_sectors, &ctxt->write_sectors); - atomic_dec(&ctxt->write_ios); - move_free(io); - closure_put(&ctxt->cl); -} - -static void move_write(struct moving_io *io) -{ - struct bch_fs *c = io->write.op.c; - struct moving_context *ctxt = io->write.ctxt; - struct bch_read_bio *rbio = &io->write.rbio; - - if (ctxt->stats) { - if (rbio->bio.bi_status) - atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, - &ctxt->stats->sectors_error_uncorrected); - else if (rbio->saw_error) - atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, - 
&ctxt->stats->sectors_error_corrected); - } - - /* - * If the extent has been bitrotted, we're going to have to give it a - * new checksum in order to move it - but the poison bit will ensure - * that userspace still gets the appropriate error. - */ - if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err && - (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) { - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - - rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, - nonce, &rbio->bio); - rbio->ret = 0; - } - - if (unlikely(rbio->ret || io->write.data_opts.scrub)) { - move_free(io); - return; - } - - if (trace_io_move_write_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); - trace_io_move_write(c, buf.buf); - printbuf_exit(&buf); - } - - closure_get(&io->write.ctxt->cl); - atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); - atomic_inc(&io->write.ctxt->write_ios); - - bch2_data_update_read_done(&io->write); -} - -struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) -{ - struct moving_io *io = - list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list); - - return io && io->read_completed ? io : NULL; -} - -static void move_read_endio(struct bio *bio) -{ - struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio); - struct moving_context *ctxt = io->write.ctxt; - - atomic_sub(io->read_sectors, &ctxt->read_sectors); - atomic_dec(&ctxt->read_ios); - io->read_completed = true; - - wake_up(&ctxt->wait); - closure_put(&ctxt->cl); -} - -void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt) -{ - struct moving_io *io; - - while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { - bch2_trans_unlock_long(ctxt->trans); - list_del(&io->read_list); - move_write(io); - } -} - -void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) -{ - unsigned sectors_pending = atomic_read(&ctxt->write_sectors); - - move_ctxt_wait_event(ctxt, - !atomic_read(&ctxt->write_sectors) || - atomic_read(&ctxt->write_sectors) != sectors_pending); -} - -void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) -{ - move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); - bch2_trans_unlock_long(ctxt->trans); - closure_sync(&ctxt->cl); -} - -void bch2_moving_ctxt_exit(struct moving_context *ctxt) -{ - struct bch_fs *c = ctxt->trans->c; - - bch2_moving_ctxt_flush_all(ctxt); - - EBUG_ON(atomic_read(&ctxt->write_sectors)); - EBUG_ON(atomic_read(&ctxt->write_ios)); - EBUG_ON(atomic_read(&ctxt->read_sectors)); - EBUG_ON(atomic_read(&ctxt->read_ios)); - - mutex_lock(&c->moving_context_lock); - list_del(&ctxt->list); - mutex_unlock(&c->moving_context_lock); - - /* - * Generally, releasing a transaction within a transaction restart means - * an unhandled transaction restart: but this can happen legitimately - * within the move code, e.g. 
when bch2_move_ratelimit() tells us to - * exit before we've retried - */ - bch2_trans_begin(ctxt->trans); - bch2_trans_put(ctxt->trans); - memset(ctxt, 0, sizeof(*ctxt)); -} - -void bch2_moving_ctxt_init(struct moving_context *ctxt, - struct bch_fs *c, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc) -{ - memset(ctxt, 0, sizeof(*ctxt)); - - ctxt->trans = bch2_trans_get(c); - ctxt->fn = (void *) _RET_IP_; - ctxt->rate = rate; - ctxt->stats = stats; - ctxt->wp = wp; - ctxt->wait_on_copygc = wait_on_copygc; - - closure_init_stack(&ctxt->cl); - - mutex_init(&ctxt->lock); - INIT_LIST_HEAD(&ctxt->reads); - INIT_LIST_HEAD(&ctxt->ios); - init_waitqueue_head(&ctxt->wait); - - mutex_lock(&c->moving_context_lock); - list_add(&ctxt->list, &c->moving_context_list); - mutex_unlock(&c->moving_context_lock); -} - -void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) -{ - trace_move_data(c, stats); -} - -void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) -{ - memset(stats, 0, sizeof(*stats)); - stats->data_type = BCH_DATA_user; - scnprintf(stats->name, sizeof(stats->name), "%s", name); -} - -int bch2_move_extent(struct moving_context *ctxt, - struct move_bucket *bucket_in_flight, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_io_opts io_opts, - struct data_update_opts data_opts) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - int ret = -ENOMEM; - - if (trace_io_move_enabled()) - trace_io_move2(c, k, &io_opts, &data_opts); - this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); - - if (ctxt->stats) - ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); - - bch2_data_update_opts_normalize(k, &data_opts); - - if (!data_opts.rewrite_ptrs && - !data_opts.extra_replicas && - !data_opts.scrub) { - if (data_opts.kill_ptrs) - return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); - return 0; - } - - struct moving_io *io = allocate_dropping_locks(trans, ret, - kzalloc(sizeof(struct moving_io), _gfp)); - if (!io) - goto err; - - if (ret) - goto err_free; - - INIT_LIST_HEAD(&io->io_list); - io->write.ctxt = ctxt; - io->read_sectors = k.k->size; - io->write_sectors = k.k->size; - - if (!data_opts.scrub) { - ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, - &io_opts, data_opts, iter->btree_id, k); - if (ret) - goto err_free; - - io->write.op.end_io = move_write_done; - } else { - bch2_bkey_buf_init(&io->write.k); - bch2_bkey_buf_reassemble(&io->write.k, c, k); - - io->write.op.c = c; - io->write.data_opts = data_opts; - - bch2_trans_unlock(trans); - - ret = bch2_data_update_bios_init(&io->write, c, &io_opts); - if (ret) - goto err_free; - } - - io->write.rbio.bio.bi_end_io = move_read_endio; - io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, k.k->size); - - if (ctxt->stats) { - atomic64_inc(&ctxt->stats->keys_moved); - atomic64_add(k.k->size, &ctxt->stats->sectors_moved); - } - - if (bucket_in_flight) { - io->b = bucket_in_flight; - atomic_inc(&io->b->count); - } - - if (trace_io_move_read_enabled()) - trace_io_move_read2(c, k); - - mutex_lock(&ctxt->lock); - atomic_add(io->read_sectors, &ctxt->read_sectors); - atomic_inc(&ctxt->read_ios); - - list_add_tail(&io->read_list, &ctxt->reads); - list_add_tail(&io->io_list, &ctxt->ios); - mutex_unlock(&ctxt->lock); - - /* - * dropped by move_read_endio() - guards against use after free of - * ctxt 
when doing wakeup - */ - closure_get(&ctxt->cl); - __bch2_read_extent(trans, &io->write.rbio, - io->write.rbio.bio.bi_iter, - bkey_start_pos(k.k), - iter->btree_id, k, 0, - NULL, - BCH_READ_last_fragment, - data_opts.scrub ? data_opts.read_dev : -1); - return 0; -err_free: - kfree(io); -err: - if (bch2_err_matches(ret, EROFS) || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - - count_event(c, io_move_start_fail); - - if (trace_io_move_start_fail_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, ": "); - prt_str(&buf, bch2_err_str(ret)); - trace_io_move_start_fail(c, buf.buf); - printbuf_exit(&buf); - } - - if (bch2_err_matches(ret, BCH_ERR_data_update_done)) - return 0; - return ret; -} - -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, - struct per_snapshot_io_opts *io_opts, - struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ - struct btree_iter *extent_iter, - struct bkey_s_c extent_k) -{ - struct bch_fs *c = trans->c; - u32 restart_count = trans->restart_count; - struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; - int ret = 0; - - if (extent_iter->min_depth) - return opts_ret; - - if (extent_k.k->type == KEY_TYPE_reflink_v) - goto out; - - if (io_opts->cur_inum != extent_pos.inode) { - io_opts->d.nr = 0; - - ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode), - BTREE_ITER_all_snapshots, k, ({ - if (k.k->p.offset != extent_pos.inode) - break; - - if (!bkey_is_inode(k.k)) - continue; - - struct bch_inode_unpacked inode; - _ret3 = bch2_inode_unpack(k, &inode); - if (_ret3) - break; - - struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; - bch2_inode_opts_get(&e.io_opts, trans->c, &inode); - - darray_push(&io_opts->d, e); - })); - io_opts->cur_inum = extent_pos.inode; - } - - ret = ret ?: trans_was_restarted(trans, restart_count); - if (ret) - return ERR_PTR(ret); - - if (extent_k.k->p.snapshot) - darray_for_each(io_opts->d, i) - if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) { - opts_ret = &i->io_opts; - break; - } -out: - ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); - if (ret) - return ERR_PTR(ret); - return opts_ret; -} - -int bch2_move_get_io_opts_one(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct btree_iter *extent_iter, - struct bkey_s_c extent_k) -{ - struct bch_fs *c = trans->c; - - *io_opts = bch2_opts_to_inode_opts(c->opts); - - /* reflink btree? */ - if (!extent_k.k->p.inode) - goto out; - - struct btree_iter inode_iter; - struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, - SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), - BTREE_ITER_cached); - int ret = bkey_err(inode_k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - - if (!ret && bkey_is_inode(inode_k.k)) { - struct bch_inode_unpacked inode; - bch2_inode_unpack(inode_k, &inode); - bch2_inode_opts_get(io_opts, c, &inode); - } - bch2_trans_iter_exit(trans, &inode_iter); - /* seem to be spinning here? 
*/ -out: - return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); -} - -int bch2_move_ratelimit(struct moving_context *ctxt) -{ - struct bch_fs *c = ctxt->trans->c; - bool is_kthread = current->flags & PF_KTHREAD; - u64 delay; - - if (ctxt->wait_on_copygc && c->copygc_running) { - bch2_moving_ctxt_flush_all(ctxt); - wait_event_killable(c->copygc_running_wq, - !c->copygc_running || - (is_kthread && kthread_should_stop())); - } - - do { - delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; - - if (is_kthread && kthread_should_stop()) - return 1; - - if (delay) - move_ctxt_wait_event_timeout(ctxt, - freezing(current) || - (is_kthread && kthread_should_stop()), - delay); - - if (unlikely(freezing(current))) { - bch2_moving_ctxt_flush_all(ctxt); - try_to_freeze(); - } - } while (delay); - - /* - * XXX: these limits really ought to be per device, SSDs and hard drives - * will want different limits - */ - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && - atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && - atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && - atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight); - - return 0; -} - -/* - * Move requires non extents iterators, and there's also no need for it to - * signal indirect_extent_missing_error: - */ -static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_reflink_p p) -{ - if (unlikely(REFLINK_P_ERROR(p.v))) - return bkey_s_c_null; - - struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v)); - - bch2_trans_iter_init(trans, iter, - BTREE_ID_reflink, reflink_pos, - BTREE_ITER_not_extents); - - struct bkey_s_c k = bch2_btree_iter_peek(trans, iter); - if (!k.k || bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); - return k; - } - - if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) { - bch2_trans_iter_exit(trans, iter); - return bkey_s_c_null; - } - - return k; -} - -int bch2_move_data_btree(struct moving_context *ctxt, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - enum btree_id btree_id, unsigned level) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct per_snapshot_io_opts snapshot_io_opts; - struct bch_io_opts *io_opts; - struct bkey_buf sk; - struct btree_iter iter, reflink_iter = {}; - struct bkey_s_c k; - struct data_update_opts data_opts; - /* - * If we're moving a single file, also process reflinked data it points - * to (this includes propagating changed io_opts from the inode to the - * extent): - */ - bool walk_indirect = start.inode == end.inode; - int ret = 0, ret2; - - per_snapshot_io_opts_init(&snapshot_io_opts, c); - bch2_bkey_buf_init(&sk); - - if (ctxt->stats) { - ctxt->stats->data_type = BCH_DATA_user; - ctxt->stats->pos = BBPOS(btree_id, start); - } - -retry_root: - bch2_trans_begin(trans); - - if (level == bch2_btree_id_root(c, btree_id)->level + 1) { - bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1, - BTREE_ITER_prefetch| - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto root_err; - - if (b != btree_node_root(c, b)) { - bch2_trans_iter_exit(trans, &iter); - goto retry_root; - } - - k = bkey_i_to_s_c(&b->key); - - io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, - iter.pos, &iter, k); - ret = PTR_ERR_OR_ZERO(io_opts); - if (ret) - 
goto root_err; - - memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts)) - goto out; - - - if (!data_opts.scrub) - ret = bch2_btree_node_rewrite_pos(trans, btree_id, level, - k.k->p, data_opts.target, 0); - else - ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); - -root_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - bch2_trans_iter_exit(trans, &iter); - goto retry_root; - } - - goto out; - } - - bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level, - BTREE_ITER_prefetch| - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); - - if (ctxt->rate) - bch2_ratelimit_reset(ctxt->rate); - - while (!bch2_move_ratelimit(ctxt)) { - struct btree_iter *extent_iter = &iter; - - bch2_trans_begin(trans); - - k = bch2_btree_iter_peek(trans, &iter); - if (!k.k) - break; - - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - if (bkey_gt(bkey_start_pos(k.k), end)) - break; - - if (ctxt->stats) - ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); - - if (walk_indirect && - k.k->type == KEY_TYPE_reflink_p && - REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - - bch2_trans_iter_exit(trans, &reflink_iter); - k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - if (!k.k) - goto next_nondata; - - /* - * XXX: reflink pointers may point to multiple indirect - * extents, so don't advance past the entire reflink - * pointer - need to fixup iter->k - */ - extent_iter = &reflink_iter; - } - - if (!bkey_extent_is_direct_data(k.k)) - goto next_nondata; - - io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, - iter.pos, extent_iter, k); - ret = PTR_ERR_OR_ZERO(io_opts); - if (ret) - continue; - - memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts)) - goto next; - - /* - * The iterator gets unlocked by __bch2_read_extent - need to - * save a copy of @k elsewhere: - */ - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - if (!level) - ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); - else if (!data_opts.scrub) - ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level, - k.k->p, data_opts.target, 0); - else - ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); - - if (ret2) { - if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) - continue; - - if (bch2_err_matches(ret2, ENOMEM)) { - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); - continue; - } - - /* XXX signal failure */ - goto next; - } -next: - if (ctxt->stats) - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); -next_nondata: - if (!bch2_btree_iter_advance(trans, &iter)) - break; - } -out: - bch2_trans_iter_exit(trans, &reflink_iter); - bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&sk, c); - per_snapshot_io_opts_exit(&snapshot_io_opts); - - return ret; -} - -int __bch2_move_data(struct moving_context *ctxt, - struct bbpos start, - struct bbpos end, - move_pred_fn pred, void *arg) -{ - struct bch_fs *c = ctxt->trans->c; - enum btree_id id; - int ret = 0; - - for (id = start.btree; - id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); - id++) { - ctxt->stats->pos = BBPOS(id, POS_MIN); - - if 
(!btree_type_has_ptrs(id) || - !bch2_btree_id_root(c, id)->b) - continue; - - ret = bch2_move_data_btree(ctxt, - id == start.btree ? start.pos : POS_MIN, - id == end.btree ? end.pos : POS_MAX, - pred, arg, id, 0); - if (ret) - break; - } - - return ret; -} - -int bch2_move_data(struct bch_fs *c, - struct bbpos start, - struct bbpos end, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) -{ - struct moving_context ctxt; - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - int ret = __bch2_move_data(&ctxt, start, end, pred, arg); - bch2_moving_ctxt_exit(&ctxt); - - return ret; -} - -static int __bch2_move_data_phys(struct moving_context *ctxt, - struct move_bucket *bucket_in_flight, - unsigned dev, - u64 bucket_start, - u64 bucket_end, - unsigned data_types, - bool copygc, - move_pred_fn pred, void *arg) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - bool is_kthread = current->flags & PF_KTHREAD; - struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct btree_iter iter = {}, bp_iter = {}; - struct bkey_buf sk; - struct bkey_s_c k; - struct bkey_buf last_flushed; - u64 check_mismatch_done = bucket_start; - int ret = 0; - - struct bch_dev *ca = bch2_dev_tryget(c, dev); - if (!ca) - return 0; - - bucket_end = min(bucket_end, ca->mi.nbuckets); - - struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); - struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - bch2_bkey_buf_init(&sk); - - /* - * We're not run in a context that handles transaction restarts: - */ - bch2_trans_begin(trans); - - bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); - - ret = bch2_btree_write_buffer_tryflush(trans); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "flushing btree write buffer"); - if (ret) - goto err; - - while (!(ret = bch2_move_ratelimit(ctxt))) { - if (is_kthread && kthread_should_stop()) - break; - - bch2_trans_begin(trans); - - k = bch2_btree_iter_peek(trans, &bp_iter); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - - if (!k.k || bkey_gt(k.k->p, bp_end)) - break; - - if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { - while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { - bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, - copygc, &last_flushed); - } - continue; - } - - if (k.k->type != KEY_TYPE_backpointer) - goto next; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - - if (ctxt->stats) - ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; - - if (!(data_types & BIT(bp.v->data_type))) - goto next; - - if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes) - goto next; - - k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - if (!k.k) - goto next; - - if (!bp.v->level) { - ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); - if (ret) { - bch2_trans_iter_exit(trans, &iter); - continue; - } - } - - struct data_update_opts data_opts = {}; - bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts); - - if (trace_io_move_pred_enabled()) - trace_io_move_pred2(c, k, &io_opts, &data_opts, - pred, arg, p); 
- - if (!p) { - bch2_trans_iter_exit(trans, &iter); - goto next; - } - - if (data_opts.scrub && - !bch2_dev_idx_is_online(c, data_opts.read_dev)) { - bch2_trans_iter_exit(trans, &iter); - ret = bch_err_throw(c, device_offline); - break; - } - - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - /* move_extent will drop locks */ - unsigned sectors = bp.v->bucket_len; - - if (!bp.v->level) - ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); - else if (!data_opts.scrub) - ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, - k.k->p, data_opts.target, 0); - else - ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); - - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret == -ENOMEM) { - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); - continue; - } - if (ret) - goto err; - - if (ctxt->stats) - atomic64_add(sectors, &ctxt->stats->sectors_seen); -next: - bch2_btree_iter_advance(trans, &bp_iter); - } - - while (check_mismatch_done < bucket_end) - bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, - copygc, &last_flushed); -err: - bch2_trans_iter_exit(trans, &bp_iter); - bch2_bkey_buf_exit(&sk, c); - bch2_bkey_buf_exit(&last_flushed, c); - bch2_dev_put(ca); - return ret; -} - -int bch2_move_data_phys(struct bch_fs *c, - unsigned dev, - u64 start, - u64 end, - unsigned data_types, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) -{ - struct moving_context ctxt; - - bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - if (ctxt.stats) { - ctxt.stats->phys = true; - ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; - } - - int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, - data_types, false, pred, arg); - bch2_moving_ctxt_exit(&ctxt); - - return ret; -} - -static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct evacuate_bucket_arg *arg = _arg; - - *data_opts = arg->data_opts; - - unsigned i = 0; - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (ptr->dev == arg->bucket.inode && - (arg->gen < 0 || arg->gen == ptr->gen) && - !ptr->cached) - data_opts->rewrite_ptrs |= BIT(i); - i++; - } - - return data_opts->rewrite_ptrs != 0; -} - -int bch2_evacuate_bucket(struct moving_context *ctxt, - struct move_bucket *bucket_in_flight, - struct bpos bucket, int gen, - struct data_update_opts data_opts) -{ - struct bch_fs *c = ctxt->trans->c; - struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; - - count_event(c, io_move_evacuate_bucket); - if (trace_io_move_evacuate_bucket_enabled()) - trace_io_move_evacuate_bucket2(c, bucket, gen); - - return __bch2_move_data_phys(ctxt, bucket_in_flight, - bucket.inode, - bucket.offset, - bucket.offset + 1, - ~0, - true, - evacuate_bucket_pred, &arg); -} - -typedef bool (*move_btree_pred)(struct bch_fs *, void *, - struct btree *, struct bch_io_opts *, - struct data_update_opts *); - -static int bch2_move_btree(struct bch_fs *c, - struct bbpos start, - struct bbpos end, - move_btree_pred pred, void *arg, - struct bch_move_stats *stats) -{ - bool kthread = (current->flags & PF_KTHREAD) != 0; - struct 
bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct moving_context ctxt; - struct btree_trans *trans; - struct btree_iter iter; - struct btree *b; - enum btree_id btree; - struct data_update_opts data_opts; - int ret = 0; - - bch2_moving_ctxt_init(&ctxt, c, NULL, stats, - writepoint_ptr(&c->btree_write_point), - true); - trans = ctxt.trans; - - stats->data_type = BCH_DATA_btree; - - for (btree = start.btree; - btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); - btree ++) { - stats->pos = BBPOS(btree, POS_MIN); - - if (!bch2_btree_id_root(c, btree)->b) - continue; - - bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0, - BTREE_ITER_prefetch); -retry: - ret = 0; - while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(trans, &iter)) && - !(ret = PTR_ERR_OR_ZERO(b))) { - if (kthread && kthread_should_stop()) - break; - - if ((cmp_int(btree, end.btree) ?: - bpos_cmp(b->key.k.p, end.pos)) > 0) - break; - - stats->pos = BBPOS(iter.btree_id, iter.pos); - - if (!pred(c, arg, b, &io_opts, &data_opts)) - goto next; - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; -next: - bch2_btree_iter_next_node(trans, &iter); - } - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_iter_exit(trans, &iter); - - if (kthread && kthread_should_stop()) - break; - } - - bch_err_fn(c, ret); - bch2_moving_ctxt_exit(&ctxt); - bch2_btree_interior_updates_flush(c); - - return ret; -} - -static bool rereplicate_pred(struct bch_fs *c, void *arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - unsigned nr_good = bch2_bkey_durability(c, k); - unsigned replicas = bkey_is_btree_ptr(k.k) - ? c->opts.metadata_replicas - : io_opts->data_replicas; - - guard(rcu)(); - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i = 0; - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ptr->cached && - (!ca || !ca->mi.durability)) - data_opts->kill_ptrs |= BIT(i); - i++; - } - - if (!data_opts->kill_ptrs && - (!nr_good || nr_good >= replicas)) - return false; - - data_opts->target = 0; - data_opts->extra_replicas = replicas - nr_good; - data_opts->btree_insert_flags = 0; - return true; -} - -static bool migrate_pred(struct bch_fs *c, void *arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_ioctl_data *op = arg; - unsigned i = 0; - - data_opts->rewrite_ptrs = 0; - data_opts->target = 0; - data_opts->extra_replicas = 0; - data_opts->btree_insert_flags = 0; - - bkey_for_each_ptr(ptrs, ptr) { - if (ptr->dev == op->migrate.dev) - data_opts->rewrite_ptrs |= 1U << i; - i++; - } - - return data_opts->rewrite_ptrs != 0; -} - -static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts); -} - -/* - * Ancient versions of bcachefs produced packed formats which could represent - * keys that the in memory format cannot represent; this checks for those - * formats so we can get rid of them. 
- */ -static bool bformat_needs_redo(struct bkey_format *f) -{ - for (unsigned i = 0; i < f->nr_fields; i++) - if (bch2_bkey_format_field_overflows(f, i)) - return true; - - return false; -} - -static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - if (b->version_ondisk != c->sb.version || - btree_node_need_rewrite(b) || - bformat_needs_redo(&b->format)) { - data_opts->target = 0; - data_opts->extra_replicas = 0; - data_opts->btree_insert_flags = 0; - return true; - } - - return false; -} - -int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) -{ - int ret; - - ret = bch2_move_btree(c, - BBPOS_MIN, - BBPOS_MAX, - rewrite_old_nodes_pred, c, stats); - if (!ret) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); - c->disk_sb.sb->version_min = c->disk_sb.sb->version; - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - - bch_err_fn(c, ret); - return ret; -} - -static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - unsigned durability = bch2_bkey_durability(c, k); - unsigned replicas = bkey_is_btree_ptr(k.k) - ? c->opts.metadata_replicas - : io_opts->data_replicas; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned i = 0; - - guard(rcu)(); - bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { - unsigned d = bch2_extent_ptr_durability(c, &p); - - if (d && durability - d >= replicas) { - data_opts->kill_ptrs |= BIT(i); - durability -= d; - } - - i++; - } - - return data_opts->kill_ptrs != 0; -} - -static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), - io_opts, data_opts); -} - -static bool scrub_pred(struct bch_fs *c, void *_arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct bch_ioctl_data *arg = _arg; - - if (k.k->type != KEY_TYPE_btree_ptr_v2) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == arg->migrate.dev) { - if (!p.crc.csum_type) - return false; - break; - } - } - - data_opts->scrub = true; - data_opts->read_dev = arg->migrate.dev; - return true; -} - -int bch2_data_job(struct bch_fs *c, - struct bch_move_stats *stats, - struct bch_ioctl_data op) -{ - struct bbpos start = BBPOS(op.start_btree, op.start_pos); - struct bbpos end = BBPOS(op.end_btree, op.end_pos); - int ret = 0; - - if (op.op >= BCH_DATA_OP_NR) - return -EINVAL; - - bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); - - switch (op.op) { - case BCH_DATA_OP_scrub: - /* - * prevent tests from spuriously failing, make sure we see all - * btree nodes that need to be repaired - */ - bch2_btree_interior_updates_flush(c); - - ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX, - op.scrub.data_types, - NULL, - stats, - writepoint_hashed((unsigned long) current), - false, - scrub_pred, &op) ?: ret; - break; - - case BCH_DATA_OP_rereplicate: - stats->data_type = BCH_DATA_journal; 
- ret = bch2_journal_flush_device_pins(&c->journal, -1); - ret = bch2_move_btree(c, start, end, - rereplicate_btree_pred, c, stats) ?: ret; - ret = bch2_move_data(c, start, end, - NULL, - stats, - writepoint_hashed((unsigned long) current), - true, - rereplicate_pred, c) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - break; - case BCH_DATA_OP_migrate: - if (op.migrate.dev >= c->sb.nr_devices) - return -EINVAL; - - stats->data_type = BCH_DATA_journal; - ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX, - ~0, - NULL, - stats, - writepoint_hashed((unsigned long) current), - true, - migrate_pred, &op) ?: ret; - bch2_btree_interior_updates_flush(c); - ret = bch2_replicas_gc2(c) ?: ret; - break; - case BCH_DATA_OP_rewrite_old_nodes: - ret = bch2_scan_old_btree_nodes(c, stats); - break; - case BCH_DATA_OP_drop_extra_replicas: - ret = bch2_move_btree(c, start, end, - drop_extra_replicas_btree_pred, c, stats) ?: ret; - ret = bch2_move_data(c, start, end, NULL, stats, - writepoint_hashed((unsigned long) current), - true, - drop_extra_replicas_pred, c) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - break; - default: - ret = -EINVAL; - } - - bch2_move_stats_exit(stats, c); - return ret; -} - -void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) -{ - prt_printf(out, "%s: data type==", stats->name); - bch2_prt_data_type(out, stats->data_type); - prt_str(out, " pos="); - bch2_bbpos_to_text(out, stats->pos); - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved)); - prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced)); - prt_printf(out, "bytes seen:\t"); - prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); - prt_newline(out); - - prt_printf(out, "bytes moved:\t"); - prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); - prt_newline(out); - - prt_printf(out, "bytes raced:\t"); - prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); - prt_newline(out); - - printbuf_indent_sub(out, 2); -} - -static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - bch2_move_stats_to_text(out, ctxt->stats); - printbuf_indent_add(out, 2); - - prt_printf(out, "reads: ios %u/%u sectors %u/%u\n", - atomic_read(&ctxt->read_ios), - c->opts.move_ios_in_flight, - atomic_read(&ctxt->read_sectors), - c->opts.move_bytes_in_flight >> 9); - - prt_printf(out, "writes: ios %u/%u sectors %u/%u\n", - atomic_read(&ctxt->write_ios), - c->opts.move_ios_in_flight, - atomic_read(&ctxt->write_sectors), - c->opts.move_bytes_in_flight >> 9); - - printbuf_indent_add(out, 2); - - mutex_lock(&ctxt->lock); - struct moving_io *io; - list_for_each_entry(io, &ctxt->ios, io_list) - bch2_data_update_inflight_to_text(out, &io->write); - mutex_unlock(&ctxt->lock); - - printbuf_indent_sub(out, 4); -} - -void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct moving_context *ctxt; - - mutex_lock(&c->moving_context_lock); - list_for_each_entry(ctxt, &c->moving_context_list, list) - bch2_moving_ctxt_to_text(out, c, ctxt); - mutex_unlock(&c->moving_context_lock); -} - -void bch2_fs_move_init(struct bch_fs *c) -{ - INIT_LIST_HEAD(&c->moving_context_list); - mutex_init(&c->moving_context_lock); -} diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h deleted file mode 100644 
index 86b80499ac55..000000000000
--- a/fs/bcachefs/move.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVE_H
-#define _BCACHEFS_MOVE_H
-
-#include "bbpos.h"
-#include "bcachefs_ioctl.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "data_update.h"
-#include "move_types.h"
-
-struct bch_read_bio;
-
-struct moving_context {
-	struct btree_trans *trans;
-	struct list_head list;
-	void *fn;
-
-	struct bch_ratelimit *rate;
-	struct bch_move_stats *stats;
-	struct write_point_specifier wp;
-	bool wait_on_copygc;
-	bool write_error;
-
-	/* For waiting on outstanding reads and writes: */
-	struct closure cl;
-
-	struct mutex lock;
-	struct list_head reads;
-	struct list_head ios;
-
-	/* in flight sectors: */
-	atomic_t read_sectors;
-	atomic_t write_sectors;
-	atomic_t read_ios;
-	atomic_t write_ios;
-
-	wait_queue_head_t wait;
-};
-
-#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout)		\
-({									\
-	int _ret = 0;							\
-	while (true) {							\
-		bool cond_finished = false;				\
-		bch2_moving_ctxt_do_pending_writes(_ctxt);		\
-									\
-		if (_cond)						\
-			break;						\
-		bch2_trans_unlock_long((_ctxt)->trans);			\
-		_ret = __wait_event_timeout((_ctxt)->wait,		\
-			   bch2_moving_ctxt_next_pending_write(_ctxt) ||\
-			   (cond_finished = (_cond)), _timeout);	\
-		if (_ret || ( cond_finished))				\
-			break;						\
-	}								\
-	_ret;								\
-})
-
-#define move_ctxt_wait_event(_ctxt, _cond)				\
-do {									\
-	bool cond_finished = false;					\
-	bch2_moving_ctxt_do_pending_writes(_ctxt);			\
-									\
-	if (_cond)							\
-		break;							\
-	bch2_trans_unlock_long((_ctxt)->trans);				\
-	__wait_event((_ctxt)->wait,					\
-		     bch2_moving_ctxt_next_pending_write(_ctxt) ||	\
-		     (cond_finished = (_cond)));			\
-	if (cond_finished)						\
-		break;							\
-} while (1)
-
-typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c,
-			     struct bch_io_opts *, struct data_update_opts *);
-
-extern const char * const bch2_data_ops_strs[];
-
-void bch2_moving_ctxt_exit(struct moving_context *);
-void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
-			   struct bch_ratelimit *, struct bch_move_stats *,
-			   struct write_point_specifier, bool);
-struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
-void bch2_moving_ctxt_flush_all(struct moving_context *);
-void bch2_move_ctxt_wait_for_io(struct moving_context *);
-int bch2_move_ratelimit(struct moving_context *);
-
-/* Inodes in different snapshots may have different IO options: */
-struct snapshot_io_opts_entry {
-	u32 snapshot;
-	struct bch_io_opts io_opts;
-};
-
-struct per_snapshot_io_opts {
-	u64 cur_inum;
-	struct bch_io_opts fs_io_opts;
-	DARRAY(struct snapshot_io_opts_entry) d;
-};
-
-static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
-{
-	memset(io_opts, 0, sizeof(*io_opts));
-	io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
-}
-
-static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
-{
-	darray_exit(&io_opts->d);
-}
-
-int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *,
-			      struct btree_iter *, struct bkey_s_c);
-
-int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
-
-int bch2_move_extent(struct moving_context *,
-		     struct move_bucket *,
-		     struct btree_iter *,
-		     struct bkey_s_c,
-		     struct bch_io_opts,
-		     struct data_update_opts);
-
-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
-			struct per_snapshot_io_opts *, struct bpos,
-			struct btree_iter *, struct bkey_s_c);
-
-int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos,
-			 move_pred_fn, void *, enum btree_id, unsigned);
-int __bch2_move_data(struct moving_context *,
-		     struct bbpos,
-		     struct bbpos,
-		     move_pred_fn, void *);
-int bch2_move_data(struct bch_fs *,
-		   struct bbpos start,
-		   struct bbpos end,
-		   struct bch_ratelimit *,
-		   struct bch_move_stats *,
-		   struct write_point_specifier,
-		   bool,
-		   move_pred_fn, void *);
-
-int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned,
-			struct bch_ratelimit *, struct bch_move_stats *,
-			struct write_point_specifier, bool,
-			move_pred_fn, void *);
-
-int bch2_evacuate_bucket(struct moving_context *,
-			 struct move_bucket *,
-			 struct bpos, int,
-			 struct data_update_opts);
-int bch2_data_job(struct bch_fs *,
-		  struct bch_move_stats *,
-		  struct bch_ioctl_data);
-
-void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
-void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
-void bch2_move_stats_init(struct bch_move_stats *, const char *);
-
-void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_move_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_MOVE_H */
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
deleted file mode 100644
index c5c62cd600de..000000000000
--- a/fs/bcachefs/move_types.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVE_TYPES_H
-#define _BCACHEFS_MOVE_TYPES_H
-
-#include "bbpos_types.h"
-#include "bcachefs_ioctl.h"
-
-struct bch_move_stats {
-	char name[32];
-	bool phys;
-	enum bch_ioctl_data_event_ret ret;
-
-	union {
-	struct {
-		enum bch_data_type data_type;
-		struct bbpos pos;
-	};
-	struct {
-		unsigned dev;
-		u64 offset;
-	};
-	};
-
-	atomic64_t keys_moved;
-	atomic64_t keys_raced;
-	atomic64_t sectors_seen;
-	atomic64_t sectors_moved;
-	atomic64_t sectors_raced;
-	atomic64_t sectors_error_corrected;
-	atomic64_t sectors_error_uncorrected;
-};
-
-struct move_bucket_key {
-	struct bpos bucket;
-	unsigned gen;
-};
-
-struct move_bucket {
-	struct move_bucket *next;
-	struct rhash_head hash;
-	struct move_bucket_key k;
-	unsigned sectors;
-	atomic_t count;
-};
-
-#endif /* _BCACHEFS_MOVE_TYPES_H */
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
deleted file mode 100644
index 5e6de91a8763..000000000000
--- a/fs/bcachefs/movinggc.c
+++ /dev/null
@@ -1,476 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Moving/copying garbage collector
- *
- * Copyright 2012 Google, Inc.
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "clock.h" -#include "errcode.h" -#include "error.h" -#include "lru.h" -#include "move.h" -#include "movinggc.h" -#include "trace.h" - -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/math64.h> -#include <linux/sched/task.h> -#include <linux/wait.h> - -struct buckets_in_flight { - struct rhashtable *table; - struct move_bucket *first; - struct move_bucket *last; - size_t nr; - size_t sectors; - - DARRAY(struct move_bucket *) to_evacuate; -}; - -static const struct rhashtable_params bch_move_bucket_params = { - .head_offset = offsetof(struct move_bucket, hash), - .key_offset = offsetof(struct move_bucket, k), - .key_len = sizeof(struct move_bucket_key), - .automatic_shrinking = true, -}; - -static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b) -{ - if (!list->first) - list->first = b; - else - list->last->next = b; - - list->last = b; - list->nr++; - list->sectors += b->sectors; -} - -static int bch2_bucket_is_movable(struct btree_trans *trans, - struct move_bucket *b, u64 time) -{ - struct bch_fs *c = trans->c; - - if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset)) - return 0; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - b->k.bucket, BTREE_ITER_cached); - int ret = bkey_err(k); - if (ret) - return ret; - - struct bch_dev *ca = bch2_dev_bucket_tryget(c, k.k->p); - if (!ca) - goto out; - - if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset)) - goto out; - - if (ca->mi.state != BCH_MEMBER_STATE_rw || - !bch2_dev_is_online(ca)) - goto out; - - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - b->k.gen = a->gen; - b->sectors = bch2_bucket_sectors_dirty(*a); - u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); - - ret = lru_idx && lru_idx <= time; -out: - bch2_dev_put(ca); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static void move_bucket_free(struct buckets_in_flight *list, - struct move_bucket *b) -{ - int ret = rhashtable_remove_fast(list->table, &b->hash, - bch_move_bucket_params); - BUG_ON(ret); - kfree(b); -} - -static void move_buckets_wait(struct moving_context *ctxt, - struct buckets_in_flight *list, - bool flush) -{ - struct move_bucket *i; - - while ((i = list->first)) { - if (flush) - move_ctxt_wait_event(ctxt, !atomic_read(&i->count)); - - if (atomic_read(&i->count)) - break; - - list->first = i->next; - if (!list->first) - list->last = NULL; - - list->nr--; - list->sectors -= i->sectors; - - move_bucket_free(list, i); - } - - bch2_trans_unlock_long(ctxt->trans); -} - -static bool bucket_in_flight(struct buckets_in_flight *list, - struct move_bucket_key k) -{ - return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params); -} - -static int bch2_copygc_get_buckets(struct moving_context *ctxt, - struct buckets_in_flight *buckets_in_flight) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); - size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; - int ret; - - move_buckets_wait(ctxt, buckets_in_flight, false); - - ret = bch2_btree_write_buffer_tryflush(trans); - if (bch2_err_matches(ret, EROFS)) - return ret; - - if 
(bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) - return ret; - - ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, - lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), - lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), - 0, k, ({ - struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; - int ret2 = 0; - - saw++; - - ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)); - if (ret2 < 0) - goto err; - - if (!ret2) - not_movable++; - else if (bucket_in_flight(buckets_in_flight, b.k)) - in_flight++; - else { - struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL); - ret2 = b_i ? 0 : -ENOMEM; - if (ret2) - goto err; - - *b_i = b; - - ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i); - if (ret2) { - kfree(b_i); - goto err; - } - - ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash, - bch_move_bucket_params); - BUG_ON(ret2); - - sectors += b.sectors; - } - - ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get; -err: - ret2; - })); - - pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", - buckets_in_flight->nr, buckets_in_flight->sectors, - saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret); - - return ret < 0 ? ret : 0; -} - -noinline -static int bch2_copygc(struct moving_context *ctxt, - struct buckets_in_flight *buckets_in_flight, - bool *did_work) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct data_update_opts data_opts = { - .btree_insert_flags = BCH_WATERMARK_copygc, - }; - u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); - u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); - int ret = 0; - - ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight); - if (ret) - goto err; - - darray_for_each(buckets_in_flight->to_evacuate, i) { - if (kthread_should_stop() || freezing(current)) - break; - - struct move_bucket *b = *i; - *i = NULL; - - move_bucket_in_flight_add(buckets_in_flight, b); - - ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts); - if (ret) - goto err; - - *did_work = true; - } -err: - /* no entries in LRU btree found, or got to end: */ - if (bch2_err_matches(ret, ENOENT)) - ret = 0; - - if (ret < 0 && !bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "from bch2_move_data()"); - - sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; - sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; - trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved); - - darray_for_each(buckets_in_flight->to_evacuate, i) - if (*i) - move_bucket_free(buckets_in_flight, *i); - darray_exit(&buckets_in_flight->to_evacuate); - return ret; -} - -static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca) -{ - struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); - struct bch_dev_usage usage; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - usage.buckets[i] = usage_full.d[i].buckets; - - s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * - ca->mi.bucket_size) >> 1); - s64 fragmented = 0; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - if (data_type_movable(i)) - fragmented += usage_full.d[i].fragmented; - - return max(0LL, fragmented_allowed - fragmented); -} - -/* - * Copygc runs when the amount of fragmented data is above some arbitrary - * threshold: - * - * The threshold at the limit - when 
the device is full - is the amount of space - * we reserved in bch2_recalc_capacity; we can't have more than that amount of - * disk space stranded due to fragmentation and store everything we have - * promised to store. - * - * But we don't want to be running copygc unnecessarily when the device still - * has plenty of free space - rather, we want copygc to smoothly run every so - * often and continually reduce the amount of fragmented space as the device - * fills up. So, we increase the threshold by half the current free space. - */ -u64 bch2_copygc_wait_amount(struct bch_fs *c) -{ - u64 wait = U64_MAX; - - guard(rcu)(); - for_each_rw_member_rcu(c, ca) - wait = min(wait, bch2_copygc_dev_wait_amount(ca)); - return wait; -} - -void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) -{ - printbuf_tabstop_push(out, 32); - prt_printf(out, "running:\t%u\n", c->copygc_running); - prt_printf(out, "copygc_wait:\t%llu\n", c->copygc_wait); - prt_printf(out, "copygc_wait_at:\t%llu\n", c->copygc_wait_at); - - prt_printf(out, "Currently waiting for:\t"); - prt_human_readable_u64(out, max(0LL, c->copygc_wait - - atomic64_read(&c->io_clock[WRITE].now)) << 9); - prt_newline(out); - - prt_printf(out, "Currently waiting since:\t"); - prt_human_readable_u64(out, max(0LL, - atomic64_read(&c->io_clock[WRITE].now) - - c->copygc_wait_at) << 9); - prt_newline(out); - - bch2_printbuf_make_room(out, 4096); - - struct task_struct *t; - out->atomic++; - scoped_guard(rcu) { - prt_printf(out, "Currently calculated wait:\n"); - for_each_rw_member_rcu(c, ca) { - prt_printf(out, " %s:\t", ca->name); - prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); - prt_newline(out); - } - - t = rcu_dereference(c->copygc_thread); - if (t) - get_task_struct(t); - } - --out->atomic; - - if (t) { - bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); - put_task_struct(t); - } -} - -static int bch2_copygc_thread(void *arg) -{ - struct bch_fs *c = arg; - struct moving_context ctxt; - struct bch_move_stats move_stats; - struct io_clock *clock = &c->io_clock[WRITE]; - struct buckets_in_flight buckets = {}; - u64 last, wait; - - buckets.table = kzalloc(sizeof(*buckets.table), GFP_KERNEL); - int ret = !buckets.table - ? -ENOMEM - : rhashtable_init(buckets.table, &bch_move_bucket_params); - bch_err_msg(c, ret, "allocating copygc buckets in flight"); - if (ret) - goto err; - - set_freezable(); - - /* - * Data move operations can't run until after check_snapshots has - * completed, and bch2_snapshot_is_ancestor() is available. 
- */ - kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || - kthread_should_stop()); - - bch2_move_stats_init(&move_stats, "copygc"); - bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, - writepoint_ptr(&c->copygc_write_point), - false); - - while (!ret && !kthread_should_stop()) { - bool did_work = false; - - bch2_trans_unlock_long(ctxt.trans); - cond_resched(); - - if (!c->opts.copygc_enabled) { - move_buckets_wait(&ctxt, &buckets, true); - kthread_wait_freezable(c->opts.copygc_enabled || - kthread_should_stop()); - } - - if (unlikely(freezing(current))) { - move_buckets_wait(&ctxt, &buckets, true); - __refrigerator(false); - continue; - } - - last = atomic64_read(&clock->now); - wait = bch2_copygc_wait_amount(c); - - if (wait > clock->max_slop) { - c->copygc_wait_at = last; - c->copygc_wait = last + wait; - move_buckets_wait(&ctxt, &buckets, true); - trace_and_count(c, copygc_wait, c, wait, last + wait); - bch2_kthread_io_clock_wait(clock, last + wait, - MAX_SCHEDULE_TIMEOUT); - continue; - } - - c->copygc_wait = 0; - - c->copygc_running = true; - ret = bch2_copygc(&ctxt, &buckets, &did_work); - c->copygc_running = false; - - wake_up(&c->copygc_running_wq); - - if (!wait && !did_work) { - u64 min_member_capacity = bch2_min_rw_member_capacity(c); - - if (min_member_capacity == U64_MAX) - min_member_capacity = 128 * 2048; - - move_buckets_wait(&ctxt, &buckets, true); - bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), - MAX_SCHEDULE_TIMEOUT); - } - } - - move_buckets_wait(&ctxt, &buckets, true); - rhashtable_destroy(buckets.table); - bch2_moving_ctxt_exit(&ctxt); - bch2_move_stats_exit(&move_stats, c); -err: - kfree(buckets.table); - return ret; -} - -void bch2_copygc_stop(struct bch_fs *c) -{ - if (c->copygc_thread) { - kthread_stop(c->copygc_thread); - put_task_struct(c->copygc_thread); - } - c->copygc_thread = NULL; -} - -int bch2_copygc_start(struct bch_fs *c) -{ - struct task_struct *t; - int ret; - - if (c->copygc_thread) - return 0; - - if (c->opts.nochanges) - return 0; - - if (bch2_fs_init_fault("copygc_start")) - return -ENOMEM; - - t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); - ret = PTR_ERR_OR_ZERO(t); - bch_err_msg(c, ret, "creating copygc thread"); - if (ret) - return ret; - - get_task_struct(t); - - c->copygc_thread = t; - wake_up_process(c->copygc_thread); - - return 0; -} - -void bch2_fs_copygc_init(struct bch_fs *c) -{ - init_waitqueue_head(&c->copygc_running_wq); - c->copygc_running = false; -} diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h deleted file mode 100644 index f615910d6f98..000000000000 --- a/fs/bcachefs/movinggc.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_MOVINGGC_H -#define _BCACHEFS_MOVINGGC_H - -u64 bch2_copygc_wait_amount(struct bch_fs *); -void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); - -static inline void bch2_copygc_wakeup(struct bch_fs *c) -{ - guard(rcu)(); - struct task_struct *p = rcu_dereference(c->copygc_thread); - if (p) - wake_up_process(p); -} - -void bch2_copygc_stop(struct bch_fs *); -int bch2_copygc_start(struct bch_fs *); -void bch2_fs_copygc_init(struct bch_fs *); - -#endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c deleted file mode 100644 index c3f87c59922d..000000000000 --- a/fs/bcachefs/namei.c +++ /dev/null @@ -1,1034 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "acl.h" -#include "btree_update.h" 
-#include "dirent.h" -#include "inode.h" -#include "namei.h" -#include "subvolume.h" -#include "xattr.h" - -#include <linux/posix_acl.h> - -static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode) -{ - return (subvol_inum) { - .subvol = inode->bi_parent_subvol ?: inum.subvol, - .inum = inode->bi_dir, - }; -} - -static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) -{ - return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; -} - -int bch2_create_trans(struct btree_trans *trans, - subvol_inum dir, - struct bch_inode_unpacked *dir_u, - struct bch_inode_unpacked *new_inode, - const struct qstr *name, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev, - struct posix_acl *default_acl, - struct posix_acl *acl, - subvol_inum snapshot_src, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter dir_iter = {}; - struct btree_iter inode_iter = {}; - subvol_inum new_inum = dir; - u64 now = bch2_current_time(c); - u64 cpu = raw_smp_processor_id(); - u64 dir_target; - u32 snapshot; - unsigned dir_type = mode_to_type(mode); - int ret; - - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); - if (ret) - goto err; - - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, - BTREE_ITER_intent|BTREE_ITER_with_updates); - if (ret) - goto err; - - if (!(flags & BCH_CREATE_SNAPSHOT)) { - /* Normal create path - allocate a new inode: */ - bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u); - - if (flags & BCH_CREATE_TMPFILE) - new_inode->bi_flags |= BCH_INODE_unlinked; - - ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); - if (ret) - goto err; - - snapshot_src = (subvol_inum) { 0 }; - } else { - /* - * Creating a snapshot - we're not allocating a new inode, but - * we do have to lookup the root inode of the subvolume we're - * snapshotting and update it (in the new snapshot): - */ - - if (!snapshot_src.inum) { - /* Inode wasn't specified, just snapshot: */ - struct bch_subvolume s; - ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s); - if (ret) - goto err; - - snapshot_src.inum = le64_to_cpu(s.inode); - } - - ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, - BTREE_ITER_intent); - if (ret) - goto err; - - if (new_inode->bi_subvol != snapshot_src.subvol) { - /* Not a subvolume root: */ - ret = -EINVAL; - goto err; - } - - /* - * If we're not root, we have to own the subvolume being - * snapshotted: - */ - if (uid && new_inode->bi_uid != uid) { - ret = -EPERM; - goto err; - } - - flags |= BCH_CREATE_SUBVOL; - } - - new_inum.inum = new_inode->bi_inum; - dir_target = new_inode->bi_inum; - - if (flags & BCH_CREATE_SUBVOL) { - u32 new_subvol, dir_snapshot; - - ret = bch2_subvolume_create(trans, new_inode->bi_inum, - dir.subvol, - snapshot_src.subvol, - &new_subvol, &snapshot, - (flags & BCH_CREATE_SNAPSHOT_RO) != 0); - if (ret) - goto err; - - new_inode->bi_parent_subvol = dir.subvol; - new_inode->bi_subvol = new_subvol; - new_inum.subvol = new_subvol; - dir_target = new_subvol; - dir_type = DT_SUBVOL; - - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot); - ret = bch2_btree_iter_traverse(trans, &dir_iter); - if (ret) - goto err; - } - - if (!(flags & BCH_CREATE_SNAPSHOT)) { - if (default_acl) { - ret = bch2_set_acl_trans(trans, new_inum, new_inode, - default_acl, ACL_TYPE_DEFAULT); - if (ret) - goto err; - } - - if (acl) { - ret = bch2_set_acl_trans(trans, new_inum, 
new_inode, - acl, ACL_TYPE_ACCESS); - if (ret) - goto err; - } - } - - if (!(flags & BCH_CREATE_TMPFILE)) { - struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); - u64 dir_offset; - - if (is_subdir_for_nlink(new_inode)) - dir_u->bi_nlink++; - dir_u->bi_mtime = dir_u->bi_ctime = now; - - ret = bch2_dirent_create(trans, dir, &dir_hash, - dir_type, - name, - dir_target, - &dir_offset, - STR_HASH_must_create|BTREE_ITER_with_updates) ?: - bch2_inode_write(trans, &dir_iter, dir_u); - if (ret) - goto err; - - new_inode->bi_dir = dir_u->bi_inum; - new_inode->bi_dir_offset = dir_offset; - } - - if (S_ISDIR(mode)) { - ret = bch2_maybe_propagate_has_case_insensitive(trans, - (subvol_inum) { - new_inode->bi_subvol ?: dir.subvol, - new_inode->bi_inum }, - new_inode); - if (ret) - goto err; - } - - if (S_ISDIR(mode) && - !new_inode->bi_subvol) - new_inode->bi_depth = dir_u->bi_depth + 1; - - inode_iter.flags &= ~BTREE_ITER_all_snapshots; - bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot); - - ret = bch2_btree_iter_traverse(trans, &inode_iter) ?: - bch2_inode_write(trans, &inode_iter, new_inode); -err: - bch2_trans_iter_exit(trans, &inode_iter); - bch2_trans_iter_exit(trans, &dir_iter); - return ret; -} - -int bch2_link_trans(struct btree_trans *trans, - subvol_inum dir, struct bch_inode_unpacked *dir_u, - subvol_inum inum, struct bch_inode_unpacked *inode_u, - const struct qstr *name) -{ - struct bch_fs *c = trans->c; - struct btree_iter dir_iter = {}; - struct btree_iter inode_iter = {}; - struct bch_hash_info dir_hash; - u64 now = bch2_current_time(c); - u64 dir_offset = 0; - int ret; - - if (dir.subvol != inum.subvol) - return -EXDEV; - - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent); - if (ret) - return ret; - - inode_u->bi_ctime = now; - ret = bch2_inode_nlink_inc(inode_u); - if (ret) - goto err; - - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); - if (ret) - goto err; - - if (bch2_reinherit_attrs(inode_u, dir_u)) { - ret = -EXDEV; - goto err; - } - - dir_u->bi_mtime = dir_u->bi_ctime = now; - - dir_hash = bch2_hash_info_init(c, dir_u); - - ret = bch2_dirent_create(trans, dir, &dir_hash, - mode_to_type(inode_u->bi_mode), - name, inum.inum, - &dir_offset, - STR_HASH_must_create); - if (ret) - goto err; - - inode_u->bi_dir = dir.inum; - inode_u->bi_dir_offset = dir_offset; - - ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: - bch2_inode_write(trans, &inode_iter, inode_u); -err: - bch2_trans_iter_exit(trans, &dir_iter); - bch2_trans_iter_exit(trans, &inode_iter); - return ret; -} - -int bch2_unlink_trans(struct btree_trans *trans, - subvol_inum dir, - struct bch_inode_unpacked *dir_u, - struct bch_inode_unpacked *inode_u, - const struct qstr *name, - bool deleting_subvol) -{ - struct bch_fs *c = trans->c; - struct btree_iter dir_iter = {}; - struct btree_iter dirent_iter = {}; - struct btree_iter inode_iter = {}; - struct bch_hash_info dir_hash; - subvol_inum inum; - u64 now = bch2_current_time(c); - struct bkey_s_c k; - int ret; - - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); - if (ret) - goto err; - - dir_hash = bch2_hash_info_init(c, dir_u); - - ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, - name, &inum, BTREE_ITER_intent); - if (ret) - goto err; - - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, - BTREE_ITER_intent); - if (ret) - goto err; - - if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) { - ret = bch2_empty_dir_trans(trans, inum); - if (ret) - goto err; 
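
The bi_dir/bi_dir_offset updates scattered through the create, link and unlink paths above implement inode backpointers: each inode records the position of a dirent that points at it, which the fsck code later in this file cross-checks. A toy model of that bookkeeping, using simplified invented types rather than the real btree transaction machinery:

#include <stdint.h>
#include <stdio.h>

struct dirent_pos {
	uint64_t dir_inum;	/* directory the dirent lives in */
	uint64_t offset;	/* position within that directory */
};

struct toy_inode {
	uint64_t inum;
	uint32_t nlink;
	struct dirent_pos backpointer;
};

static void toy_link(struct toy_inode *inode, struct dirent_pos d)
{
	inode->nlink++;
	/* like bch2_link_trans(): point the backpointer at the new dirent */
	inode->backpointer = d;
}

static void toy_unlink(struct toy_inode *inode, struct dirent_pos d)
{
	inode->nlink--;
	/*
	 * Like bch2_unlink_trans(): only clear the backpointer if it
	 * still refers to the dirent being removed - with hardlinks it
	 * may point at a different, still-live dirent.
	 */
	if (inode->backpointer.dir_inum == d.dir_inum &&
	    inode->backpointer.offset == d.offset)
		inode->backpointer = (struct dirent_pos) { 0 };
}

int main(void)
{
	struct toy_inode ino = { .inum = 42 };
	struct dirent_pos a = { .dir_inum = 1, .offset = 100 };
	struct dirent_pos b = { .dir_inum = 1, .offset = 200 };

	toy_link(&ino, a);
	toy_link(&ino, b);	/* hardlink: backpointer now tracks b */
	toy_unlink(&ino, a);	/* backpointer untouched: it points at b */

	printf("nlink %u, backpointer %llu:%llu\n", (unsigned)ino.nlink,
	       (unsigned long long)ino.backpointer.dir_inum,
	       (unsigned long long)ino.backpointer.offset);
	return 0;
}
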
- } - - if (deleting_subvol && !inode_u->bi_subvol) { - ret = bch_err_throw(c, ENOENT_not_subvol); - goto err; - } - - if (inode_u->bi_subvol) { - /* Recursive subvolume destroy not allowed (yet?) */ - ret = bch2_subvol_has_children(trans, inode_u->bi_subvol); - if (ret) - goto err; - } - - if (deleting_subvol || inode_u->bi_subvol) { - ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); - if (ret) - goto err; - - k = bch2_btree_iter_peek_slot(trans, &dirent_iter); - ret = bkey_err(k); - if (ret) - goto err; - - /* - * If we're deleting a subvolume, we need to really delete the - * dirent, not just emit a whiteout in the current snapshot: - */ - bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot); - ret = bch2_btree_iter_traverse(trans, &dirent_iter); - if (ret) - goto err; - } else { - bch2_inode_nlink_dec(trans, inode_u); - } - - if (inode_u->bi_dir == dirent_iter.pos.inode && - inode_u->bi_dir_offset == dirent_iter.pos.offset) { - inode_u->bi_dir = 0; - inode_u->bi_dir_offset = 0; - } - - dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; - dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); - - ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash, &dirent_iter, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_inode_write(trans, &dir_iter, dir_u) ?: - bch2_inode_write(trans, &inode_iter, inode_u); -err: - bch2_trans_iter_exit(trans, &inode_iter); - bch2_trans_iter_exit(trans, &dirent_iter); - bch2_trans_iter_exit(trans, &dir_iter); - return ret; -} - -bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, - struct bch_inode_unpacked *src_u) -{ - u64 src, dst; - unsigned id; - bool ret = false; - - for (id = 0; id < Inode_opt_nr; id++) { - if (!S_ISDIR(dst_u->bi_mode) && id == Inode_opt_casefold) - continue; - - /* Skip attributes that were explicitly set on this inode */ - if (dst_u->bi_fields_set & (1 << id)) - continue; - - src = bch2_inode_opt_get(src_u, id); - dst = bch2_inode_opt_get(dst_u, id); - - if (src == dst) - continue; - - bch2_inode_opt_set(dst_u, id, src); - ret = true; - } - - return ret; -} - -static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent) -{ - struct btree_iter iter; - struct bkey_i_subvolume *s = - bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, subvol), - BTREE_ITER_cached, subvolume); - int ret = PTR_ERR_OR_ZERO(s); - if (ret) - return ret; - - s->v.fs_path_parent = cpu_to_le32(new_parent); - bch2_trans_iter_exit(trans, &iter); - return 0; -} - -int bch2_rename_trans(struct btree_trans *trans, - subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, - subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, - struct bch_inode_unpacked *src_inode_u, - struct bch_inode_unpacked *dst_inode_u, - const struct qstr *src_name, - const struct qstr *dst_name, - enum bch_rename_mode mode) -{ - struct bch_fs *c = trans->c; - struct btree_iter src_dir_iter = {}; - struct btree_iter dst_dir_iter = {}; - struct btree_iter src_inode_iter = {}; - struct btree_iter dst_inode_iter = {}; - struct bch_hash_info src_hash, dst_hash; - subvol_inum src_inum, dst_inum; - u64 src_offset, dst_offset; - u64 now = bch2_current_time(c); - int ret; - - ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, - BTREE_ITER_intent); - if (ret) - goto err; - - src_hash = bch2_hash_info_init(c, src_dir_u); - - if (!subvol_inum_eq(dst_dir, src_dir)) { - ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, - BTREE_ITER_intent); - if (ret) - goto err; - - dst_hash = 
bch2_hash_info_init(c, dst_dir_u); - } else { - dst_dir_u = src_dir_u; - dst_hash = src_hash; - } - - ret = bch2_dirent_rename(trans, - src_dir, &src_hash, - dst_dir, &dst_hash, - src_name, &src_inum, &src_offset, - dst_name, &dst_inum, &dst_offset, - mode); - if (ret) - goto err; - - ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, - BTREE_ITER_intent); - if (ret) - goto err; - - if (dst_inum.inum) { - ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, - BTREE_ITER_intent); - if (ret) - goto err; - } - - if (src_inode_u->bi_subvol && - dst_dir.subvol != src_inode_u->bi_parent_subvol) { - ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol); - if (ret) - goto err; - } - - if (mode == BCH_RENAME_EXCHANGE && - dst_inode_u->bi_subvol && - src_dir.subvol != dst_inode_u->bi_parent_subvol) { - ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol); - if (ret) - goto err; - } - - /* Can't move across subvolumes, unless it's a subvolume root: */ - if (src_dir.subvol != dst_dir.subvol && - (!src_inode_u->bi_subvol || - (dst_inum.inum && !dst_inode_u->bi_subvol))) { - ret = -EXDEV; - goto err; - } - - if (src_inode_u->bi_parent_subvol) - src_inode_u->bi_parent_subvol = dst_dir.subvol; - - if ((mode == BCH_RENAME_EXCHANGE) && - dst_inode_u->bi_parent_subvol) - dst_inode_u->bi_parent_subvol = src_dir.subvol; - - src_inode_u->bi_dir = dst_dir_u->bi_inum; - src_inode_u->bi_dir_offset = dst_offset; - - if (mode == BCH_RENAME_EXCHANGE) { - dst_inode_u->bi_dir = src_dir_u->bi_inum; - dst_inode_u->bi_dir_offset = src_offset; - } - - if (mode == BCH_RENAME_OVERWRITE && - dst_inode_u->bi_dir == dst_dir_u->bi_inum && - dst_inode_u->bi_dir_offset == src_offset) { - dst_inode_u->bi_dir = 0; - dst_inode_u->bi_dir_offset = 0; - } - - if (mode == BCH_RENAME_OVERWRITE) { - if (S_ISDIR(src_inode_u->bi_mode) != - S_ISDIR(dst_inode_u->bi_mode)) { - ret = -ENOTDIR; - goto err; - } - - if (S_ISDIR(dst_inode_u->bi_mode)) { - ret = bch2_empty_dir_trans(trans, dst_inum); - if (ret) - goto err; - } - } - - if (!subvol_inum_eq(dst_dir, src_dir)) { - if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && - S_ISDIR(src_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; - } - - if (mode == BCH_RENAME_EXCHANGE && - bch2_reinherit_attrs(dst_inode_u, src_dir_u) && - S_ISDIR(dst_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; - } - - ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?: - (mode == BCH_RENAME_EXCHANGE - ? 
bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u) - : 0); - if (ret) - goto err; - - if (is_subdir_for_nlink(src_inode_u)) { - src_dir_u->bi_nlink--; - dst_dir_u->bi_nlink++; - } - - if (S_ISDIR(src_inode_u->bi_mode) && - !src_inode_u->bi_subvol) - src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; - - if (mode == BCH_RENAME_EXCHANGE && - S_ISDIR(dst_inode_u->bi_mode) && - !dst_inode_u->bi_subvol) - dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; - } - - if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { - dst_dir_u->bi_nlink--; - src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; - } - - if (mode == BCH_RENAME_OVERWRITE) - bch2_inode_nlink_dec(trans, dst_inode_u); - - src_dir_u->bi_mtime = now; - src_dir_u->bi_ctime = now; - - if (src_dir.inum != dst_dir.inum) { - dst_dir_u->bi_mtime = now; - dst_dir_u->bi_ctime = now; - } - - src_inode_u->bi_ctime = now; - - if (dst_inum.inum) - dst_inode_u->bi_ctime = now; - - ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: - (src_dir.inum != dst_dir.inum - ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) - : 0) ?: - bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: - (dst_inum.inum - ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) - : 0); -err: - bch2_trans_iter_exit(trans, &dst_inode_iter); - bch2_trans_iter_exit(trans, &src_inode_iter); - bch2_trans_iter_exit(trans, &dst_dir_iter); - bch2_trans_iter_exit(trans, &src_dir_iter); - return ret; -} - -/* inum_to_path */ - -static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) -{ - bch2_printbuf_make_room(out, n); - - unsigned can_print = min(n, printbuf_remaining(out)); - - b += n; - - for (unsigned i = 0; i < can_print; i++) - out->buf[out->pos++] = *((char *) --b); - - printbuf_nul_terminate(out); -} - -static inline void prt_str_reversed(struct printbuf *out, const char *s) -{ - prt_bytes_reversed(out, s, strlen(s)); -} - -static inline void reverse_bytes(void *b, size_t n) -{ - char *e = b + n, *s = b; - - while (s < e) { - --e; - swap(*s, *e); - s++; - } -} - -static int __bch2_inum_to_path(struct btree_trans *trans, - u32 subvol, u64 inum, u32 snapshot, - struct printbuf *path) -{ - unsigned orig_pos = path->pos; - int ret = 0; - DARRAY(subvol_inum) inums = {}; - - if (!snapshot) { - ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); - if (ret) - goto disconnected; - } - - while (true) { - subvol_inum n = (subvol_inum) { subvol ?: snapshot, inum }; - - if (darray_find_p(inums, i, i->subvol == n.subvol && i->inum == n.inum)) { - prt_str_reversed(path, "(loop)"); - break; - } - - ret = darray_push(&inums, n); - if (ret) - goto err; - - struct bch_inode_unpacked inode; - ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); - if (ret) - goto disconnected; - - if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL && - inode.bi_inum == BCACHEFS_ROOT_INO) - break; - - if (!inode.bi_dir && !inode.bi_dir_offset) { - ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer); - goto disconnected; - } - - inum = inode.bi_dir; - if (inode.bi_parent_subvol) { - subvol = inode.bi_parent_subvol; - ret = bch2_subvolume_get_snapshot(trans, inode.bi_parent_subvol, &snapshot); - if (ret) - goto disconnected; - } - - struct btree_iter d_iter; - struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, - BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot), - 0, dirent); - ret = bkey_err(d.s_c); - if (ret) - goto disconnected; - - struct qstr dirent_name = bch2_dirent_get_name(d); - - 
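/*
 * Annotation, not original source: __bch2_inum_to_path() walks
 * leaf-to-root via bi_dir backpointers, so path components are
 * discovered in reverse order. Each name is therefore appended
 * byte-reversed with a '/' separator, and reverse_bytes() flips the
 * whole buffer once at the end to yield the forward path.
 */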
prt_bytes_reversed(path, dirent_name.name, dirent_name.len); - - prt_char(path, '/'); - - bch2_trans_iter_exit(trans, &d_iter); - } - - if (orig_pos == path->pos) - prt_char(path, '/'); -out: - ret = path->allocation_failure ? -ENOMEM : 0; - if (ret) - goto err; - - reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); - darray_exit(&inums); - return 0; -err: - darray_exit(&inums); - return ret; -disconnected: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - - prt_str_reversed(path, "(disconnected)"); - goto out; -} - -int bch2_inum_to_path(struct btree_trans *trans, - subvol_inum inum, - struct printbuf *path) -{ - return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path); -} - -int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot, - snapshot_id_list *snapshot_overwrites, - struct printbuf *path) -{ - return __bch2_inum_to_path(trans, 0, inum, snapshot, path); -} - -/* fsck */ - -static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - bool in_fsck) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter bp_iter = {}; - int ret = 0; - - if (inode_points_to_dirent(target, d)) - return 0; - - if (!bch2_inode_has_backpointer(target)) { - fsck_err_on(S_ISDIR(target->bi_mode), - trans, inode_dir_missing_backpointer, - "directory with missing backpointer\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - fsck_err_on(target->bi_flags & BCH_INODE_unlinked, - trans, inode_unlinked_but_has_dirent, - "inode unlinked but has dirent\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - target->bi_flags &= ~BCH_INODE_unlinked; - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - return __bch2_fsck_write_inode(trans, target); - } - - struct bkey_s_c_dirent bp_dirent = - bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, - SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), - 0, dirent); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - bool backpointer_exists = !ret; - ret = 0; - - if (!backpointer_exists) { - if (fsck_err(trans, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target->bi_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - ret = __bch2_fsck_write_inode(trans, target); - } - } else { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); - - if (S_ISDIR(target->bi_mode) || target->bi_subvol) { - /* - * XXX: verify connectivity of the other dirent - * up to the root before removing this one - * - * Additionally, bch2_lookup would need to cope with the - * dirent it found being removed - or should we remove - * the other one, even though the inode points to it? - */ - if (in_fsck) { - if (fsck_err(trans, inode_dir_multiple_links, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? 
"directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf)) - ret = bch2_fsck_remove_dirent(trans, d.k->p); - } else { - bch2_fs_inconsistent(c, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? "directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf); - } - - goto out; - } else { - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(!target->bi_nlink, - trans, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - ret = __bch2_fsck_write_inode(trans, target); - if (ret) - goto err; - } - } - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -int __bch2_check_dirent_target(struct btree_trans *trans, - struct btree_iter *dirent_iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - bool in_fsck) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck); - if (ret) - goto err; - - if (fsck_err_on(d.v->d_type != inode_d_type(target), - trans, dirent_d_type_wrong, - "incorrect d_type: got %s, should be %s:\n%s", - bch2_d_type_str(d.v->d_type), - bch2_d_type_str(inode_d_type(target)), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = inode_d_type(target); - if (n->v.d_type == DT_SUBVOL) { - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); - n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); - } else { - n->v.d_inum = cpu_to_le64(target->bi_inum); - } - - ret = bch2_trans_update(trans, dirent_iter, &n->k_i, - BTREE_UPDATE_internal_snapshot_node); - if (ret) - goto err; - } -err: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -/* - * BCH_INODE_has_case_insensitive: - * We have to track whether directories have any descendent directory that is - * casefolded - for overlayfs: - */ - -static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum) -{ - struct btree_iter iter = {}; - int ret = 0; - - while (true) { - struct bch_inode_unpacked inode; - ret = bch2_inode_peek(trans, &iter, &inode, inum, - BTREE_ITER_intent|BTREE_ITER_with_updates); - if (ret) - break; - - if (inode.bi_flags & BCH_INODE_has_case_insensitive) - break; - - inode.bi_flags |= BCH_INODE_has_case_insensitive; - ret = bch2_inode_write(trans, &iter, &inode); - if (ret) - break; - - bch2_trans_iter_exit(trans, &iter); - if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM)) - break; - - inum = parent_inum(inum, &inode); - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - if (!bch2_inode_casefold(trans->c, inode)) - return 0; - - inode->bi_flags |= BCH_INODE_has_case_insensitive; - - return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode)); -} - -int bch2_check_inode_has_case_insensitive(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - 
snapshot_id_list *snapshot_overwrites, - bool *do_update) -{ - struct printbuf buf = PRINTBUF; - bool repairing_parents = false; - int ret = 0; - - if (!S_ISDIR(inode->bi_mode)) { - /* - * Old versions set bi_casefold for non dirs, but that's - * unnecessary and wasteful - */ - if (inode->bi_casefold) { - inode->bi_casefold = 0; - *do_update = true; - } - return 0; - } - - if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive) - return 0; - - if (bch2_inode_casefold(trans->c, inode) && - !(inode->bi_flags & BCH_INODE_has_case_insensitive)) { - prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ", - inode->bi_inum, inode->bi_snapshot); - - ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot, - snapshot_overwrites, &buf); - if (ret) - goto err; - - if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) { - inode->bi_flags |= BCH_INODE_has_case_insensitive; - *do_update = true; - } - } - - if (!(inode->bi_flags & BCH_INODE_has_case_insensitive)) - goto out; - - struct bch_inode_unpacked dir = *inode; - u32 snapshot = dir.bi_snapshot; - - while (!(dir.bi_inum == BCACHEFS_ROOT_INO && - dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { - if (dir.bi_parent_subvol) { - ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot); - if (ret) - goto err; - - snapshot_overwrites = NULL; - } - - ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0); - if (ret) - goto err; - - if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) { - prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n"); - - ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot, - snapshot_overwrites, &buf); - if (ret) - goto err; - - if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) { - dir.bi_flags |= BCH_INODE_has_case_insensitive; - ret = __bch2_fsck_write_inode(trans, &dir); - if (ret) - goto err; - } - } - - /* - * We only need to check the first parent, unless we find an - * inconsistency - */ - if (!repairing_parents) - break; - } -out: -err: -fsck_err: - printbuf_exit(&buf); - if (ret) - return ret; - - if (repairing_parents) { - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - } - - return 0; -} diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h deleted file mode 100644 index ae6ebc2d0785..000000000000 --- a/fs/bcachefs/namei.h +++ /dev/null @@ -1,79 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_NAMEI_H -#define _BCACHEFS_NAMEI_H - -#include "dirent.h" - -struct posix_acl; - -#define BCH_CREATE_TMPFILE (1U << 0) -#define BCH_CREATE_SUBVOL (1U << 1) -#define BCH_CREATE_SNAPSHOT (1U << 2) -#define BCH_CREATE_SNAPSHOT_RO (1U << 3) - -int bch2_create_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, - struct bch_inode_unpacked *, - const struct qstr *, - uid_t, gid_t, umode_t, dev_t, - struct posix_acl *, - struct posix_acl *, - subvol_inum, unsigned); - -int bch2_link_trans(struct btree_trans *, - subvol_inum, struct bch_inode_unpacked *, - subvol_inum, struct bch_inode_unpacked *, - const struct qstr *); - -int bch2_unlink_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, - struct bch_inode_unpacked *, - const struct qstr *, bool); - -int bch2_rename_trans(struct btree_trans *, - subvol_inum, struct bch_inode_unpacked *, - subvol_inum, struct bch_inode_unpacked *, - struct bch_inode_unpacked *, - struct 
bch_inode_unpacked *, - const struct qstr *, - const struct qstr *, - enum bch_rename_mode); - -bool bch2_reinherit_attrs(struct bch_inode_unpacked *, - struct bch_inode_unpacked *); - -int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); -int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32, - snapshot_id_list *, struct printbuf *); - -int __bch2_check_dirent_target(struct btree_trans *, - struct btree_iter *, - struct bkey_s_c_dirent, - struct bch_inode_unpacked *, bool); - -static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - -static inline int bch2_check_dirent_target(struct btree_trans *trans, - struct btree_iter *dirent_iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - bool in_fsck) -{ - if (likely(inode_points_to_dirent(target, d) && - d.v->d_type == inode_d_type(target))) - return 0; - - return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); -} - -int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *); -int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *, - snapshot_id_list *, bool *); - -#endif /* _BCACHEFS_NAMEI_H */ diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c deleted file mode 100644 index 962218fa68ec..000000000000 --- a/fs/bcachefs/nocow_locking.c +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_methods.h" -#include "nocow_locking.h" -#include "util.h" - -#include <linux/closure.h> - -bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket) -{ - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - unsigned i; - - for (i = 0; i < ARRAY_SIZE(l->b); i++) - if (l->b[i] == dev_bucket && atomic_read(&l->l[i])) - return true; - return false; -} - -#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0) - -void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags) -{ - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - int lock_val = flags ? 1 : -1; - unsigned i; - - for (i = 0; i < ARRAY_SIZE(l->b); i++) - if (l->b[i] == dev_bucket) { - int v = atomic_sub_return(lock_val, &l->l[i]); - - BUG_ON(v && sign(v) != lock_val); - if (!v) - closure_wake_up(&l->wait); - return; - } - - BUG(); -} - -bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, - u64 dev_bucket, int flags) -{ - int v, lock_val = flags ? 1 : -1; - unsigned i; - - spin_lock(&l->lock); - - for (i = 0; i < ARRAY_SIZE(l->b); i++) - if (l->b[i] == dev_bucket) - goto got_entry; - - for (i = 0; i < ARRAY_SIZE(l->b); i++) - if (!atomic_read(&l->l[i])) { - l->b[i] = dev_bucket; - goto take_lock; - } -fail: - spin_unlock(&l->lock); - return false; -got_entry: - v = atomic_read(&l->l[i]); - if (lock_val > 0 ? v < 0 : v > 0) - goto fail; -take_lock: - v = atomic_read(&l->l[i]); - /* Overflow? 
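
To answer the question that comment poses: the per-slot counter is a plain signed integer, so blindly adding lock_val could walk past INT_MAX or INT_MIN and flip the sign, silently converting one lock class into the other - the trylock refuses instead. A standalone restatement of the sign-counting scheme, with invented names and no atomics or waitlists:

#include <stdbool.h>
#include <stdio.h>

#define SIGN(v) ((v) < 0 ? -1 : (v) > 0 ? 1 : 0)

/*
 * Positive counts are "update" holders, negative counts are "copy"
 * holders: same-sign holders share the lock, opposite signs conflict.
 */
static bool toy_trylock(int *count, bool for_update)
{
	int lock_val = for_update ? 1 : -1;
	int v = *count;

	/* held with the opposite sign: conflict */
	if (lock_val > 0 ? v < 0 : v > 0)
		return false;

	/* the "Overflow?" check: refuse rather than let the count
	 * wrap and flip sign, which would corrupt the lock state */
	if (v && SIGN(v + lock_val) != SIGN(v))
		return false;

	*count += lock_val;
	return true;
}

int main(void)
{
	int count = 0;

	printf("copy:   %d\n", toy_trylock(&count, false)); /* ok, shared */
	printf("copy:   %d\n", toy_trylock(&count, false)); /* ok, shared */
	printf("update: %d\n", toy_trylock(&count, true));  /* conflicts */
	return 0;
}
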
*/ - if (v && sign(v + lock_val) != sign(v)) - goto fail; - - atomic_add(lock_val, &l->l[i]); - spin_unlock(&l->lock); - return true; -} - -void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, - struct nocow_lock_bucket *l, - u64 dev_bucket, int flags) -{ - if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) { - struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); - u64 start_time = local_clock(); - - __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags)); - bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); - } -} - -void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t) - -{ - unsigned i, nr_zero = 0; - struct nocow_lock_bucket *l; - - for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) { - unsigned v = 0; - - for (i = 0; i < ARRAY_SIZE(l->l); i++) - v |= atomic_read(&l->l[i]); - - if (!v) { - nr_zero++; - continue; - } - - if (nr_zero) - prt_printf(out, "(%u empty entries)\n", nr_zero); - nr_zero = 0; - - for (i = 0; i < ARRAY_SIZE(l->l); i++) { - int v = atomic_read(&l->l[i]); - if (v) { - bch2_bpos_to_text(out, u64_to_bucket(l->b[i])); - prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v)); - } - } - prt_newline(out); - } - - if (nr_zero) - prt_printf(out, "(%u empty entries)\n", nr_zero); -} - -void bch2_fs_nocow_locking_exit(struct bch_fs *c) -{ - struct bucket_nocow_lock_table *t = &c->nocow_locks; - - for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) - for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++) - BUG_ON(atomic_read(&l->l[j])); -} - -void bch2_fs_nocow_locking_init_early(struct bch_fs *c) -{ - struct bucket_nocow_lock_table *t = &c->nocow_locks; - - for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) - spin_lock_init(&l->lock); -} diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h deleted file mode 100644 index 48b8a003c0d2..000000000000 --- a/fs/bcachefs/nocow_locking.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_NOCOW_LOCKING_H -#define _BCACHEFS_NOCOW_LOCKING_H - -#include "bcachefs.h" -#include "alloc_background.h" -#include "nocow_locking_types.h" - -#include <linux/hash.h> - -static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t, - u64 dev_bucket) -{ - unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS); - - return t->l + (h & (BUCKET_NOCOW_LOCKS - 1)); -} - -#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) - -bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos); -void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int); -bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int); -void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, - struct nocow_lock_bucket *, u64, int); - -static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, - struct bpos bucket, int flags) -{ - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - - __bch2_bucket_nocow_lock(t, l, dev_bucket, flags); -} - -static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, - struct bpos bucket, int flags) -{ - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - - return __bch2_bucket_nocow_trylock(l, dev_bucket, flags); -} - -void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); - -void 
bch2_fs_nocow_locking_exit(struct bch_fs *); -void bch2_fs_nocow_locking_init_early(struct bch_fs *); - -#endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h deleted file mode 100644 index bd12bf677924..000000000000 --- a/fs/bcachefs/nocow_locking_types.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H -#define _BCACHEFS_NOCOW_LOCKING_TYPES_H - -#define BUCKET_NOCOW_LOCKS_BITS 10 -#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS) - -struct nocow_lock_bucket { - struct closure_waitlist wait; - spinlock_t lock; - u64 b[4]; - atomic_t l[4]; -} __aligned(SMP_CACHE_BYTES); - -struct bucket_nocow_lock_table { - struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS]; -}; - -#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */ - diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c deleted file mode 100644 index b1cf88905b81..000000000000 --- a/fs/bcachefs/opts.c +++ /dev/null @@ -1,844 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/kernel.h> -#include <linux/fs_parser.h> - -#include "bcachefs.h" -#include "compress.h" -#include "disk_groups.h" -#include "error.h" -#include "movinggc.h" -#include "opts.h" -#include "rebalance.h" -#include "recovery_passes.h" -#include "super-io.h" -#include "util.h" - -#define x(t, n, ...) [n] = #t, - -const char * const bch2_error_actions[] = { - BCH_ERROR_ACTIONS() - NULL -}; - -const char * const bch2_degraded_actions[] = { - BCH_DEGRADED_ACTIONS() - NULL -}; - -const char * const bch2_fsck_fix_opts[] = { - BCH_FIX_ERRORS_OPTS() - NULL -}; - -const char * const bch2_version_upgrade_opts[] = { - BCH_VERSION_UPGRADE_OPTS() - NULL -}; - -const char * const bch2_sb_features[] = { - BCH_SB_FEATURES() - NULL -}; - -const char * const bch2_sb_compat[] = { - BCH_SB_COMPAT() - NULL -}; - -const char * const __bch2_btree_ids[] = { - BCH_BTREE_IDS() - NULL -}; - -const char * const __bch2_csum_types[] = { - BCH_CSUM_TYPES() - NULL -}; - -const char * const __bch2_csum_opts[] = { - BCH_CSUM_OPTS() - NULL -}; - -const char * const __bch2_compression_types[] = { - BCH_COMPRESSION_TYPES() - NULL -}; - -const char * const bch2_compression_opts[] = { - BCH_COMPRESSION_OPTS() - NULL -}; - -const char * const __bch2_str_hash_types[] = { - BCH_STR_HASH_TYPES() - NULL -}; - -const char * const bch2_str_hash_opts[] = { - BCH_STR_HASH_OPTS() - NULL -}; - -const char * const __bch2_data_types[] = { - BCH_DATA_TYPES() - NULL -}; - -const char * const bch2_member_states[] = { - BCH_MEMBER_STATES() - NULL -}; - -static const char * const __bch2_jset_entry_types[] = { - BCH_JSET_ENTRY_TYPES() - NULL -}; - -static const char * const __bch2_fs_usage_types[] = { - BCH_FS_USAGE_TYPES() - NULL -}; - -#undef x - -static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], - unsigned nr, const char *type, unsigned idx) -{ - if (idx < nr) - prt_str(out, opts[idx]); - else - prt_printf(out, "(unknown %s %u)", type, idx); -} - -#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \ -void bch2_prt_##name(struct printbuf *out, type t) \ -{ \ - prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\ -} - -PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); -PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); -PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); -PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); -PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum 
bch_csum_type); -PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); -PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); - -static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, - struct printbuf *err) -{ - if (!val) { - *res = FSCK_FIX_yes; - } else { - int ret = match_string(bch2_fsck_fix_opts, -1, val); - - if (ret < 0 && err) - prt_str(err, "fix_errors: invalid selection"); - if (ret < 0) - return ret; - *res = ret; - } - - return 0; -} - -static void bch2_opt_fix_errors_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_sb *sb, - u64 v) -{ - prt_str(out, bch2_fsck_fix_opts[v]); -} - -#define bch2_opt_fix_errors (struct bch_opt_fn) { \ - .parse = bch2_opt_fix_errors_parse, \ - .to_text = bch2_opt_fix_errors_to_text, \ -} - -const char * const bch2_d_types[BCH_DT_MAX] = { - [DT_UNKNOWN] = "unknown", - [DT_FIFO] = "fifo", - [DT_CHR] = "chr", - [DT_DIR] = "dir", - [DT_BLK] = "blk", - [DT_REG] = "reg", - [DT_LNK] = "lnk", - [DT_SOCK] = "sock", - [DT_WHT] = "whiteout", - [DT_SUBVOL] = "subvol", -}; - -void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) -{ -#define x(_name, ...) \ - if (opt_defined(src, _name)) \ - opt_set(*dst, _name, src._name); - - BCH_OPTS() -#undef x -} - -bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) -{ - switch (id) { -#define x(_name, ...) \ - case Opt_##_name: \ - return opt_defined(*opts, _name); - BCH_OPTS() -#undef x - default: - BUG(); - } -} - -u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) -{ - switch (id) { -#define x(_name, ...) \ - case Opt_##_name: \ - return opts->_name; - BCH_OPTS() -#undef x - default: - BUG(); - } -} - -void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) -{ - switch (id) { -#define x(_name, ...) \ - case Opt_##_name: \ - opt_set(*opts, _name, v); \ - break; - BCH_OPTS() -#undef x - default: - BUG(); - } -} - -/* dummy option, for options that aren't stored in the superblock */ -typedef u64 (*sb_opt_get_fn)(const struct bch_sb *); -typedef void (*sb_opt_set_fn)(struct bch_sb *, u64); -typedef u64 (*member_opt_get_fn)(const struct bch_member *); -typedef void (*member_opt_set_fn)(struct bch_member *, u64); - -__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL; -__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL; -__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL; -__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL; - -#define type_compatible_or_null(_p, _type) \ - __builtin_choose_expr( \ - __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL) - -const struct bch_option bch2_opt_table[] = { -#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 -#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ - .min = _min, .max = _max -#define OPT_STR(_choices) .type = BCH_OPT_STR, \ - .min = 0, .max = ARRAY_SIZE(_choices) - 1, \ - .choices = _choices -#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ - .min = 0, .max = U64_MAX, \ - .choices = _choices -#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \ - .choices = _choices -#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn - -#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ - [Opt_##_name] = { \ - .attr.name = #_name, \ - .attr.mode = (_flags) & OPT_RUNTIME ? 
0644 : 0444, \ - .flags = _flags, \ - .hint = _hint, \ - .help = _help, \ - .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \ - .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \ - .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \ - .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\ - _type \ - }, - - BCH_OPTS() -#undef x -}; - -int bch2_opt_lookup(const char *name) -{ - const struct bch_option *i; - - for (i = bch2_opt_table; - i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); - i++) - if (!strcmp(name, i->attr.name)) - return i - bch2_opt_table; - - return -1; -} - -struct opt_synonym { - const char *s1, *s2; -}; - -static const struct opt_synonym bch2_opt_synonyms[] = { - { "quota", "usrquota" }, -}; - -static int bch2_mount_opt_lookup(const char *name) -{ - const struct opt_synonym *i; - - for (i = bch2_opt_synonyms; - i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms); - i++) - if (!strcmp(name, i->s1)) - name = i->s2; - - return bch2_opt_lookup(name); -} - -struct opt_val_synonym { - const char *opt, *v1, *v2; -}; - -static const struct opt_val_synonym bch2_opt_val_synonyms[] = { - { "degraded", "true", "yes" }, - { "degraded", "false", "no" }, - { "degraded", "1", "yes" }, - { "degraded", "0", "no" }, -}; - -static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val) -{ - const struct opt_val_synonym *i; - - for (i = bch2_opt_val_synonyms; - i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms); - i++) - if (!strcmp(opt, i->opt) && !strcmp(val, i->v1)) - return i->v2; - - return val; -} - -int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) -{ - if (v < opt->min) { - if (err) - prt_printf(err, "%s: too small (min %llu)", - opt->attr.name, opt->min); - return -BCH_ERR_ERANGE_option_too_small; - } - - if (opt->max && v >= opt->max) { - if (err) - prt_printf(err, "%s: too big (max %llu)", - opt->attr.name, opt->max); - return -BCH_ERR_ERANGE_option_too_big; - } - - if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { - if (err) - prt_printf(err, "%s: not a multiple of 512", - opt->attr.name); - return -BCH_ERR_opt_parse_error; - } - - if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { - if (err) - prt_printf(err, "%s: must be a power of two", - opt->attr.name); - return -BCH_ERR_opt_parse_error; - } - - if (opt->fn.validate) - return opt->fn.validate(v, err); - - return 0; -} - -int bch2_opt_parse(struct bch_fs *c, - const struct bch_option *opt, - const char *val, u64 *res, - struct printbuf *err) -{ - ssize_t ret; - - if (err) - printbuf_indent_add_nextline(err, 2); - - switch (opt->type) { - case BCH_OPT_BOOL: - if (!val) - val = "1"; - - ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); - if (ret != -BCH_ERR_option_not_bool) { - *res = ret; - } else { - if (err) - prt_printf(err, "%s: must be bool", opt->attr.name); - return ret; - } - break; - case BCH_OPT_UINT: - if (!val) { - prt_printf(err, "%s: required value", - opt->attr.name); - return -EINVAL; - } - - if (*val != '-') { - ret = opt->flags & OPT_HUMAN_READABLE - ? 
bch2_strtou64_h(val, res) - : kstrtou64(val, 10, res); - } else { - prt_printf(err, "%s: must be a non-negative number", opt->attr.name); - return -BCH_ERR_option_negative; - } - - if (ret < 0) { - if (err) - prt_printf(err, "%s: must be a number", - opt->attr.name); - return ret; - } - break; - case BCH_OPT_STR: - if (!val) { - prt_printf(err, "%s: required value", - opt->attr.name); - return -EINVAL; - } - - ret = match_string(opt->choices, -1, val); - if (ret < 0) { - if (err) - prt_printf(err, "%s: invalid selection", - opt->attr.name); - return ret; - } - - *res = ret; - break; - case BCH_OPT_BITFIELD: { - s64 v = bch2_read_flag_list(val, opt->choices); - if (v < 0) - return v; - *res = v; - break; - } - case BCH_OPT_FN: - ret = opt->fn.parse(c, val, res, err); - - if (ret == -BCH_ERR_option_needs_open_fs) - return ret; - - if (ret < 0) { - if (err) - prt_printf(err, "%s: parse error", - opt->attr.name); - return ret; - } - } - - return bch2_opt_validate(opt, *res, err); -} - -void bch2_opt_to_text(struct printbuf *out, - struct bch_fs *c, struct bch_sb *sb, - const struct bch_option *opt, u64 v, - unsigned flags) -{ - if (flags & OPT_SHOW_MOUNT_STYLE) { - if (opt->type == BCH_OPT_BOOL) { - prt_printf(out, "%s%s", - v ? "" : "no", - opt->attr.name); - return; - } - - prt_printf(out, "%s=", opt->attr.name); - } - - switch (opt->type) { - case BCH_OPT_BOOL: - case BCH_OPT_UINT: - if (opt->flags & OPT_HUMAN_READABLE) - prt_human_readable_u64(out, v); - else - prt_printf(out, "%lli", v); - break; - case BCH_OPT_STR: - if (v < opt->min || v >= opt->max) - prt_printf(out, "(invalid option %lli)", v); - else if (flags & OPT_SHOW_FULL_LIST) - prt_string_option(out, opt->choices, v); - else - prt_str(out, opt->choices[v]); - break; - case BCH_OPT_BITFIELD: - prt_bitflags(out, opt->choices, v); - break; - case BCH_OPT_FN: - opt->fn.to_text(out, c, sb, v); - break; - default: - BUG(); - } -} - -void bch2_opts_to_text(struct printbuf *out, - struct bch_opts opts, - struct bch_fs *c, struct bch_sb *sb, - unsigned show_mask, unsigned hide_mask, - unsigned flags) -{ - bool first = true; - - for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) { - const struct bch_option *opt = &bch2_opt_table[i]; - - if ((opt->flags & hide_mask) || !(opt->flags & show_mask)) - continue; - - u64 v = bch2_opt_get_by_id(&opts, i); - if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) - continue; - - if (!first) - prt_char(out, ','); - first = false; - - bch2_opt_to_text(out, c, sb, opt, v, flags); - } -} - -int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v) -{ - int ret = 0; - - switch (id) { - case Opt_state: - if (ca) - return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED); - break; - - case Opt_compression: - case Opt_background_compression: - ret = bch2_check_set_has_compressed_data(c, v); - break; - case Opt_erasure_code: - if (v) - bch2_check_set_feature(c, BCH_FEATURE_ec); - break; - default: - break; - } - - return ret; -} - -int bch2_opts_hooks_pre_set(struct bch_fs *c) -{ - for (unsigned i = 0; i < bch2_opts_nr; i++) { - int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); - if (ret) - return ret; - } - - return 0; -} - -void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, - struct bch_opts *new_opts, enum bch_opt_id id) -{ - switch (id) { - case Opt_foreground_target: - if (new_opts->foreground_target && - !new_opts->background_target) - bch2_set_rebalance_needs_scan(c, inum); - break; - case Opt_compression: - if 
(new_opts->compression && - !new_opts->background_compression) - bch2_set_rebalance_needs_scan(c, inum); - break; - case Opt_background_target: - if (new_opts->background_target) - bch2_set_rebalance_needs_scan(c, inum); - break; - case Opt_background_compression: - if (new_opts->background_compression) - bch2_set_rebalance_needs_scan(c, inum); - break; - case Opt_rebalance_enabled: - bch2_rebalance_wakeup(c); - break; - case Opt_copygc_enabled: - bch2_copygc_wakeup(c); - break; - case Opt_discard: - if (!ca) { - mutex_lock(&c->sb_lock); - for_each_member_device(c, ca) { - struct bch_member *m = - bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_DISCARD(m, c->opts.discard); - } - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - break; - case Opt_version_upgrade: - /* - * XXX: in the future we'll likely want to do compatible - * upgrades at runtime as well, but right now there's nothing - * that does that: - */ - if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible) - bch2_sb_upgrade_incompat(c); - break; - default: - break; - } -} - -int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, - struct printbuf *parse_later, - const char *name, const char *val) -{ - struct printbuf err = PRINTBUF; - u64 v; - int ret, id; - - id = bch2_mount_opt_lookup(name); - - /* Check for the form "noopt", negation of a boolean opt: */ - if (id < 0 && - !val && - !strncmp("no", name, 2)) { - id = bch2_mount_opt_lookup(name + 2); - val = "0"; - } - - /* Unknown options are ignored: */ - if (id < 0) - return 0; - - /* must have a value for synonym lookup - but OPT_FN is weird */ - if (!val && bch2_opt_table[id].type != BCH_OPT_FN) - val = "1"; - - val = bch2_opt_val_synonym_lookup(name, val); - - if (!(bch2_opt_table[id].flags & OPT_MOUNT)) - goto bad_opt; - - if (id == Opt_acl && - !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) - goto bad_opt; - - if ((id == Opt_usrquota || - id == Opt_grpquota) && - !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) - goto bad_opt; - - ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); - if (ret == -BCH_ERR_option_needs_open_fs) { - ret = 0; - - if (parse_later) { - prt_printf(parse_later, "%s=%s,", name, val); - if (parse_later->allocation_failure) - ret = -ENOMEM; - } - - goto out; - } - - if (ret < 0) - goto bad_val; - - if (opts) - bch2_opt_set_by_id(opts, id, v); - - ret = 0; -out: - printbuf_exit(&err); - return ret; -bad_opt: - ret = -BCH_ERR_option_name; - goto out; -bad_val: - ret = -BCH_ERR_option_value; - goto out; -} - -int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, - struct printbuf *parse_later, char *options, - bool ignore_unknown) -{ - char *copied_opts, *copied_opts_start; - char *opt, *name, *val; - int ret = 0; - - if (!options) - return 0; - - /* - * sys_fsconfig() is now occasionally providing us with option lists - * starting with a comma - weird. 
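
The mount-option parsing removed here follows the usual kernel convention that a bare "noX" negates a boolean option X, while unknown names are silently ignored because other layers may own them. A self-contained sketch of that lookup logic, with a toy option table standing in for bch2_opt_table and lookup() standing in for bch2_mount_opt_lookup():

#include <stdio.h>
#include <string.h>

static const char *opts[] = { "acl", "usrquota", "verbose" };

static int lookup(const char *name)
{
	for (unsigned i = 0; i < sizeof(opts) / sizeof(opts[0]); i++)
		if (!strcmp(name, opts[i]))
			return (int)i;
	return -1;
}

static void parse_one(const char *name, const char *val)
{
	int id = lookup(name);

	/* "noacl" with no value is the negation of "acl" */
	if (id < 0 && !val && !strncmp(name, "no", 2)) {
		id = lookup(name + 2);
		val = "0";
	}

	if (id < 0) {
		printf("%-10s -> ignored (unknown)\n", name);
		return;
	}

	/* a bare boolean option defaults to "1" */
	if (!val)
		val = "1";

	printf("%-10s -> %s=%s\n", name, opts[id], val);
}

int main(void)
{
	parse_one("acl", NULL);
	parse_one("noacl", NULL);
	parse_one("bogus", NULL);
	return 0;
}
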
- */ - if (*options == ',') - options++; - - copied_opts = kstrdup(options, GFP_KERNEL); - if (!copied_opts) - return -ENOMEM; - copied_opts_start = copied_opts; - - while ((opt = strsep(&copied_opts, ",")) != NULL) { - if (!*opt) - continue; - - name = strsep(&opt, "="); - val = opt; - - ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val); - if (ret == -BCH_ERR_option_name && ignore_unknown) - ret = 0; - if (ret) { - pr_err("Error parsing option %s: %s", name, bch2_err_str(ret)); - break; - } - } - - kfree(copied_opts_start); - return ret; -} - -u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx) -{ - const struct bch_option *opt = bch2_opt_table + id; - u64 v; - - if (dev_idx < 0) { - v = opt->get_sb(sb); - } else { - if (WARN(!bch2_member_exists(sb, dev_idx), - "tried to set device option %s on nonexistent device %i", - opt->attr.name, dev_idx)) - return 0; - - struct bch_member m = bch2_sb_member_get(sb, dev_idx); - v = opt->get_member(&m); - } - - if (opt->flags & OPT_SB_FIELD_ONE_BIAS) - --v; - - if (opt->flags & OPT_SB_FIELD_ILOG2) - v = 1ULL << v; - - if (opt->flags & OPT_SB_FIELD_SECTORS) - v <<= 9; - - return v; -} - -/* - * Initial options from superblock - here we don't want any options undefined, - * any options the superblock doesn't specify are set to 0: - */ -int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) -{ - for (unsigned id = 0; id < bch2_opts_nr; id++) { - const struct bch_option *opt = bch2_opt_table + id; - - if (opt->get_sb) - bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1)); - } - - return 0; -} - -bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, - const struct bch_option *opt, u64 v) -{ - bool changed = false; - - if (opt->flags & OPT_SB_FIELD_SECTORS) - v >>= 9; - - if (opt->flags & OPT_SB_FIELD_ILOG2) - v = ilog2(v); - - if (opt->flags & OPT_SB_FIELD_ONE_BIAS) - v++; - - if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) { - changed = v != opt->get_sb(sb); - - opt->set_sb(sb, v); - } - - if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) { - if (WARN(!bch2_member_exists(sb, dev_idx), - "tried to set device option %s on nonexistent device %i", - opt->attr.name, dev_idx)) - return false; - - struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); - changed = v != opt->get_member(m); - opt->set_member(m, v); - } - - return changed; -} - -bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, - const struct bch_option *opt, u64 v) -{ - mutex_lock(&c->sb_lock); - bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? 
ca->dev_idx : -1, opt, v); - if (changed) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - return changed; -} - -/* io opts: */ - -struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) -{ - struct bch_io_opts opts = { -#define x(_name, _bits) ._name = src._name, - BCH_INODE_OPTS() -#undef x - }; - - bch2_io_opts_fixups(&opts); - return opts; -} - -bool bch2_opt_is_inode_opt(enum bch_opt_id id) -{ - static const enum bch_opt_id inode_opt_list[] = { -#define x(_name, _bits) Opt_##_name, - BCH_INODE_OPTS() -#undef x - }; - unsigned i; - - for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) - if (inode_opt_list[i] == id) - return true; - - return false; -} diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h deleted file mode 100644 index 63f8e254495c..000000000000 --- a/fs/bcachefs/opts.h +++ /dev/null @@ -1,693 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_OPTS_H -#define _BCACHEFS_OPTS_H - -#include <linux/bug.h> -#include <linux/log2.h> -#include <linux/string.h> -#include <linux/sysfs.h> -#include "bcachefs_format.h" - -struct bch_fs; - -extern const char * const bch2_error_actions[]; -extern const char * const bch2_degraded_actions[]; -extern const char * const bch2_fsck_fix_opts[]; -extern const char * const bch2_version_upgrade_opts[]; -extern const char * const bch2_sb_features[]; -extern const char * const bch2_sb_compat[]; -extern const char * const __bch2_btree_ids[]; -extern const char * const __bch2_csum_types[]; -extern const char * const __bch2_csum_opts[]; -extern const char * const __bch2_compression_types[]; -extern const char * const bch2_compression_opts[]; -extern const char * const __bch2_str_hash_types[]; -extern const char * const bch2_str_hash_opts[]; -extern const char * const __bch2_data_types[]; -extern const char * const bch2_member_states[]; -extern const char * const bch2_d_types[]; - -void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); -void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); -void bch2_prt_data_type(struct printbuf *, enum bch_data_type); -void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); -void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); -void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); -void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); - -static inline const char *bch2_d_type_str(unsigned d_type) -{ - return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; -} - -/* - * Mount options; we also store defaults in the superblock. - * - * Also exposed via sysfs: if an option is writeable, and it's also stored in - * the superblock, changing it via sysfs (currently? might change this) also - * updates the superblock. - * - * We store options as signed integers, where -1 means undefined. This means we - * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only - * apply the options from that struct that are defined. 
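
A point worth preserving from the superblock accessors above: on-disk option fields may be stored shifted down to sectors (OPT_SB_FIELD_SECTORS), as a log2 (OPT_SB_FIELD_ILOG2), or biased by one so that zero means "unset" (OPT_SB_FIELD_ONE_BIAS) - and the decode transforms must run in the reverse order of the encode, as bch2_opt_from_sb() and __bch2_opt_set_sb() do. A standalone round-trip sketch; the flag values here are invented:

#include <stdint.h>
#include <stdio.h>

#define SB_FIELD_SECTORS  (1u << 0)	/* field stores value >> 9 */
#define SB_FIELD_ILOG2    (1u << 1)	/* field stores log2 of value */
#define SB_FIELD_ONE_BIAS (1u << 2)	/* field 0 means "default" */

static uint64_t ilog2_u64(uint64_t v)
{
	uint64_t l = 0;

	while (v >>= 1)
		l++;
	return l;
}

static uint64_t encode(uint64_t v, unsigned flags)
{
	if (flags & SB_FIELD_SECTORS)
		v >>= 9;
	if (flags & SB_FIELD_ILOG2)
		v = ilog2_u64(v);
	if (flags & SB_FIELD_ONE_BIAS)
		v++;
	return v;
}

static uint64_t decode(uint64_t v, unsigned flags)
{
	/* inverse transforms, applied in the opposite order */
	if (flags & SB_FIELD_ONE_BIAS)
		--v;
	if (flags & SB_FIELD_ILOG2)
		v = 1ULL << v;
	if (flags & SB_FIELD_SECTORS)
		v <<= 9;
	return v;
}

int main(void)
{
	/* e.g. encoded_extent_max uses SECTORS|ILOG2 in the table below */
	unsigned flags = SB_FIELD_SECTORS | SB_FIELD_ILOG2;
	uint64_t v = 64 << 10;
	uint64_t sb = encode(v, flags);

	printf("in-memory %llu -> sb field %llu -> back to %llu\n",
	       (unsigned long long)v, (unsigned long long)sb,
	       (unsigned long long)decode(sb, flags));
	return 0;
}
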
- */ - -/* When can be set: */ -enum opt_flags { - OPT_FS = BIT(0), /* Filesystem option */ - OPT_DEVICE = BIT(1), /* Device option */ - OPT_INODE = BIT(2), /* Inode option */ - OPT_FORMAT = BIT(3), /* May be specified at format time */ - OPT_MOUNT = BIT(4), /* May be specified at mount time */ - OPT_RUNTIME = BIT(5), /* May be specified at runtime */ - OPT_HUMAN_READABLE = BIT(6), - OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */ - OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */ - OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */ - OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */ - OPT_HIDDEN = BIT(11), -}; - -enum opt_type { - BCH_OPT_BOOL, - BCH_OPT_UINT, - BCH_OPT_STR, - BCH_OPT_BITFIELD, - BCH_OPT_FN, -}; - -struct bch_opt_fn { - int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *); - void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); - int (*validate)(u64, struct printbuf *); -}; - -/** - * x(name, shortopt, type, in mem type, mode, sb_opt) - * - * @name - name of mount option, sysfs attribute, and struct bch_opts - * member - * - * @mode - when opt may be set - * - * @sb_option - name of corresponding superblock option - * - * @type - one of OPT_BOOL, OPT_UINT, OPT_STR - */ - -/* - * XXX: add fields for - * - default value - * - helptext - */ - -#ifdef __KERNEL__ -#define RATELIMIT_ERRORS_DEFAULT true -#else -#define RATELIMIT_ERRORS_DEFAULT false -#endif - -#ifdef CONFIG_BCACHEFS_DEBUG -#define BCACHEFS_VERBOSE_DEFAULT true -#else -#define BCACHEFS_VERBOSE_DEFAULT false -#endif - -#define BCH_FIX_ERRORS_OPTS() \ - x(exit, 0) \ - x(yes, 1) \ - x(no, 2) \ - x(ask, 3) - -enum fsck_err_opts { -#define x(t, n) FSCK_FIX_##t, - BCH_FIX_ERRORS_OPTS() -#undef x -}; - -#define BCH_OPTS() \ - x(block_size, u16, \ - OPT_FS|OPT_FORMAT| \ - OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(512, 1U << 16), \ - BCH_SB_BLOCK_SIZE, 4 << 10, \ - "size", NULL) \ - x(btree_node_size, u32, \ - OPT_FS|OPT_FORMAT| \ - OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(512, 1U << 20), \ - BCH_SB_BTREE_NODE_SIZE, 256 << 10, \ - "size", "Btree node size, default 256k") \ - x(errors, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ - NULL, "Action to take on filesystem error") \ - x(write_error_timeout, u16, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, 300), \ - BCH_SB_WRITE_ERROR_TIMEOUT, 30, \ - NULL, "Number of consecutive write errors allowed before kicking out a device")\ - x(metadata_replicas, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_META_REPLICAS_WANT, 1, \ - "#", "Number of metadata replicas") \ - x(data_replicas, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_DATA_REPLICAS_WANT, 1, \ - "#", "Number of data replicas") \ - x(metadata_replicas_required, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_META_REPLICAS_REQ, 1, \ - "#", NULL) \ - x(data_replicas_required, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_DATA_REPLICAS_REQ, 1, \ - "#", NULL) \ - x(encoded_extent_max, u32, \ - OPT_FS|OPT_FORMAT| \ - OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ - OPT_UINT(4096, 2U << 20), \ - BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ - "size", 
"Maximum size of checksummed/compressed extents")\ - x(metadata_checksum, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(__bch2_csum_opts), \ - BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ - NULL, NULL) \ - x(data_checksum, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(__bch2_csum_opts), \ - BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ - NULL, NULL) \ - x(checksum_err_retry_nr, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(0, 32), \ - BCH_SB_CSUM_ERR_RETRY_NR, 3, \ - NULL, NULL) \ - x(compression, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_compression), \ - BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ - NULL, NULL) \ - x(background_compression, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_compression), \ - BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ - NULL, NULL) \ - x(str_hash, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_str_hash_opts), \ - BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ - NULL, "Hash function for directory entries and xattrs")\ - x(metadata_target, u16, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_METADATA_TARGET, 0, \ - "(target)", "Device or label for metadata writes") \ - x(foreground_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_FOREGROUND_TARGET, 0, \ - "(target)", "Device or label for foreground writes") \ - x(background_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_BACKGROUND_TARGET, 0, \ - "(target)", "Device or label to move data to in the background")\ - x(promote_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_PROMOTE_TARGET, 0, \ - "(target)", "Device or label to promote data to on read") \ - x(erasure_code, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_ERASURE_CODE, false, \ - NULL, "Enable erasure coding (DO NOT USE YET)") \ - x(casefold, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT, \ - OPT_BOOL(), \ - BCH_SB_CASEFOLD, false, \ - NULL, "Dirent lookups are casefolded") \ - x(casefold_disabled, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Disable casefolding filesystem wide") \ - x(inodes_32bit, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_INODE_32BIT, true, \ - NULL, "Constrain inode numbers to 32 bits") \ - x(shard_inode_numbers_bits, u8, \ - OPT_FS|OPT_FORMAT, \ - OPT_UINT(0, 8), \ - BCH_SB_SHARD_INUMS_NBITS, 0, \ - NULL, "Shard new inode numbers by CPU id") \ - x(inodes_use_key_cache, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_INODES_USE_KEY_CACHE, true, \ - NULL, "Use the btree key cache for the inodes btree") \ - x(btree_node_mem_ptr_optimization, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Stash pointer to in memory btree node in btree ptr")\ - x(gc_reserve_percent, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(5, 21), \ - BCH_SB_GC_RESERVE, 8, \ - "%", "Percentage of disk space to reserve for copygc")\ - x(gc_reserve_bytes, u64, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ - OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(0, U64_MAX), \ - BCH_SB_GC_RESERVE_BYTES, 0, \ - "%", "Amount of disk space to reserve for copygc\n" \ - "Takes precedence over 
gc_reserve_percent if set")\ - x(root_reserve_percent, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_UINT(0, 100), \ - BCH_SB_ROOT_RESERVE, 0, \ - "%", "Percentage of disk space to reserve for superuser")\ - x(wide_macs, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_128_BIT_MACS, false, \ - NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ - x(inline_data, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable inline data extents") \ - x(promote_whole_extents, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \ - NULL, "Promote whole extents, instead of just part being read")\ - x(acl, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_POSIX_ACL, true, \ - NULL, "Enable POSIX acls") \ - x(usrquota, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_USRQUOTA, false, \ - NULL, "Enable user quotas") \ - x(grpquota, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_GRPQUOTA, false, \ - NULL, "Enable group quotas") \ - x(prjquota, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_PRJQUOTA, false, \ - NULL, "Enable project quotas") \ - x(degraded, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_STR(bch2_degraded_actions), \ - BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \ - NULL, "Allow mounting in degraded mode") \ - x(no_splitbrain_check, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Don't kick drives out when splitbrain detected")\ - x(verbose, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \ - NULL, "Extra debugging information during mount/recovery")\ - x(journal_flush_delay, u32, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, U32_MAX), \ - BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ - NULL, "Delay in milliseconds before automatic journal commits")\ - x(journal_flush_disabled, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ - NULL, "Disable journal flush on sync/fsync\n" \ - "If enabled, writes can be lost, but only since the\n"\ - "last journal write (default 1 second)") \ - x(journal_reclaim_delay, u32, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(0, U32_MAX), \ - BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ - NULL, "Delay in milliseconds before automatic journal reclaim")\ - x(move_bytes_in_flight, u32, \ - OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1024, U32_MAX), \ - BCH2_NO_SB_OPT, 1U << 20, \ - NULL, "Maximum Amount of IO to keep in flight by the move path")\ - x(move_ios_in_flight, u32, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, 1024), \ - BCH2_NO_SB_OPT, 32, \ - NULL, "Maximum number of IOs to keep in flight by the move path")\ - x(fsck, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Run fsck on mount") \ - x(fsck_memory_usage_percent, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_UINT(20, 70), \ - BCH2_NO_SB_OPT, 50, \ - NULL, "Maximum percentage of system ram fsck is allowed to pin")\ - x(fix_errors, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_FN(bch2_opt_fix_errors), \ - BCH2_NO_SB_OPT, FSCK_FIX_exit, \ - NULL, "Fix errors during fsck without asking") \ - x(ratelimit_errors, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ - NULL, "Ratelimit error messages during fsck") \ - x(nochanges, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Super read only mode - no writes at all will be 
issued,\n"\ - "even if we have to replay the journal") \ - x(norecovery, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Exit recovery immediately prior to journal replay")\ - x(journal_rewind, u64, \ - OPT_FS|OPT_MOUNT, \ - OPT_UINT(0, U64_MAX), \ - BCH2_NO_SB_OPT, 0, \ - NULL, "Rewind journal") \ - x(recovery_passes, u64, \ - OPT_FS|OPT_MOUNT, \ - OPT_BITFIELD(bch2_recovery_passes), \ - BCH2_NO_SB_OPT, 0, \ - NULL, "Recovery passes to run explicitly") \ - x(recovery_passes_exclude, u64, \ - OPT_FS|OPT_MOUNT, \ - OPT_BITFIELD(bch2_recovery_passes), \ - BCH2_NO_SB_OPT, 0, \ - NULL, "Recovery passes to exclude") \ - x(recovery_pass_last, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_STR_NOLIMIT(bch2_recovery_passes), \ - BCH2_NO_SB_OPT, 0, \ - NULL, "Exit recovery after specified pass") \ - x(retain_recovery_info, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\ - x(read_entire_journal, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Read all journal entries, not just dirty ones")\ - x(read_journal_only, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Only read the journal, skip the rest of recovery")\ - x(journal_transaction_names, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ - NULL, "Log transaction function names in journal") \ - x(allocator_stuck_timeout, u16, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(0, U16_MAX), \ - BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \ - NULL, "Default timeout in seconds for stuck allocator messages")\ - x(noexcl, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Don't open device in exclusive mode") \ - x(direct_io, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Use O_DIRECT (userspace only)") \ - x(sb, u64, \ - OPT_MOUNT, \ - OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ - "offset", "Sector offset of superblock") \ - x(read_only, u8, \ - OPT_FS|OPT_MOUNT|OPT_HIDDEN, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, NULL) \ - x(nostart, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Don\'t start filesystem, only open devices") \ - x(reconstruct_alloc, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Reconstruct alloc btree") \ - x(version_upgrade, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_version_upgrade_opts), \ - BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ - NULL, "Set superblock to latest version,\n" \ - "allowing any new features to be used") \ - x(stdio, u64, \ - 0, \ - OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Pointer to a struct stdio_redirect") \ - x(project, u8, \ - OPT_INODE, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, NULL) \ - x(nocow, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ - OPT_BOOL(), \ - BCH_SB_NOCOW, false, \ - NULL, "Nocow mode: Writes will be done in place when possible.\n"\ - "Snapshots and reflink will still caused writes to be COW\n"\ - "Implicitly disables data checksumming, compression and encryption")\ - x(nocow_enabled, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable nocow mode: enables runtime locking in\n"\ - "data move path needed if nocow will ever be in use\n")\ - x(copygc_enabled, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable 
copygc: disable for debugging, or to\n"\ - "quiet the system when doing performance testing\n")\ - x(rebalance_enabled, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable rebalance: disable for debugging, or to\n"\ - "quiet the system when doing performance testing\n")\ - x(rebalance_on_ac_only, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_REBALANCE_AC_ONLY, false, \ - NULL, "Enable rebalance while on mains power only\n") \ - x(auto_snapshot_deletion, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\ - "quiet the system when doing performance testing\n")\ - x(no_data_io, u8, \ - OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Skip submit_bio() for data reads and writes, " \ - "for performance testing purposes") \ - x(state, u64, \ - OPT_DEVICE|OPT_RUNTIME, \ - OPT_STR(bch2_member_states), \ - BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \ - "state", "rw,ro,failed,spare") \ - x(bucket_size, u32, \ - OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(0, S64_MAX), \ - BCH_MEMBER_BUCKET_SIZE, 0, \ - "size", "Specifies the bucket size; must be greater than the btree node size")\ - x(durability, u8, \ - OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \ - OPT_UINT(0, BCH_REPLICAS_MAX), \ - BCH_MEMBER_DURABILITY, 1, \ - "n", "Data written to this device will be considered\n"\ - "to have already been replicated n times") \ - x(data_allowed, u8, \ - OPT_DEVICE, \ - OPT_BITFIELD(__bch2_data_types), \ - BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ - "types", "Allowed data types for this device: journal, btree, and/or user")\ - x(discard, u8, \ - OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_MEMBER_DISCARD, true, \ - NULL, "Enable discard/TRIM support") \ - x(btree_node_prefetch, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\ - " prefetched sequentially") - -struct bch_opts { -#define x(_name, _bits, ...) unsigned _name##_defined:1; - BCH_OPTS() -#undef x - -#define x(_name, _bits, ...) _bits _name; - BCH_OPTS() -#undef x -}; - -struct bch2_opts_parse { - struct bch_opts opts; - - /* to save opts that can't be parsed before the FS is opened: */ - struct printbuf parse_later; -}; - -static const __maybe_unused struct bch_opts bch2_opts_default = { -#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ - ._name##_defined = true, \ - ._name = _default, \ - - BCH_OPTS() -#undef x -}; - -#define opt_defined(_opts, _name) ((_opts)._name##_defined) - -#define opt_get(_opts, _name) \ - (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) - -#define opt_set(_opts, _name, _v) \ -do { \ - (_opts)._name##_defined = true; \ - (_opts)._name = _v; \ -} while (0) - -static inline struct bch_opts bch2_opts_empty(void) -{ - return (struct bch_opts) { 0 }; -} - -void bch2_opts_apply(struct bch_opts *, struct bch_opts); - -enum bch_opt_id { -#define x(_name, ...) 
Opt_##_name, - BCH_OPTS() -#undef x - bch2_opts_nr -}; - -struct bch_fs; -struct printbuf; - -struct bch_option { - struct attribute attr; - enum opt_type type; - enum opt_flags flags; - u64 min, max; - - const char * const *choices; - - struct bch_opt_fn fn; - - const char *hint; - const char *help; - - u64 (*get_sb)(const struct bch_sb *); - void (*set_sb)(struct bch_sb *, u64); - - u64 (*get_member)(const struct bch_member *); - void (*set_member)(struct bch_member *, u64); - -}; - -extern const struct bch_option bch2_opt_table[]; - -bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); -u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); -void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); - -u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int); -int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); -bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); - -struct bch_dev; -bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); - -int bch2_opt_lookup(const char *); -int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); -int bch2_opt_parse(struct bch_fs *, const struct bch_option *, - const char *, u64 *, struct printbuf *); - -#define OPT_SHOW_FULL_LIST (1 << 0) -#define OPT_SHOW_MOUNT_STYLE (1 << 1) - -void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, - const struct bch_option *, u64, unsigned); -void bch2_opts_to_text(struct printbuf *, - struct bch_opts, - struct bch_fs *, struct bch_sb *, - unsigned, unsigned, unsigned); - -int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64); -int bch2_opts_hooks_pre_set(struct bch_fs *); -void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, - struct bch_opts *, enum bch_opt_id); - -int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, - struct printbuf *, const char *, const char *); -int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, - char *, bool); - -/* inode opts: */ - -struct bch_io_opts { -#define x(_name, _bits) u##_bits _name; - BCH_INODE_OPTS() -#undef x -#define x(_name, _bits) u64 _name##_from_inode:1; - BCH_INODE_OPTS() -#undef x -}; - -static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) -{ - if (!opts->background_target) - opts->background_target = opts->foreground_target; - if (!opts->background_compression) - opts->background_compression = opts->compression; - if (opts->nocow) { - opts->compression = opts->background_compression = 0; - opts->data_checksum = 0; - opts->erasure_code = 0; - } -} - -struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); -bool bch2_opt_is_inode_opt(enum bch_opt_id); - -#endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c deleted file mode 100644 index 3302bbc78a09..000000000000 --- a/fs/bcachefs/printbuf.c +++ /dev/null @@ -1,528 +0,0 @@ -// SPDX-License-Identifier: LGPL-2.1+ -/* Copyright (C) 2022 Kent Overstreet */ - -#include <linux/bitmap.h> -#include <linux/err.h> -#include <linux/export.h> -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/string_helpers.h> - -#include "printbuf.h" - -static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos) -{ - return pos - buf->last_newline; -} - -static inline unsigned printbuf_linelen(struct printbuf *buf) -{ - return __printbuf_linelen(buf, buf->pos); -} - -/* - * Returns spaces from start of line, if set, or 0 if unset: - 
*/ -static inline unsigned cur_tabstop(struct printbuf *buf) -{ - return buf->cur_tabstop < buf->nr_tabstops - ? buf->_tabstops[buf->cur_tabstop] - : 0; -} - -int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) -{ - /* Reserved space for terminating nul: */ - extra += 1; - - if (out->pos + extra <= out->size) - return 0; - - if (!out->heap_allocated) { - out->overflow = true; - return 0; - } - - unsigned new_size = roundup_pow_of_two(out->size + extra); - - /* Sanity check... */ - if (new_size > PAGE_SIZE << MAX_PAGE_ORDER) { - out->allocation_failure = true; - out->overflow = true; - return -ENOMEM; - } - - /* - * Note: output buffer must be freeable with kfree(), it's not required - * that the user use printbuf_exit(). - */ - char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); - - if (!buf) { - out->allocation_failure = true; - out->overflow = true; - return -ENOMEM; - } - - out->buf = buf; - out->size = new_size; - return 0; -} - -static void printbuf_advance_pos(struct printbuf *out, unsigned len) -{ - out->pos += min(len, printbuf_remaining(out)); -} - -static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr) -{ - unsigned move = out->pos - pos; - - bch2_printbuf_make_room(out, nr); - - if (pos + nr < out->size) - memmove(out->buf + pos + nr, - out->buf + pos, - min(move, out->size - 1 - pos - nr)); - - if (pos < out->size) - memset(out->buf + pos, ' ', min(nr, out->size - pos)); - - printbuf_advance_pos(out, nr); - printbuf_nul_terminate_reserved(out); -} - -static void __printbuf_do_indent(struct printbuf *out, unsigned pos) -{ - while (true) { - int pad; - unsigned len = out->pos - pos; - char *p = out->buf + pos; - char *n = memscan(p, '\n', len); - if (cur_tabstop(out)) { - n = min(n, (char *) memscan(p, '\r', len)); - n = min(n, (char *) memscan(p, '\t', len)); - } - - pos = n - out->buf; - if (pos == out->pos) - break; - - switch (*n) { - case '\n': - pos++; - out->last_newline = pos; - - printbuf_insert_spaces(out, pos, out->indent); - - pos = min(pos + out->indent, out->pos); - out->last_field = pos; - out->cur_tabstop = 0; - break; - case '\r': - memmove(n, n + 1, out->pos - pos); - --out->pos; - pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos); - if (pad > 0) { - printbuf_insert_spaces(out, out->last_field, pad); - pos += pad; - } - - out->last_field = pos; - out->cur_tabstop++; - break; - case '\t': - pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1; - if (pad > 0) { - *n = ' '; - printbuf_insert_spaces(out, pos, pad - 1); - pos += pad; - } else { - memmove(n, n + 1, out->pos - pos); - --out->pos; - } - - out->last_field = pos; - out->cur_tabstop++; - break; - } - } -} - -static inline void printbuf_do_indent(struct printbuf *out, unsigned pos) -{ - if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling) - __printbuf_do_indent(out, pos); -} - -void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) -{ - int len; - - do { - va_list args2; - - va_copy(args2, args); - len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2); - va_end(args2); - } while (len > printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len)); - - unsigned indent_pos = out->pos; - printbuf_advance_pos(out, len); - printbuf_do_indent(out, indent_pos); -} - -void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) 
-{ - va_list args; - int len; - - do { - va_start(args, fmt); - len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args); - va_end(args); - } while (len > printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len)); - - unsigned indent_pos = out->pos; - printbuf_advance_pos(out, len); - printbuf_do_indent(out, indent_pos); -} - -/** - * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be - * null terminated - * @buf: printbuf to terminate - * Returns: Printbuf contents, as a nul terminated C string - */ -const char *bch2_printbuf_str(const struct printbuf *buf) -{ - /* - * If we've written to a printbuf then it's guaranteed to be a null - * terminated string - but if we haven't, then we might not have - * allocated a buffer at all: - */ - return buf->pos - ? buf->buf - : ""; -} - -/** - * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it - * against accidental use. - * @buf: printbuf to exit - */ -void bch2_printbuf_exit(struct printbuf *buf) -{ - if (buf->heap_allocated) { - kfree(buf->buf); - buf->buf = ERR_PTR(-EINTR); /* poison value */ - } -} - -void bch2_printbuf_tabstops_reset(struct printbuf *buf) -{ - buf->nr_tabstops = 0; -} - -void bch2_printbuf_tabstop_pop(struct printbuf *buf) -{ - if (buf->nr_tabstops) - --buf->nr_tabstops; -} - -/* - * bch2_printbuf_tabstop_set() - add a tabstop, n spaces from the previous tabstop - * - * @buf: printbuf to control - * @spaces: number of spaces from previous tabpstop - * - * In the future this function may allocate memory if setting more than - * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start - * of line. - */ -int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) -{ - unsigned prev_tabstop = buf->nr_tabstops - ? buf->_tabstops[buf->nr_tabstops - 1] - : 0; - - if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops))) - return -EINVAL; - - buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces; - buf->has_indent_or_tabstops = true; - return 0; -} - -/** - * bch2_printbuf_indent_add() - add to the current indent level - * - * @buf: printbuf to control - * @spaces: number of spaces to add to the current indent level - * - * Subsequent lines, and the current line if the output position is at the start - * of the current line, will be indented by @spaces more spaces. - */ -void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) -{ - if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) - spaces = 0; - - buf->indent += spaces; - prt_chars(buf, ' ', spaces); - - buf->has_indent_or_tabstops = true; -} - -/** - * bch2_printbuf_indent_add_nextline() - add to the current indent level for - * subsequent lines - * - * @buf: printbuf to control - * @spaces: number of spaces to add to the current indent level - * - * Subsequent lines - not the current line - will be indented by @spaces more - * spaces. - */ -void bch2_printbuf_indent_add_nextline(struct printbuf *buf, unsigned spaces) -{ - if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) - spaces = 0; - - buf->indent += spaces; - buf->has_indent_or_tabstops = true; -} - -/** - * bch2_printbuf_indent_sub() - subtract from the current indent level - * - * @buf: printbuf to control - * @spaces: number of spaces to subtract from the current indent level - * - * Subsequent lines, and the current line if the output position is at the start - * of the current line, will be indented by @spaces less spaces. 
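Editor's sketch: the indent helpers above are the building blocks for nested pretty-printers. A short usage sketch, assuming the semantics documented here (bch2_printbuf_indent_add_nextline() defers the indent to the next line, and bch2_prt_newline() re-emits the current indent after each '\n'); the pr_info() call and the "version"/"devices" strings are illustrative only:

    struct printbuf buf = PRINTBUF;

    prt_str(&buf, "superblock:");
    bch2_printbuf_indent_add_nextline(&buf, 2);
    bch2_prt_newline(&buf);                 /* emits '\n' plus two spaces */
    prt_str(&buf, "version: 1.13");
    bch2_prt_newline(&buf);
    prt_str(&buf, "devices: 2");
    bch2_printbuf_indent_sub(&buf, 2);

    /*
     * buf.buf now holds:
     *
     *   superblock:
     *     version: 1.13
     *     devices: 2
     */
    pr_info("%s\n", buf.buf);
    bch2_printbuf_exit(&buf);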
- */ -void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) -{ - if (WARN_ON_ONCE(spaces > buf->indent)) - spaces = buf->indent; - - if (buf->last_newline + buf->indent == buf->pos) { - buf->pos -= spaces; - printbuf_nul_terminate(buf); - } - buf->indent -= spaces; - - if (!buf->indent && !buf->nr_tabstops) - buf->has_indent_or_tabstops = false; -} - -void bch2_prt_newline(struct printbuf *buf) -{ - bch2_printbuf_make_room(buf, 1 + buf->indent); - - __prt_char_reserved(buf, '\n'); - - buf->last_newline = buf->pos; - - __prt_chars_reserved(buf, ' ', buf->indent); - - printbuf_nul_terminate_reserved(buf); - - buf->last_field = buf->pos; - buf->cur_tabstop = 0; -} - -void bch2_printbuf_strip_trailing_newline(struct printbuf *out) -{ - for (int p = out->pos - 1; p >= 0; --p) { - if (out->buf[p] == '\n') { - out->pos = p; - break; - } - if (out->buf[p] != ' ') - break; - } - - printbuf_nul_terminate_reserved(out); -} - -static void __prt_tab(struct printbuf *out) -{ - int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); - - prt_chars(out, ' ', spaces); - - out->last_field = out->pos; - out->cur_tabstop++; -} - -/** - * bch2_prt_tab() - Advance printbuf to the next tabstop - * @out: printbuf to control - * - * Advance output to the next tabstop by printing spaces. - */ -void bch2_prt_tab(struct printbuf *out) -{ - if (WARN_ON(!cur_tabstop(out))) - return; - - __prt_tab(out); -} - -static void __prt_tab_rjust(struct printbuf *buf) -{ - int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); - if (pad > 0) - printbuf_insert_spaces(buf, buf->last_field, pad); - - buf->last_field = buf->pos; - buf->cur_tabstop++; -} - -/** - * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying - * previous output - * - * @buf: printbuf to control - * - * Advance output to the next tabstop by inserting spaces immediately after the - * previous tabstop, right justifying previously outputted text. 
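Editor's sketch of the tabstop machinery documented above, assuming the column arithmetic in __prt_tab_rjust(): tabstops are cumulative offsets from the start of line, bch2_prt_tab() pads forward to the next one, and bch2_prt_tab_rjust() instead shifts the text printed since the previous tabstop so that it ends there:

    struct printbuf buf = PRINTBUF;

    bch2_printbuf_tabstop_push(&buf, 16);   /* first tabstop at column 16 */
    bch2_printbuf_tabstop_push(&buf, 12);   /* second, 12 further: column 28 */

    prt_str(&buf, "capacity");
    bch2_prt_tab(&buf);                     /* pad with spaces out to column 16 */
    prt_str(&buf, "1024");
    bch2_prt_tab_rjust(&buf);               /* shift "1024" right to end at column 28 */
    bch2_prt_newline(&buf);

Per __printbuf_do_indent() earlier in this file, '\t' and '\r' in formatted output are rewritten into these same operations, so bch2_prt_printf(&buf, "capacity\t1024\r") would produce the same line.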
- */ -void bch2_prt_tab_rjust(struct printbuf *buf) -{ - if (WARN_ON(!cur_tabstop(buf))) - return; - - __prt_tab_rjust(buf); -} - -/** - * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters - * - * @out: output printbuf - * @str: string to print - * @count: number of bytes to print - * - * The following contol characters are handled as so: - * \n: prt_newline newline that obeys current indent level - * \t: prt_tab advance to next tabstop - * \r: prt_tab_rjust advance to next tabstop, with right justification - */ -void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) -{ - unsigned indent_pos = out->pos; - prt_bytes(out, str, count); - printbuf_do_indent(out, indent_pos); -} - -/** - * bch2_prt_human_readable_u64() - Print out a u64 in human readable units - * @out: output printbuf - * @v: integer to print - * - * Units of 2^10 (default) or 10^3 are controlled via @out->si_units - */ -void bch2_prt_human_readable_u64(struct printbuf *out, u64 v) -{ - bch2_printbuf_make_room(out, 10); - unsigned len = string_get_size(v, 1, !out->si_units, - out->buf + out->pos, - printbuf_remaining_size(out)); - printbuf_advance_pos(out, len); -} - -/** - * bch2_prt_human_readable_s64() - Print out a s64 in human readable units - * @out: output printbuf - * @v: integer to print - * - * Units of 2^10 (default) or 10^3 are controlled via @out->si_units - */ -void bch2_prt_human_readable_s64(struct printbuf *out, s64 v) -{ - if (v < 0) - prt_char(out, '-'); - bch2_prt_human_readable_u64(out, abs(v)); -} - -/** - * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options - * @out: output printbuf - * @v: integer to print - * - * Units are either raw (default), or human reabable units (controlled via - * @buf->human_readable_units) - */ -void bch2_prt_units_u64(struct printbuf *out, u64 v) -{ - if (out->human_readable_units) - bch2_prt_human_readable_u64(out, v); - else - bch2_prt_printf(out, "%llu", v); -} - -/** - * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options - * @out: output printbuf - * @v: integer to print - * - * Units are either raw (default), or human reabable units (controlled via - * @buf->human_readable_units) - */ -void bch2_prt_units_s64(struct printbuf *out, s64 v) -{ - if (v < 0) - prt_char(out, '-'); - bch2_prt_units_u64(out, abs(v)); -} - -void bch2_prt_string_option(struct printbuf *out, - const char * const list[], - size_t selected) -{ - for (size_t i = 0; list[i]; i++) - bch2_prt_printf(out, i == selected ? 
"[%s] " : "%s ", list[i]); -} - -void bch2_prt_bitflags(struct printbuf *out, - const char * const list[], u64 flags) -{ - unsigned bit, nr = 0; - bool first = true; - - while (list[nr]) - nr++; - - while (flags && (bit = __ffs64(flags)) < nr) { - if (!first) - bch2_prt_printf(out, ","); - first = false; - bch2_prt_printf(out, "%s", list[bit]); - flags ^= BIT_ULL(bit); - } -} - -void bch2_prt_bitflags_vector(struct printbuf *out, - const char * const list[], - unsigned long *v, unsigned nr) -{ - bool first = true; - unsigned i; - - for (i = 0; i < nr; i++) - if (!list[i]) { - nr = i - 1; - break; - } - - for_each_set_bit(i, v, nr) { - if (!first) - bch2_prt_printf(out, ","); - first = false; - bch2_prt_printf(out, "%s", list[i]); - } -} diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h deleted file mode 100644 index 8f4e28d440ac..000000000000 --- a/fs/bcachefs/printbuf.h +++ /dev/null @@ -1,298 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1+ */ -/* Copyright (C) 2022 Kent Overstreet */ - -#ifndef _BCACHEFS_PRINTBUF_H -#define _BCACHEFS_PRINTBUF_H - -/* - * Printbufs: Simple strings for printing to, with optional heap allocation - * - * This code has provisions for use in userspace, to aid in making other code - * portable between kernelspace and userspace. - * - * Basic example: - * struct printbuf buf = PRINTBUF; - * - * prt_printf(&buf, "foo="); - * foo_to_text(&buf, foo); - * printk("%s", buf.buf); - * printbuf_exit(&buf); - * - * Or - * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size) - * - * We can now write pretty printers instead of writing code that dumps - * everything to the kernel log buffer, and then those pretty-printers can be - * used by other code that outputs to kernel log, sysfs, debugfs, etc. - * - * Memory allocation: Outputing to a printbuf may allocate memory. This - * allocation is done with GFP_KERNEL, by default: use the newer - * memalloc_*_(save|restore) functions as needed. - * - * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations - * will be done with GFP_NOWAIT if printbuf->atomic is nonzero. - * - * It's allowed to grab the output buffer and free it later with kfree() instead - * of using printbuf_exit(), if the user just needs a heap allocated string at - * the end. - * - * Memory allocation failures: We don't return errors directly, because on - * memory allocation failure we usually don't want to bail out and unwind - we - * want to print what we've got, on a best-effort basis. But code that does want - * to return -ENOMEM may check printbuf.allocation_failure. - * - * Indenting, tabstops: - * - * To aid is writing multi-line pretty printers spread across multiple - * functions, printbufs track the current indent level. - * - * printbuf_indent_push() and printbuf_indent_pop() increase and decrease the current indent - * level, respectively. - * - * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from - * start of line. Once set, prt_tab() will output spaces up to the next tabstop. - * prt_tab_rjust() will also advance the current line of text up to the next - * tabstop, but it does so by shifting text since the previous tabstop up to the - * next tabstop - right justifying it. - * - * Make sure you use prt_newline() instead of \n in the format string for indent - * level and tabstops to work corretly. - * - * Output units: printbuf->units exists to tell pretty-printers how to output - * numbers: a raw value (e.g. 
directly from a superblock field), as bytes, or as - * human readable bytes. prt_units() obeys it. - */ - -#include <linux/kernel.h> -#include <linux/string.h> - -enum printbuf_si { - PRINTBUF_UNITS_2, /* use binary powers of 2^10 */ - PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ -}; - -#define PRINTBUF_INLINE_TABSTOPS 6 - -struct printbuf { - char *buf; - unsigned size; - unsigned pos; - unsigned last_newline; - unsigned last_field; - unsigned indent; - /* - * If nonzero, allocations will be done with GFP_ATOMIC: - */ - u8 atomic; - bool allocation_failure:1; - bool heap_allocated:1; - bool overflow:1; - enum printbuf_si si_units:1; - bool human_readable_units:1; - bool has_indent_or_tabstops:1; - bool suppress_indent_tabstop_handling:1; - u8 nr_tabstops; - - /* - * Do not modify directly: use printbuf_tabstop_add(), - * printbuf_tabstop_get() - */ - u8 cur_tabstop; - u8 _tabstops[PRINTBUF_INLINE_TABSTOPS]; -}; - -int bch2_printbuf_make_room(struct printbuf *, unsigned); -__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...); -__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list); -const char *bch2_printbuf_str(const struct printbuf *); -void bch2_printbuf_exit(struct printbuf *); - -void bch2_printbuf_tabstops_reset(struct printbuf *); -void bch2_printbuf_tabstop_pop(struct printbuf *); -int bch2_printbuf_tabstop_push(struct printbuf *, unsigned); - -void bch2_printbuf_indent_add(struct printbuf *, unsigned); -void bch2_printbuf_indent_add_nextline(struct printbuf *, unsigned); -void bch2_printbuf_indent_sub(struct printbuf *, unsigned); - -void bch2_prt_newline(struct printbuf *); -void bch2_printbuf_strip_trailing_newline(struct printbuf *); -void bch2_prt_tab(struct printbuf *); -void bch2_prt_tab_rjust(struct printbuf *); - -void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned); -void bch2_prt_human_readable_u64(struct printbuf *, u64); -void bch2_prt_human_readable_s64(struct printbuf *, s64); -void bch2_prt_units_u64(struct printbuf *, u64); -void bch2_prt_units_s64(struct printbuf *, s64); -void bch2_prt_string_option(struct printbuf *, const char * const[], size_t); -void bch2_prt_bitflags(struct printbuf *, const char * const[], u64); -void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], - unsigned long *, unsigned); - -/* Initializer for a heap allocated printbuf: */ -#define PRINTBUF ((struct printbuf) { .heap_allocated = true }) - -/* Initializer a printbuf that points to an external buffer: */ -#define PRINTBUF_EXTERN(_buf, _size) \ -((struct printbuf) { \ - .buf = _buf, \ - .size = _size, \ -}) - -static inline struct printbuf bch2_printbuf_init(void) -{ - return PRINTBUF; -} - -DEFINE_CLASS(printbuf, struct printbuf, - bch2_printbuf_exit(&_T), bch2_printbuf_init(), void) - -/* - * Returns size remaining of output buffer: - */ -static inline unsigned printbuf_remaining_size(struct printbuf *out) -{ - if (WARN_ON(out->size && out->pos >= out->size)) - out->pos = out->size - 1; - return out->size - out->pos; -} - -/* - * Returns number of characters we can print to the output buffer - i.e. - * excluding the terminating nul: - */ -static inline unsigned printbuf_remaining(struct printbuf *out) -{ - return out->size ? printbuf_remaining_size(out) - 1 : 0; -} - -static inline unsigned printbuf_written(struct printbuf *out) -{ - return out->size ? 
min(out->pos, out->size - 1) : 0; -} - -static inline void printbuf_nul_terminate_reserved(struct printbuf *out) -{ - if (WARN_ON(out->size && out->pos >= out->size)) - out->pos = out->size - 1; - if (out->size) - out->buf[out->pos] = 0; -} - -static inline void printbuf_nul_terminate(struct printbuf *out) -{ - bch2_printbuf_make_room(out, 1); - printbuf_nul_terminate_reserved(out); -} - -/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ -static inline void __prt_char_reserved(struct printbuf *out, char c) -{ - if (printbuf_remaining(out)) - out->buf[out->pos++] = c; -} - -/* Doesn't nul terminate: */ -static inline void __prt_char(struct printbuf *out, char c) -{ - bch2_printbuf_make_room(out, 1); - __prt_char_reserved(out, c); -} - -static inline void prt_char(struct printbuf *out, char c) -{ - bch2_printbuf_make_room(out, 2); - __prt_char_reserved(out, c); - printbuf_nul_terminate_reserved(out); -} - -static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) -{ - unsigned can_print = min(n, printbuf_remaining(out)); - - for (unsigned i = 0; i < can_print; i++) - out->buf[out->pos++] = c; -} - -static inline void prt_chars(struct printbuf *out, char c, unsigned n) -{ - bch2_printbuf_make_room(out, n); - __prt_chars_reserved(out, c, n); - printbuf_nul_terminate_reserved(out); -} - -static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) -{ - bch2_printbuf_make_room(out, n); - - unsigned can_print = min(n, printbuf_remaining(out)); - - for (unsigned i = 0; i < can_print; i++) - out->buf[out->pos++] = ((char *) b)[i]; - - printbuf_nul_terminate(out); -} - -static inline void prt_str(struct printbuf *out, const char *str) -{ - prt_bytes(out, str, strlen(str)); -} - -static inline void prt_str_indented(struct printbuf *out, const char *str) -{ - bch2_prt_bytes_indented(out, str, strlen(str)); -} - -static inline void prt_hex_byte(struct printbuf *out, u8 byte) -{ - bch2_printbuf_make_room(out, 3); - __prt_char_reserved(out, hex_asc_hi(byte)); - __prt_char_reserved(out, hex_asc_lo(byte)); - printbuf_nul_terminate_reserved(out); -} - -static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) -{ - bch2_printbuf_make_room(out, 3); - __prt_char_reserved(out, hex_asc_upper_hi(byte)); - __prt_char_reserved(out, hex_asc_upper_lo(byte)); - printbuf_nul_terminate_reserved(out); -} - -static inline void printbuf_reset_keep_tabstops(struct printbuf *buf) -{ - buf->pos = 0; - buf->allocation_failure = 0; - buf->last_newline = 0; - buf->last_field = 0; - buf->indent = 0; - buf->cur_tabstop = 0; -} - -/** - * printbuf_reset - re-use a printbuf without freeing and re-initializing it: - */ -static inline void printbuf_reset(struct printbuf *buf) -{ - printbuf_reset_keep_tabstops(buf); - buf->nr_tabstops = 0; -} - -/** - * printbuf_atomic_inc - mark as entering an atomic section - */ -static inline void printbuf_atomic_inc(struct printbuf *buf) -{ - buf->atomic++; -} - -/** - * printbuf_atomic_inc - mark as leaving an atomic section - */ -static inline void printbuf_atomic_dec(struct printbuf *buf) -{ - buf->atomic--; -} - -#endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c deleted file mode 100644 index d09898566abe..000000000000 --- a/fs/bcachefs/progress.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "bbpos.h" -#include "disk_accounting.h" -#include "progress.h" - -void bch2_progress_init(struct progress_indicator_state *s, - struct 
bch_fs *c, - u64 btree_id_mask) -{ - memset(s, 0, sizeof(*s)); - - s->next_print = jiffies + HZ * 10; - - for (unsigned i = 0; i < BTREE_ID_NR; i++) { - if (!(btree_id_mask & BIT_ULL(i))) - continue; - - struct disk_accounting_pos acc; - disk_accounting_key_init(acc, btree, .id = i); - - u64 v; - bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); - s->nodes_total += div64_ul(v, btree_sectors(c)); - } -} - -static inline bool progress_update_p(struct progress_indicator_state *s) -{ - bool ret = time_after_eq(jiffies, s->next_print); - - if (ret) - s->next_print = jiffies + HZ * 10; - return ret; -} - -void bch2_progress_update_iter(struct btree_trans *trans, - struct progress_indicator_state *s, - struct btree_iter *iter, - const char *msg) -{ - struct bch_fs *c = trans->c; - struct btree *b = path_l(btree_iter_path(trans, iter))->b; - - s->nodes_seen += b != s->last_node; - s->last_node = b; - - if (progress_update_p(s)) { - struct printbuf buf = PRINTBUF; - unsigned percent = s->nodes_total - ? div64_u64(s->nodes_seen * 100, s->nodes_total) - : 0; - - prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", - msg, percent, s->nodes_seen, s->nodes_total); - bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); - - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } -} diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h deleted file mode 100644 index 23fb1811f943..000000000000 --- a/fs/bcachefs/progress.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_PROGRESS_H -#define _BCACHEFS_PROGRESS_H - -/* - * Lame progress indicators - * - * We don't like to use these because they print to the dmesg console, which is - * spammy - we much prefer to be wired up to a userspace program (e.g. via - * thread_with_file) and have it print the progress indicator. - * - * But some code is old and doesn't support that, or runs in a context where - * that's not yet practical (mount).
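Editor's note: progress_update_p() above is a small time-based throttle; stripped of the progress bookkeeping it reduces to the following shape (jiffies, HZ and time_after_eq() are the standard kernel interfaces; the interval is parameterized here purely for illustration):

    static bool should_print(unsigned long *next_print, unsigned interval_secs)
    {
            if (!time_after_eq(jiffies, *next_print))
                    return false;

            *next_print = jiffies + interval_secs * HZ;     /* re-arm */
            return true;
    }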
- */ - -struct progress_indicator_state { - unsigned long next_print; - u64 nodes_seen; - u64 nodes_total; - struct btree *last_node; -}; - -void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64); -void bch2_progress_update_iter(struct btree_trans *, - struct progress_indicator_state *, - struct btree_iter *, - const char *); - -#endif /* _BCACHEFS_PROGRESS_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c deleted file mode 100644 index f241efb1fb50..000000000000 --- a/fs/bcachefs/quota.c +++ /dev/null @@ -1,892 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "btree_update.h" -#include "errcode.h" -#include "error.h" -#include "inode.h" -#include "quota.h" -#include "snapshot.h" -#include "super-io.h" - -static const char * const bch2_quota_types[] = { - "user", - "group", - "project", -}; - -static const char * const bch2_quota_counters[] = { - "space", - "inodes", -}; - -static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_quota *q = field_to_type(f, quota); - - if (vstruct_bytes(&q->field) < sizeof(*q)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&q->field), sizeof(*q)); - return -BCH_ERR_invalid_sb_quota; - } - - return 0; -} - -static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_quota *q = field_to_type(f, quota); - unsigned qtyp, counter; - - for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { - prt_printf(out, "%s: flags %llx", - bch2_quota_types[qtyp], - le64_to_cpu(q->q[qtyp].flags)); - - for (counter = 0; counter < Q_COUNTERS; counter++) - prt_printf(out, " %s timelimit %u warnlimit %u", - bch2_quota_counters[counter], - le32_to_cpu(q->q[qtyp].c[counter].timelimit), - le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); - - prt_newline(out); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_quota = { - .validate = bch2_sb_quota_validate, - .to_text = bch2_sb_quota_to_text, -}; - -int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, - c, quota_type_invalid, - "invalid quota type (%llu >= %u)", - k.k->p.inode, QTYP_NR); -fsck_err: - return ret; -} - -void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); - unsigned i; - - for (i = 0; i < Q_COUNTERS; i++) - prt_printf(out, "%s hardlimit %llu softlimit %llu", - bch2_quota_counters[i], - le64_to_cpu(dq.v->c[i].hardlimit), - le64_to_cpu(dq.v->c[i].softlimit)); -} - -#ifdef CONFIG_BCACHEFS_QUOTA - -#include <linux/cred.h> -#include <linux/fs.h> -#include <linux/quota.h> - -static void qc_info_to_text(struct printbuf *out, struct qc_info *i) -{ - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 20); - - prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask); - prt_printf(out, "i_flags\t%u\n", i->i_flags); - prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit); - prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit); - prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit); - prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit); - prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit); - prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit); -} - -static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) -{ 
- printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 20); - - prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask); - prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit); - prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit); - prt_printf(out, "d_ino_hardlimit\%llu\n", q->d_ino_hardlimit); - prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit); - prt_printf(out, "d_space\t%llu\n", q->d_space); - prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count); - prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer); - prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer); - prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns); - prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns); -} - -static inline unsigned __next_qtype(unsigned i, unsigned qtypes) -{ - qtypes >>= i; - return qtypes ? i + __ffs(qtypes) : QTYP_NR; -} - -#define for_each_set_qtype(_c, _i, _q, _qtypes) \ - for (_i = 0; \ - (_i = __next_qtype(_i, _qtypes), \ - _q = &(_c)->quotas[_i], \ - _i < QTYP_NR); \ - _i++) - -static bool ignore_hardlimit(struct bch_memquota_type *q) -{ - if (capable(CAP_SYS_RESOURCE)) - return true; -#if 0 - struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; - - return capable(CAP_SYS_RESOURCE) && - (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || - !(info->dqi_flags & DQF_ROOT_SQUASH)); -#endif - return false; -} - -enum quota_msg { - SOFTWARN, /* Softlimit reached */ - SOFTLONGWARN, /* Grace time expired */ - HARDWARN, /* Hardlimit reached */ - - HARDBELOW, /* Usage got below inode hardlimit */ - SOFTBELOW, /* Usage got below inode softlimit */ -}; - -static int quota_nl[][Q_COUNTERS] = { - [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, - [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, - [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, - [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, - [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, - - [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, - [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, - [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, - [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, - [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, -}; - -struct quota_msgs { - u8 nr; - struct { - u8 qtype; - u8 msg; - } m[QTYP_NR * Q_COUNTERS]; -}; - -static void prepare_msg(unsigned qtype, - enum quota_counters counter, - struct quota_msgs *msgs, - enum quota_msg msg_type) -{ - BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); - - msgs->m[msgs->nr].qtype = qtype; - msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; - msgs->nr++; -} - -static void prepare_warning(struct memquota_counter *qc, - unsigned qtype, - enum quota_counters counter, - struct quota_msgs *msgs, - enum quota_msg msg_type) -{ - if (qc->warning_issued & (1 << msg_type)) - return; - - prepare_msg(qtype, counter, msgs, msg_type); -} - -static void flush_warnings(struct bch_qid qid, - struct super_block *sb, - struct quota_msgs *msgs) -{ - unsigned i; - - for (i = 0; i < msgs->nr; i++) - quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), - sb->s_dev, msgs->m[i].msg); -} - -static int bch2_quota_check_limit(struct bch_fs *c, - unsigned qtype, - struct bch_memquota *mq, - struct quota_msgs *msgs, - enum quota_counters counter, - s64 v, - enum quota_acct_mode mode) -{ - struct bch_memquota_type *q = &c->quotas[qtype]; - struct memquota_counter *qc = &mq->c[counter]; - u64 n = qc->v + v; - - BUG_ON((s64) n < 0); - - if (mode == KEY_TYPE_QUOTA_NOCHECK) - return 0; - - if (v <= 0) { - if (n < qc->hardlimit && - (qc->warning_issued & (1 << HARDWARN))) { - qc->warning_issued &= ~(1 << 
HARDWARN); - prepare_msg(qtype, counter, msgs, HARDBELOW); - } - - if (n < qc->softlimit && - (qc->warning_issued & (1 << SOFTWARN))) { - qc->warning_issued &= ~(1 << SOFTWARN); - prepare_msg(qtype, counter, msgs, SOFTBELOW); - } - - qc->warning_issued = 0; - return 0; - } - - if (qc->hardlimit && - qc->hardlimit < n && - !ignore_hardlimit(q)) { - prepare_warning(qc, qtype, counter, msgs, HARDWARN); - return -EDQUOT; - } - - if (qc->softlimit && - qc->softlimit < n) { - if (qc->timer == 0) { - qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit; - prepare_warning(qc, qtype, counter, msgs, SOFTWARN); - } else if (ktime_get_real_seconds() >= qc->timer && - !ignore_hardlimit(q)) { - prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); - return -EDQUOT; - } - } - - return 0; -} - -int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, - enum quota_counters counter, s64 v, - enum quota_acct_mode mode) -{ - unsigned qtypes = enabled_qtypes(c); - struct bch_memquota_type *q; - struct bch_memquota *mq[QTYP_NR]; - struct quota_msgs msgs; - unsigned i; - int ret = 0; - - memset(&msgs, 0, sizeof(msgs)); - - for_each_set_qtype(c, i, q, qtypes) { - mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL); - if (!mq[i]) - return -ENOMEM; - } - - for_each_set_qtype(c, i, q, qtypes) - mutex_lock_nested(&q->lock, i); - - for_each_set_qtype(c, i, q, qtypes) { - ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); - if (ret) - goto err; - } - - for_each_set_qtype(c, i, q, qtypes) - mq[i]->c[counter].v += v; -err: - for_each_set_qtype(c, i, q, qtypes) - mutex_unlock(&q->lock); - - flush_warnings(qid, c->vfs_sb, &msgs); - - return ret; -} - -static void __bch2_quota_transfer(struct bch_memquota *src_q, - struct bch_memquota *dst_q, - enum quota_counters counter, s64 v) -{ - BUG_ON(v > src_q->c[counter].v); - BUG_ON(v + dst_q->c[counter].v < v); - - src_q->c[counter].v -= v; - dst_q->c[counter].v += v; -} - -int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, - struct bch_qid dst, - struct bch_qid src, u64 space, - enum quota_acct_mode mode) -{ - struct bch_memquota_type *q; - struct bch_memquota *src_q[3], *dst_q[3]; - struct quota_msgs msgs; - unsigned i; - int ret = 0; - - qtypes &= enabled_qtypes(c); - - memset(&msgs, 0, sizeof(msgs)); - - for_each_set_qtype(c, i, q, qtypes) { - src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL); - dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL); - if (!src_q[i] || !dst_q[i]) - return -ENOMEM; - } - - for_each_set_qtype(c, i, q, qtypes) - mutex_lock_nested(&q->lock, i); - - for_each_set_qtype(c, i, q, qtypes) { - ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, - dst_q[i]->c[Q_SPC].v + space, - mode); - if (ret) - goto err; - - ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, - dst_q[i]->c[Q_INO].v + 1, - mode); - if (ret) - goto err; - } - - for_each_set_qtype(c, i, q, qtypes) { - __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); - __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); - } - -err: - for_each_set_qtype(c, i, q, qtypes) - mutex_unlock(&q->lock); - - flush_warnings(dst, c->vfs_sb, &msgs); - - return ret; -} - -static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, - struct qc_dqblk *qdq) -{ - struct bkey_s_c_quota dq; - struct bch_memquota_type *q; - struct bch_memquota *mq; - unsigned i; - - BUG_ON(k.k->p.inode >= QTYP_NR); - - if (!((1U << k.k->p.inode) & enabled_qtypes(c))) - return 0; - - switch (k.k->type) { - case KEY_TYPE_quota: - dq = 
bkey_s_c_to_quota(k); - q = &c->quotas[k.k->p.inode]; - - mutex_lock(&q->lock); - mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); - if (!mq) { - mutex_unlock(&q->lock); - return -ENOMEM; - } - - for (i = 0; i < Q_COUNTERS; i++) { - mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); - mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); - } - - if (qdq && qdq->d_fieldmask & QC_SPC_TIMER) - mq->c[Q_SPC].timer = qdq->d_spc_timer; - if (qdq && qdq->d_fieldmask & QC_SPC_WARNS) - mq->c[Q_SPC].warns = qdq->d_spc_warns; - if (qdq && qdq->d_fieldmask & QC_INO_TIMER) - mq->c[Q_INO].timer = qdq->d_ino_timer; - if (qdq && qdq->d_fieldmask & QC_INO_WARNS) - mq->c[Q_INO].warns = qdq->d_ino_warns; - - mutex_unlock(&q->lock); - } - - return 0; -} - -void bch2_fs_quota_exit(struct bch_fs *c) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->quotas); i++) - genradix_free(&c->quotas[i].table); -} - -void bch2_fs_quota_init(struct bch_fs *c) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->quotas); i++) - mutex_init(&c->quotas[i].lock); -} - -static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb) -{ - struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota); - - if (sb_quota) - return sb_quota; - - sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64)); - if (sb_quota) { - unsigned qtype, qc; - - for (qtype = 0; qtype < QTYP_NR; qtype++) - for (qc = 0; qc < Q_COUNTERS; qc++) - sb_quota->q[qtype].c[qc].timelimit = - cpu_to_le32(7 * 24 * 60 * 60); - } - - return sb_quota; -} - -static void bch2_sb_quota_read(struct bch_fs *c) -{ - struct bch_sb_field_quota *sb_quota; - unsigned i, j; - - sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota); - if (!sb_quota) - return; - - for (i = 0; i < QTYP_NR; i++) { - struct bch_memquota_type *q = &c->quotas[i]; - - for (j = 0; j < Q_COUNTERS; j++) { - q->limits[j].timelimit = - le32_to_cpu(sb_quota->q[i].c[j].timelimit); - q->limits[j].warnlimit = - le32_to_cpu(sb_quota->q[i].c[j].warnlimit); - } - } -} - -static int bch2_fs_quota_read_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bch_inode_unpacked u; - struct bch_snapshot_tree s_t; - u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot); - - int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "%s: snapshot tree %u not found", __func__, tree); - if (ret) - return ret; - - if (!s_t.master_subvol) - goto advance; - - ret = bch2_inode_find_by_inum_nowarn_trans(trans, - (subvol_inum) { - le32_to_cpu(s_t.master_subvol), - k.k->p.offset, - }, &u); - /* - * Inode might be deleted in this snapshot - the easiest way to handle - * that is to just skip it here: - */ - if (bch2_err_matches(ret, ENOENT)) - goto advance; - - if (ret) - return ret; - - bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, - KEY_TYPE_QUOTA_NOCHECK); - bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, - KEY_TYPE_QUOTA_NOCHECK); -advance: - bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); - return 0; -} - -int bch2_fs_quota_read(struct bch_fs *c) -{ - - mutex_lock(&c->sb_lock); - struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); - if (!sb_quota) { - mutex_unlock(&c->sb_lock); - return bch_err_throw(c, ENOSPC_sb_quota); - } - - bch2_sb_quota_read(c); - mutex_unlock(&c->sb_lock); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, - 
BTREE_ITER_prefetch, k, - __bch2_quota_set(c, k, NULL)) ?: - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - bch2_fs_quota_read_inode(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -/* Enable/disable/delete quotas for an entire filesystem: */ - -static int bch2_quota_enable(struct super_block *sb, unsigned uflags) -{ - struct bch_fs *c = sb->s_fs_info; - struct bch_sb_field_quota *sb_quota; - int ret = 0; - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - /* Accounting must be enabled at mount time: */ - if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) - return -EINVAL; - - /* Can't enable enforcement without accounting: */ - if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) - return -EINVAL; - - if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) - return -EINVAL; - - if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) - return -EINVAL; - - mutex_lock(&c->sb_lock); - sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); - if (!sb_quota) { - ret = bch_err_throw(c, ENOSPC_sb_quota); - goto unlock; - } - - if (uflags & FS_QUOTA_UDQ_ENFD) - SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); - - if (uflags & FS_QUOTA_GDQ_ENFD) - SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); - - if (uflags & FS_QUOTA_PDQ_ENFD) - SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); - - bch2_write_super(c); -unlock: - mutex_unlock(&c->sb_lock); - - return bch2_err_class(ret); -} - -static int bch2_quota_disable(struct super_block *sb, unsigned uflags) -{ - struct bch_fs *c = sb->s_fs_info; - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - mutex_lock(&c->sb_lock); - if (uflags & FS_QUOTA_UDQ_ENFD) - SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); - - if (uflags & FS_QUOTA_GDQ_ENFD) - SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); - - if (uflags & FS_QUOTA_PDQ_ENFD) - SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return 0; -} - -static int bch2_quota_remove(struct super_block *sb, unsigned uflags) -{ - struct bch_fs *c = sb->s_fs_info; - int ret; - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - if (uflags & FS_USER_QUOTA) { - if (c->opts.usrquota) - return -EINVAL; - - ret = bch2_btree_delete_range(c, BTREE_ID_quotas, - POS(QTYP_USR, 0), - POS(QTYP_USR, U64_MAX), - 0, NULL); - if (ret) - return ret; - } - - if (uflags & FS_GROUP_QUOTA) { - if (c->opts.grpquota) - return -EINVAL; - - ret = bch2_btree_delete_range(c, BTREE_ID_quotas, - POS(QTYP_GRP, 0), - POS(QTYP_GRP, U64_MAX), - 0, NULL); - if (ret) - return ret; - } - - if (uflags & FS_PROJ_QUOTA) { - if (c->opts.prjquota) - return -EINVAL; - - ret = bch2_btree_delete_range(c, BTREE_ID_quotas, - POS(QTYP_PRJ, 0), - POS(QTYP_PRJ, U64_MAX), - 0, NULL); - if (ret) - return ret; - } - - return 0; -} - -/* - * Return quota status information, such as enforcements, quota file inode - * numbers etc. 
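Editor's sketch: quota types are tracked as a small bitmask (enabled_qtypes()), and __next_qtype()/for_each_set_qtype() earlier in this file walk its set bits. A userspace stand-in, with __builtin_ctz() taking the place of the kernel's __ffs():

    #include <stdio.h>

    #define QTYP_NR 3   /* user, group, project */

    static unsigned next_set(unsigned i, unsigned mask)
    {
            mask >>= i;
            return mask ? i + __builtin_ctz(mask) : QTYP_NR;
    }

    int main(void)
    {
            unsigned qtypes = 0x5;  /* e.g. user and project quotas enabled */

            for (unsigned i = 0; (i = next_set(i, qtypes)) < QTYP_NR; i++)
                    printf("qtype %u enabled\n", i);        /* prints 0, then 2 */
            return 0;
    }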
- */ -static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) -{ - struct bch_fs *c = sb->s_fs_info; - unsigned qtypes = enabled_qtypes(c); - unsigned i; - - memset(state, 0, sizeof(*state)); - - for (i = 0; i < QTYP_NR; i++) { - state->s_state[i].flags |= QCI_SYSFILE; - - if (!(qtypes & (1 << i))) - continue; - - state->s_state[i].flags |= QCI_ACCT_ENABLED; - - state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; - state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; - - state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; - state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; - } - - return 0; -} - -/* - * Adjust quota timers & warnings - */ -static int bch2_quota_set_info(struct super_block *sb, int type, - struct qc_info *info) -{ - struct bch_fs *c = sb->s_fs_info; - struct bch_sb_field_quota *sb_quota; - int ret = 0; - - if (0) { - struct printbuf buf = PRINTBUF; - - qc_info_to_text(&buf, info); - pr_info("setting:\n%s", buf.buf); - printbuf_exit(&buf); - } - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - if (type >= QTYP_NR) - return -EINVAL; - - if (!((1 << type) & enabled_qtypes(c))) - return -ESRCH; - - if (info->i_fieldmask & - ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) - return -EINVAL; - - mutex_lock(&c->sb_lock); - sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); - if (!sb_quota) { - ret = bch_err_throw(c, ENOSPC_sb_quota); - goto unlock; - } - - if (info->i_fieldmask & QC_SPC_TIMER) - sb_quota->q[type].c[Q_SPC].timelimit = - cpu_to_le32(info->i_spc_timelimit); - - if (info->i_fieldmask & QC_SPC_WARNS) - sb_quota->q[type].c[Q_SPC].warnlimit = - cpu_to_le32(info->i_spc_warnlimit); - - if (info->i_fieldmask & QC_INO_TIMER) - sb_quota->q[type].c[Q_INO].timelimit = - cpu_to_le32(info->i_ino_timelimit); - - if (info->i_fieldmask & QC_INO_WARNS) - sb_quota->q[type].c[Q_INO].warnlimit = - cpu_to_le32(info->i_ino_warnlimit); - - bch2_sb_quota_read(c); - - bch2_write_super(c); -unlock: - mutex_unlock(&c->sb_lock); - - return bch2_err_class(ret); -} - -/* Get/set individual quotas: */ - -static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) -{ - dst->d_space = src->c[Q_SPC].v << 9; - dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; - dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; - dst->d_spc_timer = src->c[Q_SPC].timer; - dst->d_spc_warns = src->c[Q_SPC].warns; - - dst->d_ino_count = src->c[Q_INO].v; - dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; - dst->d_ino_softlimit = src->c[Q_INO].softlimit; - dst->d_ino_timer = src->c[Q_INO].timer; - dst->d_ino_warns = src->c[Q_INO].warns; -} - -static int bch2_get_quota(struct super_block *sb, struct kqid kqid, - struct qc_dqblk *qdq) -{ - struct bch_fs *c = sb->s_fs_info; - struct bch_memquota_type *q = &c->quotas[kqid.type]; - qid_t qid = from_kqid(&init_user_ns, kqid); - struct bch_memquota *mq; - - memset(qdq, 0, sizeof(*qdq)); - - mutex_lock(&q->lock); - mq = genradix_ptr(&q->table, qid); - if (mq) - __bch2_quota_get(qdq, mq); - mutex_unlock(&q->lock); - - return 0; -} - -static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, - struct qc_dqblk *qdq) -{ - struct bch_fs *c = sb->s_fs_info; - struct bch_memquota_type *q = &c->quotas[kqid->type]; - qid_t qid = from_kqid(&init_user_ns, *kqid); - struct genradix_iter iter; - struct bch_memquota *mq; - int ret = 0; - - mutex_lock(&q->lock); - - genradix_for_each_from(&q->table, iter, mq, qid) - if (memcmp(mq, 
page_address(ZERO_PAGE(0)), sizeof(*mq))) { - __bch2_quota_get(qdq, mq); - *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); - goto found; - } - - ret = -ENOENT; -found: - mutex_unlock(&q->lock); - return bch2_err_class(ret); -} - -static int bch2_set_quota_trans(struct btree_trans *trans, - struct bkey_i_quota *new_quota, - struct qc_dqblk *qdq) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, - BTREE_ITER_slots|BTREE_ITER_intent); - ret = bkey_err(k); - if (unlikely(ret)) - return ret; - - if (k.k->type == KEY_TYPE_quota) - new_quota->v = *bkey_s_c_to_quota(k).v; - - if (qdq->d_fieldmask & QC_SPC_SOFT) - new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); - if (qdq->d_fieldmask & QC_SPC_HARD) - new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); - - if (qdq->d_fieldmask & QC_INO_SOFT) - new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); - if (qdq->d_fieldmask & QC_INO_HARD) - new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - - ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_set_quota(struct super_block *sb, struct kqid qid, - struct qc_dqblk *qdq) -{ - struct bch_fs *c = sb->s_fs_info; - struct bkey_i_quota new_quota; - int ret; - - if (0) { - struct printbuf buf = PRINTBUF; - - qc_dqblk_to_text(&buf, qdq); - pr_info("setting:\n%s", buf.buf); - printbuf_exit(&buf); - } - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - bkey_quota_init(&new_quota.k_i); - new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - - ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_set_quota_trans(trans, &new_quota, qdq)) ?: - __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); - - return bch2_err_class(ret); -} - -const struct quotactl_ops bch2_quotactl_operations = { - .quota_enable = bch2_quota_enable, - .quota_disable = bch2_quota_disable, - .rm_xquota = bch2_quota_remove, - - .get_state = bch2_quota_get_state, - .set_info = bch2_quota_set_info, - - .get_dqblk = bch2_get_quota, - .get_nextdqblk = bch2_get_next_quota, - .set_dqblk = bch2_set_quota, -}; - -#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h deleted file mode 100644 index 1551800ff44c..000000000000 --- a/fs/bcachefs/quota.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_QUOTA_H -#define _BCACHEFS_QUOTA_H - -#include "inode.h" -#include "quota_types.h" - -extern const struct bch_sb_field_ops bch_sb_field_ops_quota; - -int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_quota ((struct bkey_ops) { \ - .key_validate = bch2_quota_validate, \ - .val_to_text = bch2_quota_to_text, \ - .min_val_size = 32, \ -}) - -static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) -{ - return (struct bch_qid) { - .q[QTYP_USR] = u->bi_uid, - .q[QTYP_GRP] = u->bi_gid, - .q[QTYP_PRJ] = u->bi_project ? 
u->bi_project - 1 : 0, - }; -} - -static inline unsigned enabled_qtypes(struct bch_fs *c) -{ - return ((c->opts.usrquota << QTYP_USR)| - (c->opts.grpquota << QTYP_GRP)| - (c->opts.prjquota << QTYP_PRJ)); -} - -#ifdef CONFIG_BCACHEFS_QUOTA - -int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, - s64, enum quota_acct_mode); - -int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, - struct bch_qid, u64, enum quota_acct_mode); - -void bch2_fs_quota_exit(struct bch_fs *); -void bch2_fs_quota_init(struct bch_fs *); -int bch2_fs_quota_read(struct bch_fs *); - -extern const struct quotactl_ops bch2_quotactl_operations; - -#else - -static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, - enum quota_counters counter, s64 v, - enum quota_acct_mode mode) -{ - return 0; -} - -static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, - struct bch_qid dst, - struct bch_qid src, u64 space, - enum quota_acct_mode mode) -{ - return 0; -} - -static inline void bch2_fs_quota_exit(struct bch_fs *c) {} -static inline void bch2_fs_quota_init(struct bch_fs *c) {} -static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } - -#endif - -#endif /* _BCACHEFS_QUOTA_H */ diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h deleted file mode 100644 index dc34347ef6c7..000000000000 --- a/fs/bcachefs/quota_format.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_QUOTA_FORMAT_H -#define _BCACHEFS_QUOTA_FORMAT_H - -/* KEY_TYPE_quota: */ - -enum quota_types { - QTYP_USR = 0, - QTYP_GRP = 1, - QTYP_PRJ = 2, - QTYP_NR = 3, -}; - -enum quota_counters { - Q_SPC = 0, - Q_INO = 1, - Q_COUNTERS = 2, -}; - -struct bch_quota_counter { - __le64 hardlimit; - __le64 softlimit; -}; - -struct bch_quota { - struct bch_val v; - struct bch_quota_counter c[Q_COUNTERS]; -} __packed __aligned(8); - -/* BCH_SB_FIELD_quota: */ - -struct bch_sb_quota_counter { - __le32 timelimit; - __le32 warnlimit; -}; - -struct bch_sb_quota_type { - __le64 flags; - struct bch_sb_quota_counter c[Q_COUNTERS]; -}; - -struct bch_sb_field_quota { - struct bch_sb_field field; - struct bch_sb_quota_type q[QTYP_NR]; -} __packed __aligned(8); - -#endif /* _BCACHEFS_QUOTA_FORMAT_H */ diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h deleted file mode 100644 index 6a136083d389..000000000000 --- a/fs/bcachefs/quota_types.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_QUOTA_TYPES_H -#define _BCACHEFS_QUOTA_TYPES_H - -#include <linux/generic-radix-tree.h> - -struct bch_qid { - u32 q[QTYP_NR]; -}; - -enum quota_acct_mode { - KEY_TYPE_QUOTA_PREALLOC, - KEY_TYPE_QUOTA_WARN, - KEY_TYPE_QUOTA_NOCHECK, -}; - -struct memquota_counter { - u64 v; - u64 hardlimit; - u64 softlimit; - s64 timer; - int warns; - int warning_issued; -}; - -struct bch_memquota { - struct memquota_counter c[Q_COUNTERS]; -}; - -typedef GENRADIX(struct bch_memquota) bch_memquota_table; - -struct quota_limit { - u32 timelimit; - u32 warnlimit; -}; - -struct bch_memquota_type { - struct quota_limit limits[Q_COUNTERS]; - bch_memquota_table table; - struct mutex lock; -}; - -#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c deleted file mode 100644 index b1438be9d690..000000000000 --- a/fs/bcachefs/rcu_pending.c +++ /dev/null @@ -1,666 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define pr_fmt(fmt) "%s() " fmt "\n", __func__ - -#include <linux/generic-radix-tree.h> 
-#include <linux/mm.h> -#include <linux/percpu.h> -#include <linux/slab.h> -#include <linux/srcu.h> -#include <linux/vmalloc.h> - -#include "rcu_pending.h" -#include "darray.h" -#include "util.h" - -#define static_array_for_each(_a, _i) \ - for (typeof(&(_a)[0]) _i = _a; \ - _i < (_a) + ARRAY_SIZE(_a); \ - _i++) - -enum rcu_pending_special { - RCU_PENDING_KVFREE = 1, - RCU_PENDING_CALL_RCU = 2, -}; - -#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE) -#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU) - -#ifdef __KERNEL__ -typedef unsigned long rcu_gp_poll_state_t; - -static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) -{ - return l == r; -} -#else -typedef struct urcu_gp_poll_state rcu_gp_poll_state_t; - -static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) -{ - return l.grace_period_id == r.grace_period_id; -} -#endif - -static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp) -{ - return ssp - ? get_state_synchronize_srcu(ssp) - : get_state_synchronize_rcu(); -} - -static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp) -{ - return ssp - ? start_poll_synchronize_srcu(ssp) - : start_poll_synchronize_rcu(); -} - -static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie) -{ - return ssp - ? poll_state_synchronize_srcu(ssp, cookie) - : poll_state_synchronize_rcu(cookie); -} - -static inline void __rcu_barrier(struct srcu_struct *ssp) -{ - return ssp - ? srcu_barrier(ssp) - : rcu_barrier(); -} - -static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp, - rcu_callback_t func) -{ - if (ssp) - call_srcu(ssp, rhp, func); - else - call_rcu(rhp, func); -} - -struct rcu_pending_seq { - /* - * We're using a radix tree like a vector - we're just pushing elements - * onto the end; we're using a radix tree instead of an actual vector to - * avoid reallocation overhead - */ - GENRADIX(struct rcu_head *) objs; - size_t nr; - struct rcu_head **cursor; - rcu_gp_poll_state_t seq; -}; - -struct rcu_pending_list { - struct rcu_head *head; - struct rcu_head *tail; - rcu_gp_poll_state_t seq; -}; - -struct rcu_pending_pcpu { - struct rcu_pending *parent; - spinlock_t lock; - int cpu; - - /* - * We can't bound the number of unprocessed gp sequence numbers, and we - * can't efficiently merge radix trees for expired grace periods, so we - * need darray/vector: - */ - DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs; - - /* Third entry is for expired objects: */ - struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1]; - - struct rcu_head cb; - bool cb_armed; - struct work_struct work; -}; - -static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p) -{ - if (p->objs.nr) - return true; - - static_array_for_each(p->lists, i) - if (i->head) - return true; - - return false; -} - -static void rcu_pending_list_merge(struct rcu_pending_list *l1, - struct rcu_pending_list *l2) -{ -#ifdef __KERNEL__ - if (!l1->head) - l1->head = l2->head; - else - l1->tail->next = l2->head; -#else - if (!l1->head) - l1->head = l2->head; - else - l1->tail->next.next = (void *) l2->head; -#endif - - l1->tail = l2->tail; - l2->head = l2->tail = NULL; -} - -static void rcu_pending_list_add(struct rcu_pending_list *l, - struct rcu_head *n) -{ -#ifdef __KERNEL__ - if (!l->head) - l->head = n; - else - l->tail->next = n; - l->tail = n; - n->next = NULL; -#else - if 
(!l->head) - l->head = n; - else - l->tail->next.next = (void *) n; - l->tail = n; - n->next.next = NULL; -#endif -} - -static void merge_expired_lists(struct rcu_pending_pcpu *p) -{ - struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; - - for (struct rcu_pending_list *i = p->lists; i < expired; i++) - if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq)) - rcu_pending_list_merge(expired, i); -} - -#ifndef __KERNEL__ -static inline void kfree_bulk(size_t nr, void ** p) -{ - while (nr--) - kfree(*p); -} -#endif - -static noinline void __process_finished_items(struct rcu_pending *pending, - struct rcu_pending_pcpu *p, - unsigned long flags) -{ - struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; - struct rcu_pending_seq objs = {}; - struct rcu_head *list = NULL; - - if (p->objs.nr && - __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) { - objs = p->objs.data[0]; - darray_remove_item(&p->objs, p->objs.data); - } - - merge_expired_lists(p); - - list = expired->head; - expired->head = expired->tail = NULL; - - spin_unlock_irqrestore(&p->lock, flags); - - switch ((ulong) pending->process) { - case RCU_PENDING_KVFREE: - for (size_t i = 0; i < objs.nr; ) { - size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i); - - kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i)); - i += nr_this_node; - } - genradix_free(&objs.objs); - - while (list) { - struct rcu_head *obj = list; -#ifdef __KERNEL__ - list = obj->next; -#else - list = (void *) obj->next.next; -#endif - - /* - * low bit of pointer indicates whether rcu_head needs - * to be freed - kvfree_rcu_mightsleep() - */ - BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0); - - void *ptr = (void *)(((unsigned long) obj->func) & ~1UL); - bool free_head = ((unsigned long) obj->func) & 1UL; - - kvfree(ptr); - if (free_head) - kfree(obj); - } - - break; - - case RCU_PENDING_CALL_RCU: - for (size_t i = 0; i < objs.nr; i++) { - struct rcu_head *obj = *genradix_ptr(&objs.objs, i); - obj->func(obj); - } - genradix_free(&objs.objs); - - while (list) { - struct rcu_head *obj = list; -#ifdef __KERNEL__ - list = obj->next; -#else - list = (void *) obj->next.next; -#endif - obj->func(obj); - } - break; - - default: - for (size_t i = 0; i < objs.nr; i++) - pending->process(pending, *genradix_ptr(&objs.objs, i)); - genradix_free(&objs.objs); - - while (list) { - struct rcu_head *obj = list; -#ifdef __KERNEL__ - list = obj->next; -#else - list = (void *) obj->next.next; -#endif - pending->process(pending, obj); - } - break; - } -} - -static bool process_finished_items(struct rcu_pending *pending, - struct rcu_pending_pcpu *p, - unsigned long flags) -{ - /* - * XXX: we should grab the gp seq once and avoid multiple function - * calls, this is called from __rcu_pending_enqueue() fastpath in - * may_sleep==true mode - */ - if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) || - (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) || - (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) || - p->lists[2].head) { - __process_finished_items(pending, p, flags); - return true; - } - - return false; -} - -static void rcu_pending_work(struct work_struct *work) -{ - struct rcu_pending_pcpu *p = - container_of(work, struct rcu_pending_pcpu, work); - struct rcu_pending *pending = p->parent; - unsigned long flags; - - do { - spin_lock_irqsave(&p->lock, flags); - } while 
(process_finished_items(pending, p, flags)); - - spin_unlock_irqrestore(&p->lock, flags); -} - -static void rcu_pending_rcu_cb(struct rcu_head *rcu) -{ - struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb); - - schedule_work_on(p->cpu, &p->work); - - unsigned long flags; - spin_lock_irqsave(&p->lock, flags); - if (__rcu_pending_has_pending(p)) { - spin_unlock_irqrestore(&p->lock, flags); - __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb); - } else { - p->cb_armed = false; - spin_unlock_irqrestore(&p->lock, flags); - } -} - -static __always_inline struct rcu_pending_seq * -get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq) -{ - darray_for_each_reverse(p->objs, objs) - if (rcu_gp_poll_cookie_eq(objs->seq, seq)) - return objs; - - if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC)) - return NULL; - - return &darray_last(p->objs); -} - -static noinline bool -rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq, - struct rcu_head *head, void *ptr, - unsigned long *flags) -{ - if (ptr) { - if (!head) { - /* - * kvfree_rcu_mightsleep(): we weren't passed an - * rcu_head, but we need one: use the low bit of the - * pointer to free to flag that the head needs to be - * freed as well: - */ - ptr = (void *)(((unsigned long) ptr)|1UL); - head = kmalloc(sizeof(*head), __GFP_NOWARN); - if (!head) { - spin_unlock_irqrestore(&p->lock, *flags); - head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL); - /* - * dropped lock, did GFP_KERNEL allocation, - * check for gp expiration - */ - if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) { - kvfree(--ptr); - kfree(head); - spin_lock_irqsave(&p->lock, *flags); - return false; - } - } - } - - head->func = ptr; - } -again: - for (struct rcu_pending_list *i = p->lists; - i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { - if (rcu_gp_poll_cookie_eq(i->seq, seq)) { - rcu_pending_list_add(i, head); - return false; - } - } - - for (struct rcu_pending_list *i = p->lists; - i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { - if (!i->head) { - i->seq = seq; - rcu_pending_list_add(i, head); - return true; - } - } - - merge_expired_lists(p); - goto again; -} - -/* - * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via - * pending->process) once a grace period elapses. - * - * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall - * back to a linked list. - * - * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a - * process callback - * - * - If @ptr and @head are both not NULL, we're kvfree_rcu() - * - * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep() - * - * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process - * expired items.
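- * - * Call-pattern sketch for the three modes (a hedged illustration; "obj" and - * its embedded rcu_head are hypothetical): - * - * rcu_pending_enqueue(pending, &obj->rcu); // generic: @ptr NULL - * __rcu_pending_enqueue(pending, &obj->rcu, obj, true); // kvfree_rcu() - * __rcu_pending_enqueue(pending, NULL, obj, true); // kvfree_rcu_mightsleep()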
- */ -static __always_inline void -__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, - void *ptr, bool may_sleep) -{ - - struct rcu_pending_pcpu *p; - struct rcu_pending_seq *objs; - struct genradix_node *new_node = NULL; - unsigned long flags; - bool start_gp = false; - - BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); - - /* We could technically be scheduled before taking the lock and end up - * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock - * anyway - * - * And we have to do it this way to avoid breaking PREEMPT_RT, which - * redefines how spinlocks work: - */ - p = raw_cpu_ptr(pending->p); - spin_lock_irqsave(&p->lock, flags); - rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); -restart: - if (may_sleep && - unlikely(process_finished_items(pending, p, flags))) - goto check_expired; - - /* - * In kvfree_rcu() mode, the radix tree is only for slab pointers so - * that we can do kfree_bulk() - vmalloc pointers always use the linked - * list: - */ - if (ptr && unlikely(is_vmalloc_addr(ptr))) - goto list_add; - - objs = get_object_radix(p, seq); - if (unlikely(!objs)) - goto list_add; - - if (unlikely(!objs->cursor)) { - /* - * New radix tree nodes must be added under @p->lock because the - * tree root is in a darray that can be resized (typically, - * genradix supports concurrent unlocked allocation of new - * nodes) - hence preallocation and the retry loop: - */ - objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs, - objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN); - if (unlikely(!objs->cursor)) { - if (may_sleep) { - spin_unlock_irqrestore(&p->lock, flags); - - gfp_t gfp = GFP_KERNEL; - if (!head) - gfp |= __GFP_NOFAIL; - - new_node = genradix_alloc_node(gfp); - if (!new_node) - may_sleep = false; - goto check_expired; - } -list_add: - start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags); - goto start_gp; - } - } - - *objs->cursor++ = ptr ?: head; - /* zero cursor if we hit the end of a radix tree node: */ - if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1))) - objs->cursor = NULL; - start_gp = !objs->nr; - objs->nr++; -start_gp: - if (unlikely(start_gp)) { - /* - * We only have one callback (ideally, we would have one for - * every outstanding grace period) - so if our callback is - * already in flight, we may still have to start a grace period - * (since we used get_state() above, not start_poll()) - */ - if (!p->cb_armed) { - p->cb_armed = true; - __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb); - } else { - __start_poll_synchronize_rcu(pending->srcu); - } - } - spin_unlock_irqrestore(&p->lock, flags); -free_node: - if (new_node) - genradix_free_node(new_node); - return; -check_expired: - if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) { - switch ((ulong) pending->process) { - case RCU_PENDING_KVFREE: - kvfree(ptr); - break; - case RCU_PENDING_CALL_RCU: - head->func(head); - break; - default: - pending->process(pending, head); - break; - } - goto free_node; - } - - p = raw_cpu_ptr(pending->p); - spin_lock_irqsave(&p->lock, flags); - goto restart; -} - -void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj) -{ - __rcu_pending_enqueue(pending, obj, NULL, true); -} - -static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p) -{ - struct rcu_head *ret = NULL; - - spin_lock_irq(&p->lock); - darray_for_each(p->objs, objs) - if (objs->nr) { - ret = *genradix_ptr(&objs->objs, --objs->nr); - objs->cursor = NULL; - 
if (!objs->nr) - genradix_free(&objs->objs); - goto out; - } - - static_array_for_each(p->lists, i) - if (i->head) { - ret = i->head; -#ifdef __KERNEL__ - i->head = ret->next; -#else - i->head = (void *) ret->next.next; -#endif - if (!i->head) - i->tail = NULL; - goto out; - } -out: - spin_unlock_irq(&p->lock); - - return ret; -} - -struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending) -{ - return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p)); -} - -struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending) -{ - struct rcu_head *ret = rcu_pending_dequeue(pending); - - if (ret) - return ret; - - int cpu; - for_each_possible_cpu(cpu) { - ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu)); - if (ret) - break; - } - return ret; -} - -static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending) -{ - int cpu; - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - spin_lock_irq(&p->lock); - if (__rcu_pending_has_pending(p) || p->cb_armed) { - spin_unlock_irq(&p->lock); - return true; - } - spin_unlock_irq(&p->lock); - } - - return false; -} - -void rcu_pending_exit(struct rcu_pending *pending) -{ - int cpu; - - if (!pending->p) - return; - - while (rcu_pending_has_pending_or_armed(pending)) { - __rcu_barrier(pending->srcu); - - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - flush_work(&p->work); - } - } - - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - flush_work(&p->work); - } - - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - - static_array_for_each(p->lists, i) - WARN_ON(i->head); - WARN_ON(p->objs.nr); - darray_exit(&p->objs); - } - free_percpu(pending->p); -} - -/** - * rcu_pending_init: - initialize a rcu_pending - * - * @pending: Object to init - * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal - * RCU flavor - * @process: Callback function invoked on objects once their RCU barriers - * have completed; if NULL, kvfree() is used. 
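- * - * Typical lifecycle, as a sketch (my_process and struct my_obj are - * hypothetical caller-side names; a NULL @srcu selects plain RCU): - * - * static void my_process(struct rcu_pending *p, struct rcu_head *h) - * { - * kfree(container_of(h, struct my_obj, rcu)); - * } - * - * int ret = rcu_pending_init(&pending, NULL, my_process); - * if (!ret) { - * rcu_pending_enqueue(&pending, &obj->rcu); - * ... - * rcu_pending_exit(&pending); - * }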
- */ -int rcu_pending_init(struct rcu_pending *pending, - struct srcu_struct *srcu, - rcu_pending_process_fn process) -{ - pending->p = alloc_percpu(struct rcu_pending_pcpu); - if (!pending->p) - return -ENOMEM; - - int cpu; - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - p->parent = pending; - p->cpu = cpu; - spin_lock_init(&p->lock); - darray_init(&p->objs); - INIT_WORK(&p->work, rcu_pending_work); - } - - pending->srcu = srcu; - pending->process = process; - - return 0; -} diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h deleted file mode 100644 index 71a2f4ddaade..000000000000 --- a/fs/bcachefs/rcu_pending.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_RCU_PENDING_H -#define _LINUX_RCU_PENDING_H - -#include <linux/rcupdate.h> - -struct rcu_pending; -typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *); - -struct rcu_pending_pcpu; - -struct rcu_pending { - struct rcu_pending_pcpu __percpu *p; - struct srcu_struct *srcu; - rcu_pending_process_fn process; -}; - -void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj); -struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending); -struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending); - -void rcu_pending_exit(struct rcu_pending *pending); -int rcu_pending_init(struct rcu_pending *pending, - struct srcu_struct *srcu, - rcu_pending_process_fn process); - -#endif /* _LINUX_RCU_PENDING_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c deleted file mode 100644 index 1c345b86b1c0..000000000000 --- a/fs/bcachefs/rebalance.c +++ /dev/null @@ -1,889 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "clock.h" -#include "compress.h" -#include "disk_groups.h" -#include "errcode.h" -#include "error.h" -#include "inode.h" -#include "io_write.h" -#include "move.h" -#include "rebalance.h" -#include "subvolume.h" -#include "super-io.h" -#include "trace.h" - -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/sched/cputime.h> - -/* bch_extent_rebalance: */ - -static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) -{ - const union bch_extent_entry *entry; - - bkey_extent_entry_for_each(ptrs, entry) - if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) - return &entry->rebalance; - - return NULL; -} - -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) -{ - return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); -} - -static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s_c k, - struct bkey_ptrs_c ptrs) -{ - if (!opts->background_compression) - return 0; - - unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned ptr_bit = 1; - unsigned rewrite_ptrs = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) - return 0; - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - - return rewrite_ptrs; -} - -static inline unsigned 
bch2_bkey_ptrs_need_move(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_ptrs_c ptrs) -{ - if (!opts->background_target || - !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) - return 0; - - unsigned ptr_bit = 1; - unsigned rewrite_ptrs = 0; - - guard(rcu)(); - bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - - return rewrite_ptrs; -} - -static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return 0; - - return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | - bch2_bkey_ptrs_need_move(c, opts, ptrs); -} - -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs); - if (!opts) - return 0; - - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return 0; - - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - u64 sectors = 0; - - if (opts->background_compression) { - unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) { - sectors = 0; - goto incompressible; - } - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - sectors += p.crc.compressed_size; - } - } -incompressible: - if (opts->background_target) { - guard(rcu)(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && - !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) - sectors += p.crc.compressed_size; - } - - return sectors; -} - -static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, - struct bkey_s_c k) -{ - if (!bkey_extent_is_direct_data(k.k)) - return 0; - - const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); - - if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts); - return old == NULL || memcmp(old, &new, sizeof(new)); - } else { - return old != NULL; - } -} - -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, - struct bkey_i *_k) -{ - if (!bkey_extent_is_direct_data(&_k->k)) - return 0; - - struct bkey_s k = bkey_i_to_s(_k); - struct bch_extent_rebalance *old = - (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - - if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) { - if (!old) { - old = bkey_val_end(k); - k.k->u64s += sizeof(*old) / sizeof(u64); - } - - *old = io_opts_to_rebalance_opts(c, opts); - } else { - if (old) - extent_entry_drop(k, (union bch_extent_entry *) old); - } - - return 0; -} - -int bch2_get_update_rebalance_opts(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct btree_iter *iter, - struct bkey_s_c k) -{ - BUG_ON(iter->flags & BTREE_ITER_is_extents); - BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); - - const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v - ? 
bch2_bkey_rebalance_opts(k) : NULL; - if (r) { -#define x(_name) \ - if (r->_name##_from_inode) { \ - io_opts->_name = r->_name; \ - io_opts->_name##_from_inode = true; \ - } - BCH_REBALANCE_OPTS() -#undef x - } - - if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k)) - return 0; - - struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - bkey_reassemble(n, k); - - /* On successful transaction commit, @k was invalidated: */ - - return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; -} - -#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) - -static const char * const bch2_rebalance_state_strs[] = { -#define x(t) #t, - BCH_REBALANCE_STATES() - NULL -#undef x -}; - -int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_i_cookie *cookie; - u64 v; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, - SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - v = k.k->type == KEY_TYPE_cookie - ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) - : 0; - - cookie = bch2_trans_kmalloc(trans, sizeof(*cookie)); - ret = PTR_ERR_OR_ZERO(cookie); - if (ret) - goto err; - - bkey_cookie_init(&cookie->k_i); - cookie->k.p = iter.pos; - cookie->v.cookie = cpu_to_le64(v + 1); - - ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) -{ - int ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_set_rebalance_needs_scan_trans(trans, inum)); - bch2_rebalance_wakeup(c); - return ret; -} - -int bch2_set_fs_needs_rebalance(struct bch_fs *c) -{ - return bch2_set_rebalance_needs_scan(c, 0); -} - -static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 v; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, - SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - v = k.k->type == KEY_TYPE_cookie - ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) - : 0; - - if (v == cookie) - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, - struct btree_iter *work_iter) -{ - return !kthread_should_stop() - ? 
bch2_btree_iter_peek(trans, work_iter) - : bkey_s_c_null; -} - -static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k)) - return 0; - - struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - extent_entry_drop(bkey_i_to_s(n), - (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -} - -static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, - struct bpos work_pos, - struct btree_iter *extent_iter, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct bch_fs *c = trans->c; - - bch2_trans_iter_exit(trans, extent_iter); - bch2_trans_iter_init(trans, extent_iter, - work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, - work_pos, - BTREE_ITER_all_snapshots); - struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter); - if (bkey_err(k)) - return k; - - int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k); - if (ret) - return bkey_s_c_err(ret); - - memset(data_opts, 0, sizeof(*data_opts)); - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); - data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_only_specified_devs; - - if (!data_opts->rewrite_ptrs) { - /* - * device we would want to write to offline? devices in target - * changed? - * - * We'll now need a full scan before this extent is picked up - * again: - */ - int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k); - if (ret) - return bkey_s_c_err(ret); - return bkey_s_c_null; - } - - if (trace_rebalance_extent_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs); - if (p) { - prt_str(&buf, "compression="); - bch2_compression_opt_to_text(&buf, io_opts->background_compression); - prt_str(&buf, " "); - bch2_prt_u64_base2(&buf, p); - prt_newline(&buf); - } - - p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs); - if (p) { - prt_str(&buf, "move="); - bch2_target_to_text(&buf, c, io_opts->background_target); - prt_str(&buf, " "); - bch2_prt_u64_base2(&buf, p); - prt_newline(&buf); - } - - trace_rebalance_extent(c, buf.buf); - printbuf_exit(&buf); - } - - return k; -} - -noinline_for_stack -static int do_rebalance_extent(struct moving_context *ctxt, - struct bpos work_pos, - struct btree_iter *extent_iter) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct bch_fs_rebalance *r = &trans->c->rebalance; - struct data_update_opts data_opts; - struct bch_io_opts io_opts; - struct bkey_s_c k; - struct bkey_buf sk; - int ret; - - ctxt->stats = &r->work_stats; - r->state = BCH_REBALANCE_working; - - bch2_bkey_buf_init(&sk); - - ret = bkey_err(k = next_rebalance_extent(trans, work_pos, - extent_iter, &io_opts, &data_opts)); - if (ret || !k.k) - goto out; - - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); - - /* - * The iterator gets unlocked by __bch2_read_extent - need to - * save a copy of @k elsewhere: - */ - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts); - if (ret) { - if (bch2_err_matches(ret, ENOMEM)) { - /* memory allocation failure, wait for some IO to 
finish */ - bch2_move_ctxt_wait_for_io(ctxt); - ret = bch_err_throw(c, transaction_restart_nested); - } - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto out; - - /* skip it and continue, XXX signal failure */ - ret = 0; - } -out: - bch2_bkey_buf_exit(&sk, c); - return ret; -} - -static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct bch_fs_rebalance *r = &trans->c->rebalance; - - bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); - ctxt->stats = &r->scan_stats; - - if (!inum) { - r->scan_start = BBPOS_MIN; - r->scan_end = BBPOS_MAX; - } else { - r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0)); - r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX)); - } - - r->state = BCH_REBALANCE_scanning; - - struct per_snapshot_io_opts snapshot_io_opts; - per_snapshot_io_opts_init(&snapshot_io_opts, c); - - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, - r->scan_start.pos, r->scan_end.pos, - BTREE_ITER_all_snapshots| - BTREE_ITER_not_extents| - BTREE_ITER_prefetch, k, ({ - ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); - - struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, - &snapshot_io_opts, iter.pos, &iter, k); - PTR_ERR_OR_ZERO(io_opts); - })) ?: - commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_clear_rebalance_needs_scan(trans, inum, cookie)); - - per_snapshot_io_opts_exit(&snapshot_io_opts); - bch2_move_stats_exit(&r->scan_stats, trans->c); - - /* - * Ensure that the rebalance_work entries we created are seen by the - * next iteration of do_rebalance(), so we don't end up stuck in - * rebalance_wait(): - */ - atomic64_inc(&r->scan_stats.sectors_seen); - bch2_btree_write_buffer_flush_sync(trans); - - return ret; -} - -static void rebalance_wait(struct bch_fs *c) -{ - struct bch_fs_rebalance *r = &c->rebalance; - struct io_clock *clock = &c->io_clock[WRITE]; - u64 now = atomic64_read(&clock->now); - u64 min_member_capacity = bch2_min_rw_member_capacity(c); - - if (min_member_capacity == U64_MAX) - min_member_capacity = 128 * 2048; - - r->wait_iotime_end = now + (min_member_capacity >> 6); - - if (r->state != BCH_REBALANCE_waiting) { - r->wait_iotime_start = now; - r->wait_wallclock_start = ktime_get_real_ns(); - r->state = BCH_REBALANCE_waiting; - } - - bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); -} - -static bool bch2_rebalance_enabled(struct bch_fs *c) -{ - return c->opts.rebalance_enabled && - !(c->opts.rebalance_on_ac_only && - c->rebalance.on_battery); -} - -static int do_rebalance(struct moving_context *ctxt) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct bch_fs_rebalance *r = &c->rebalance; - struct btree_iter rebalance_work_iter, extent_iter = {}; - struct bkey_s_c k; - u32 kick = r->kick; - int ret = 0; - - bch2_trans_begin(trans); - - bch2_move_stats_init(&r->work_stats, "rebalance_work"); - bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); - - bch2_trans_iter_init(trans, &rebalance_work_iter, - BTREE_ID_rebalance_work, POS_MIN, - BTREE_ITER_all_snapshots); - - while (!bch2_move_ratelimit(ctxt)) { - if (!bch2_rebalance_enabled(c)) { - bch2_moving_ctxt_flush_all(ctxt); - kthread_wait_freezable(bch2_rebalance_enabled(c) || - kthread_should_stop()); - } - - if (kthread_should_stop()) - break; - - bch2_trans_begin(trans); - - ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter)); - if (bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) - continue; - if (ret || !k.k) - break; - - ret = k.k->type == KEY_TYPE_cookie - ? do_rebalance_scan(ctxt, k.k->p.inode, - le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)) - : do_rebalance_extent(ctxt, k.k->p, &extent_iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - bch2_btree_iter_advance(trans, &rebalance_work_iter); - } - - bch2_trans_iter_exit(trans, &extent_iter); - bch2_trans_iter_exit(trans, &rebalance_work_iter); - bch2_move_stats_exit(&r->scan_stats, c); - - if (!ret && - !kthread_should_stop() && - !atomic64_read(&r->work_stats.sectors_seen) && - !atomic64_read(&r->scan_stats.sectors_seen) && - kick == r->kick) { - bch2_moving_ctxt_flush_all(ctxt); - bch2_trans_unlock_long(trans); - rebalance_wait(c); - } - - if (!bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); - return ret; -} - -static int bch2_rebalance_thread(void *arg) -{ - struct bch_fs *c = arg; - struct bch_fs_rebalance *r = &c->rebalance; - struct moving_context ctxt; - - set_freezable(); - - /* - * Data move operations can't run until after check_snapshots has - * completed, and bch2_snapshot_is_ancestor() is available. - */ - kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || - kthread_should_stop()); - - bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, - writepoint_ptr(&c->rebalance_write_point), - true); - - while (!kthread_should_stop() && !do_rebalance(&ctxt)) - ; - - bch2_moving_ctxt_exit(&ctxt); - - return 0; -} - -void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) -{ - printbuf_tabstop_push(out, 32); - - struct bch_fs_rebalance *r = &c->rebalance; - - /* print pending work */ - struct disk_accounting_pos acc; - disk_accounting_key_init(acc, rebalance_work); - u64 v; - bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); - - prt_printf(out, "pending work:\t"); - prt_human_readable_u64(out, v << 9); - prt_printf(out, "\n\n"); - - prt_str(out, bch2_rebalance_state_strs[r->state]); - prt_newline(out); - printbuf_indent_add(out, 2); - - switch (r->state) { - case BCH_REBALANCE_waiting: { - u64 now = atomic64_read(&c->io_clock[WRITE].now); - - prt_printf(out, "io wait duration:\t"); - bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); - prt_newline(out); - - prt_printf(out, "io wait remaining:\t"); - bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); - prt_newline(out); - - prt_printf(out, "duration waited:\t"); - bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); - prt_newline(out); - break; - } - case BCH_REBALANCE_working: - bch2_move_stats_to_text(out, &r->work_stats); - break; - case BCH_REBALANCE_scanning: - bch2_move_stats_to_text(out, &r->scan_stats); - break; - } - prt_newline(out); - - struct task_struct *t; - scoped_guard(rcu) { - t = rcu_dereference(c->rebalance.thread); - if (t) - get_task_struct(t); - } - - if (t) { - bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); - put_task_struct(t); - } - - printbuf_indent_sub(out, 2); -} - -void bch2_rebalance_stop(struct bch_fs *c) -{ - struct task_struct *p; - - c->rebalance.pd.rate.rate = UINT_MAX; - bch2_ratelimit_reset(&c->rebalance.pd.rate); - - p = rcu_dereference_protected(c->rebalance.thread, 1); - c->rebalance.thread = NULL; - - if (p) { - /* for synchronizing with bch2_rebalance_wakeup() */ - synchronize_rcu(); - - kthread_stop(p); - put_task_struct(p); - } -} - -int bch2_rebalance_start(struct bch_fs *c) -{ - struct 
task_struct *p; - int ret; - - if (c->rebalance.thread) - return 0; - - if (c->opts.nochanges) - return 0; - - p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); - ret = PTR_ERR_OR_ZERO(p); - bch_err_msg(c, ret, "creating rebalance thread"); - if (ret) - return ret; - - get_task_struct(p); - rcu_assign_pointer(c->rebalance.thread, p); - wake_up_process(p); - return 0; -} - -#ifdef CONFIG_POWER_SUPPLY -#include <linux/power_supply.h> - -static int bch2_rebalance_power_notifier(struct notifier_block *nb, - unsigned long event, void *data) -{ - struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier); - - c->rebalance.on_battery = !power_supply_is_system_supplied(); - bch2_rebalance_wakeup(c); - return NOTIFY_OK; -} -#endif - -void bch2_fs_rebalance_exit(struct bch_fs *c) -{ -#ifdef CONFIG_POWER_SUPPLY - power_supply_unreg_notifier(&c->rebalance.power_notifier); -#endif -} - -int bch2_fs_rebalance_init(struct bch_fs *c) -{ - struct bch_fs_rebalance *r = &c->rebalance; - - bch2_pd_controller_init(&r->pd); - -#ifdef CONFIG_POWER_SUPPLY - r->power_notifier.notifier_call = bch2_rebalance_power_notifier; - int ret = power_supply_reg_notifier(&r->power_notifier); - if (ret) - return ret; - - r->on_battery = !power_supply_is_system_supplied(); -#endif - return 0; -} - -static int check_rebalance_work_one(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct btree_iter *rebalance_iter, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c extent_k, rebalance_k; - struct printbuf buf = PRINTBUF; - - int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?: - bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter)); - if (ret) - return ret; - - if (!extent_k.k && - extent_iter->btree_id == BTREE_ID_reflink && - (!rebalance_k.k || - rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) { - bch2_trans_iter_exit(trans, extent_iter); - bch2_trans_iter_init(trans, extent_iter, - BTREE_ID_extents, POS_MIN, - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots); - return bch_err_throw(c, transaction_restart_nested); - } - - if (!extent_k.k && !rebalance_k.k) - return 1; - - int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX, - rebalance_k.k ? 
rebalance_k.k->p : SPOS_MAX); - - struct bkey deleted; - bkey_init(&deleted); - - if (cmp < 0) { - deleted.p = extent_k.k->p; - rebalance_k.k = &deleted; - } else if (cmp > 0) { - deleted.p = rebalance_k.k->p; - extent_k.k = &deleted; - } - - bool should_have_rebalance = - bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; - bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; - - if (should_have_rebalance != have_rebalance) { - ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed); - if (ret) - return ret; - - bch2_bkey_val_to_text(&buf, c, extent_k); - } - - if (fsck_err_on(!should_have_rebalance && have_rebalance, - trans, rebalance_work_incorrectly_set, - "rebalance work incorrectly set\n%s", buf.buf)) { - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - extent_k.k->p, false); - if (ret) - goto err; - } - - if (fsck_err_on(should_have_rebalance && !have_rebalance, - trans, rebalance_work_incorrectly_unset, - "rebalance work incorrectly unset\n%s", buf.buf)) { - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - extent_k.k->p, true); - if (ret) - goto err; - } - - if (cmp <= 0) - bch2_btree_iter_advance(trans, extent_iter); - if (cmp >= 0) - bch2_btree_iter_advance(trans, rebalance_iter); -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_check_rebalance_work(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter rebalance_iter, extent_iter; - int ret = 0; - - bch2_trans_iter_init(trans, &extent_iter, - BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch); - bch2_trans_iter_init(trans, &rebalance_iter, - BTREE_ID_rebalance_work, POS_MIN, - BTREE_ITER_prefetch); - - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - while (!ret) { - bch2_trans_begin(trans); - - ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - } - - bch2_bkey_buf_exit(&last_flushed, c); - bch2_trans_iter_exit(trans, &extent_iter); - bch2_trans_iter_exit(trans, &rebalance_iter); - bch2_trans_put(trans); - return ret < 0 ? 
ret : 0; -} diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h deleted file mode 100644 index 7a565ea7dbfc..000000000000 --- a/fs/bcachefs/rebalance.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REBALANCE_H -#define _BCACHEFS_REBALANCE_H - -#include "compress.h" -#include "disk_groups.h" -#include "opts.h" -#include "rebalance_types.h" - -static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, - struct bch_io_opts *opts) -{ - struct bch_extent_rebalance r = { - .type = BIT(BCH_EXTENT_ENTRY_rebalance), -#define x(_name) \ - ._name = opts->_name, \ - ._name##_from_inode = opts->_name##_from_inode, - BCH_REBALANCE_OPTS() -#undef x - }; - - if (r.background_target && - !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) - r.background_target = 0; - - return r; -}; - -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); -int bch2_get_update_rebalance_opts(struct btree_trans *, - struct bch_io_opts *, - struct btree_iter *, - struct bkey_s_c); - -int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); -int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); -int bch2_set_fs_needs_rebalance(struct bch_fs *); - -static inline void bch2_rebalance_wakeup(struct bch_fs *c) -{ - c->rebalance.kick++; - guard(rcu)(); - struct task_struct *p = rcu_dereference(c->rebalance.thread); - if (p) - wake_up_process(p); -} - -void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); - -void bch2_rebalance_stop(struct bch_fs *); -int bch2_rebalance_start(struct bch_fs *); - -void bch2_fs_rebalance_exit(struct bch_fs *); -int bch2_fs_rebalance_init(struct bch_fs *); - -int bch2_check_rebalance_work(struct bch_fs *); - -#endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h deleted file mode 100644 index ff9a1342a22b..000000000000 --- a/fs/bcachefs/rebalance_format.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REBALANCE_FORMAT_H -#define _BCACHEFS_REBALANCE_FORMAT_H - -struct bch_extent_rebalance { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:3, - - promote_target_from_inode:1, - erasure_code_from_inode:1, - data_checksum_from_inode:1, - background_compression_from_inode:1, - data_replicas_from_inode:1, - background_target_from_inode:1, - - promote_target:16, - erasure_code:1, - data_checksum:4, - data_replicas:4, - background_compression:8, /* enum bch_compression_opt */ - background_target:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 background_target:16, - background_compression:8, - data_replicas:4, - data_checksum:4, - erasure_code:1, - promote_target:16, - - background_target_from_inode:1, - data_replicas_from_inode:1, - background_compression_from_inode:1, - data_checksum_from_inode:1, - erasure_code_from_inode:1, - promote_target_from_inode:1, - - unused:3, - type:6; -#endif -}; - -/* subset of BCH_INODE_OPTS */ -#define BCH_REBALANCE_OPTS() \ - x(data_checksum) \ - x(background_compression) \ - x(data_replicas) \ - x(promote_target) \ - x(background_target) \ - x(erasure_code) - -#endif /* _BCACHEFS_REBALANCE_FORMAT_H */ - diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h deleted file mode 100644 index c659da149fa3..000000000000 --- a/fs/bcachefs/rebalance_types.h +++ /dev/null @@ -1,41 +0,0 @@ -/* 
SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REBALANCE_TYPES_H -#define _BCACHEFS_REBALANCE_TYPES_H - -#include "bbpos_types.h" -#include "move_types.h" - -#define BCH_REBALANCE_STATES() \ - x(waiting) \ - x(working) \ - x(scanning) - -enum bch_rebalance_states { -#define x(t) BCH_REBALANCE_##t, - BCH_REBALANCE_STATES() -#undef x -}; - -struct bch_fs_rebalance { - struct task_struct __rcu *thread; - u32 kick; - struct bch_pd_controller pd; - - enum bch_rebalance_states state; - u64 wait_iotime_start; - u64 wait_iotime_end; - u64 wait_wallclock_start; - - struct bch_move_stats work_stats; - - struct bbpos scan_start; - struct bbpos scan_end; - struct bch_move_stats scan_stats; - - bool on_battery; -#ifdef CONFIG_POWER_SUPPLY - struct notifier_block power_notifier; -#endif -}; - -#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c deleted file mode 100644 index c94debb12d2f..000000000000 --- a/fs/bcachefs/recovery.c +++ /dev/null @@ -1,1306 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "bkey_buf.h" -#include "btree_journal_iter.h" -#include "btree_node_scan.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_io.h" -#include "buckets.h" -#include "dirent.h" -#include "disk_accounting.h" -#include "errcode.h" -#include "error.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "journal_seq_blacklist.h" -#include "logged_ops.h" -#include "move.h" -#include "movinggc.h" -#include "namei.h" -#include "quota.h" -#include "rebalance.h" -#include "recovery.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-clean.h" -#include "sb-downgrade.h" -#include "snapshot.h" -#include "super-io.h" - -#include <linux/sort.h> -#include <linux/stat.h> - -int bch2_btree_lost_data(struct bch_fs *c, - struct printbuf *msg, - enum btree_id btree) -{ - u64 b = BIT_ULL(btree); - int ret = 0; - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - if (!(c->sb.btrees_lost_data & b)) { - prt_printf(msg, "flagging btree "); - bch2_btree_id_to_text(msg, btree); - prt_printf(msg, " lost data\n"); - - ext->btrees_lost_data |= cpu_to_le64(b); - } - - /* Once we have runtime self healing for topology errors we won't need this: */ - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; - - /* Btree node accounting will be off: */ - __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; - -#ifdef CONFIG_BCACHEFS_DEBUG - /* - * These are much more minor, and don't need to be corrected right away, - * but in debug mode we want the next fsck run to be clean: - */ - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret; -#endif - - switch (btree) { - case BTREE_ID_alloc: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - - __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); - 
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); - goto out; - case BTREE_ID_backpointers: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret; - goto out; - case BTREE_ID_need_discard: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_freespace: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_bucket_gens: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_lru: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_accounting: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; - goto out; - case BTREE_ID_snapshots: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; - goto out; - default: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; - goto out; - } -out: - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; -} - -static void kill_btree(struct bch_fs *c, enum btree_id btree) -{ - bch2_btree_id_root(c, btree)->alive = false; - bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); -} - -/* for -o reconstruct_alloc: */ -void bch2_reconstruct_alloc(struct bch_fs *c) -{ - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required); - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required); - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required); - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required); - - __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent); - - __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent); - - __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent); - - __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent); - - __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); - 
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - - c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - - c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info)); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - for (unsigned i = 0; i < btree_id_nr_alive(c); i++) - if (btree_id_is_alloc(i)) - kill_btree(c, i); -} - -/* - * Btree node pointers have a field to stack a pointer to the in memory btree - * node; we need to zero out this field when reading in btree nodes, or when - * reading in keys from the journal: - */ -static void zero_out_btree_mem_ptr(struct journal_keys *keys) -{ - darray_for_each(*keys, i) - if (i->k->k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; -} - -/* journal replay: */ - -static void replay_now_at(struct journal *j, u64 seq) -{ - BUG_ON(seq < j->replay_journal_seq); - - seq = min(seq, j->replay_journal_seq_end); - - while (j->replay_journal_seq < seq) - bch2_journal_pin_put(j, j->replay_journal_seq++); -} - -static int bch2_journal_replay_accounting_key(struct btree_trans *trans, - struct journal_key *k) -{ - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, - BTREE_MAX_DEPTH, k->level, - BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(trans, &iter); - if (ret) - goto out; - - struct bkey u; - struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u); - - /* Has this delta already been applied to the btree? */ - if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) { - ret = 0; - goto out; - } - - struct bkey_i *new = k->k; - if (old.k->type == KEY_TYPE_accounting) { - new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k)); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - bch2_accounting_accumulate(bkey_i_to_accounting(new), - bkey_s_c_to_accounting(old)); - } - - trans->journal_res.seq = k->journal_seq; - - ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_journal_replay_key(struct btree_trans *trans, - struct journal_key *k) -{ - struct btree_iter iter; - unsigned iter_flags = - BTREE_ITER_intent| - BTREE_ITER_not_extents; - unsigned update_flags = BTREE_TRIGGER_norun; - int ret; - - if (k->overwritten) - return 0; - - trans->journal_res.seq = k->journal_seq; - - /* - * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to - * keep the key cache coherent with the underlying btree. 
Nothing - * besides the allocator is doing updates yet so we don't need key cache - * coherency for non-alloc btrees, and key cache fills for snapshots - * btrees use BTREE_ITER_filter_snapshots, which isn't available until - * the snapshots recovery pass runs. - */ - if (!k->level && k->btree_id == BTREE_ID_alloc) - iter_flags |= BTREE_ITER_cached; - else - update_flags |= BTREE_UPDATE_key_cache_reclaim; - - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, - BTREE_MAX_DEPTH, k->level, - iter_flags); - ret = bch2_btree_iter_traverse(trans, &iter); - if (ret) - goto out; - - struct btree_path *path = btree_iter_path(trans, &iter); - if (unlikely(!btree_path_node(path, k->level))) { - struct bch_fs *c = trans->c; - - CLASS(printbuf, buf)(); - prt_str(&buf, "btree="); - bch2_btree_id_to_text(&buf, k->btree_id); - prt_printf(&buf, " level=%u ", k->level); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k)); - - if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)| - BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) { - bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s", - buf.buf); - ret = -EINVAL; - } - - if (!k->allocated) { - bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s", - buf.buf); - k->overwritten = true; - goto out; - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, - BTREE_MAX_DEPTH, 0, iter_flags); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_increase_depth(trans, iter.path, 0) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } - - /* Must be checked with btree locked: */ - if (k->overwritten) - goto out; - - if (k->k->k.type == KEY_TYPE_accounting) { - struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto out; - - bkey_copy(n, k->k); - goto out; - } - - ret = bch2_trans_update(trans, &iter, k->k, update_flags); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int journal_sort_seq_cmp(const void *_l, const void *_r) -{ - const struct journal_key *l = *((const struct journal_key **)_l); - const struct journal_key *r = *((const struct journal_key **)_r); - - /* - * Map 0 to U64_MAX, so that keys with journal_seq == 0 come last - * - * journal_seq == 0 means that the key comes from early repair, and - * should be inserted last so as to avoid overflowing the journal - */ - return cmp_int(l->journal_seq - 1, r->journal_seq - 1); -} - -int bch2_journal_replay(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - DARRAY(struct journal_key *) keys_sorted = { 0 }; - struct journal *j = &c->journal; - u64 start_seq = c->journal_replay_seq_start; - u64 end_seq = c->journal_replay_seq_end; - struct btree_trans *trans = NULL; - bool immediate_flush = false; - int ret = 0; - - if (keys->nr) { - ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", - keys->nr, start_seq, end_seq); - if (ret) - goto err; - } - - BUG_ON(!atomic_read(&keys->ref)); - - move_gap(keys, keys->nr); - trans = bch2_trans_get(c); - - /* - * Replay accounting keys first: we can't allow the write buffer to - * flush accounting keys until we're done - */ - darray_for_each(*keys, k) { - if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated)) - continue; - - cond_resched(); - - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| -
BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_skip_accounting_apply| - BCH_TRANS_COMMIT_no_journal_res| - BCH_WATERMARK_reclaim, - bch2_journal_replay_accounting_key(trans, k)); - if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) - goto err; - - k->overwritten = true; - } - - set_bit(BCH_FS_accounting_replay_done, &c->flags); - - /* - * First, attempt to replay keys in sorted order. This is more - * efficient - better locality of btree access - but some might fail if - * that would cause a journal deadlock. - */ - darray_for_each(*keys, k) { - cond_resched(); - - /* - * k->allocated means the key wasn't read in from the journal, - * rather it was from early repair code - */ - if (k->allocated) - immediate_flush = true; - - /* Skip fastpath if we're low on space in the journal */ - ret = c->journal.watermark ? -1 : - commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_skip_accounting_apply| - (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), - bch2_journal_replay_key(trans, k)); - BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting); - if (ret) { - ret = darray_push(&keys_sorted, k); - if (ret) - goto err; - } - } - - bch2_trans_unlock_long(trans); - /* - * Now, replay any remaining keys in the order in which they appear in - * the journal, unpinning those journal entries as we go: - */ - sort_nonatomic(keys_sorted.data, keys_sorted.nr, - sizeof(keys_sorted.data[0]), - journal_sort_seq_cmp, NULL); - - darray_for_each(keys_sorted, kp) { - cond_resched(); - - struct journal_key *k = *kp; - - if (k->journal_seq) - replay_now_at(j, k->journal_seq); - else - replay_now_at(j, j->replay_journal_seq_end); - - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_skip_accounting_apply| - (!k->allocated - ? 
BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim - : 0), - bch2_journal_replay_key(trans, k)); - if (ret) { - struct printbuf buf = PRINTBUF; - bch2_btree_id_level_to_text(&buf, k->btree_id, k->level); - bch_err_msg(c, ret, "while replaying key at %s:", buf.buf); - printbuf_exit(&buf); - goto err; - } - - BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); - } - - /* - * We need to put our btree_trans before calling flush_all_pins(), since - * that will use a btree_trans internally - */ - bch2_trans_put(trans); - trans = NULL; - - if (!c->opts.retain_recovery_info && - c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) - bch2_journal_keys_put_initial(c); - - replay_now_at(j, j->replay_journal_seq_end); - j->replay_journal_seq = 0; - - bch2_journal_set_replay_done(j); - - /* if we did any repair, flush it immediately */ - if (immediate_flush) { - bch2_journal_flush_all_pins(&c->journal); - ret = bch2_journal_meta(&c->journal); - } - - if (keys->nr) - bch2_journal_log_msg(c, "journal replay finished"); -err: - if (trans) - bch2_trans_put(trans); - darray_exit(&keys_sorted); - bch_err_fn(c, ret); - return ret; -} - -/* journal replay early: */ - -static int journal_replay_entry_early(struct bch_fs *c, - struct jset_entry *entry) -{ - int ret = 0; - - switch (entry->type) { - case BCH_JSET_ENTRY_btree_root: { - - if (unlikely(!entry->u64s)) - return 0; - - if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, - c, invalid_btree_id, - "invalid btree id %u (max %u)", - entry->btree_id, BTREE_ID_NR_MAX)) - return 0; - - while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { - ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); - if (ret) - return ret; - } - - struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); - - r->level = entry->level; - bkey_copy(&r->key, (struct bkey_i *) entry->start); - r->error = 0; - r->alive = true; - break; - } - case BCH_JSET_ENTRY_usage: { - struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); - - switch (entry->btree_id) { - case BCH_FS_USAGE_key_version: - atomic64_set(&c->key_version, le64_to_cpu(u->v)); - break; - } - break; - } - case BCH_JSET_ENTRY_blacklist: { - struct jset_entry_blacklist *bl_entry = - container_of(entry, struct jset_entry_blacklist, entry); - - ret = bch2_journal_seq_blacklist_add(c, - le64_to_cpu(bl_entry->seq), - le64_to_cpu(bl_entry->seq) + 1); - break; - } - case BCH_JSET_ENTRY_blacklist_v2: { - struct jset_entry_blacklist_v2 *bl_entry = - container_of(entry, struct jset_entry_blacklist_v2, entry); - - ret = bch2_journal_seq_blacklist_add(c, - le64_to_cpu(bl_entry->start), - le64_to_cpu(bl_entry->end) + 1); - break; - } - case BCH_JSET_ENTRY_clock: { - struct jset_entry_clock *clock = - container_of(entry, struct jset_entry_clock, entry); - - atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); - } - } -fsck_err: - return ret; -} - -static int journal_replay_early(struct bch_fs *c, - struct bch_sb_field_clean *clean) -{ - if (clean) { - for (struct jset_entry *entry = clean->start; - entry != vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - int ret = journal_replay_entry_early(c, entry); - if (ret) - return ret; - } - } else { - struct genradix_iter iter; - struct journal_replay *i, **_i; - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - vstruct_for_each(&i->j, entry) { - int ret = journal_replay_entry_early(c, entry); - if (ret) - return ret; - } - 
} - } - - return 0; -} - -/* sb clean section: */ - -static int read_btree_roots(struct bch_fs *c) -{ - struct printbuf buf = PRINTBUF; - int ret = 0; - - for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (!r->alive) - continue; - - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, i, r->level); - - if (mustfix_fsck_err_on((ret = r->error), - c, btree_root_bkey_invalid, - "invalid btree root %s", - buf.buf) || - mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), - c, btree_root_read_error, - "error reading btree root %s: %s", - buf.buf, bch2_err_str(ret))) { - if (btree_id_is_alloc(i)) - r->error = 0; - ret = 0; - } - } - - for (unsigned i = 0; i < BTREE_ID_NR; i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (!r->b && !r->error) { - r->alive = false; - r->level = 0; - bch2_btree_root_alloc_fake(c, i, 0); - } - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static bool check_version_upgrade(struct bch_fs *c) -{ - unsigned latest_version = bcachefs_metadata_version_current; - unsigned latest_compatible = min(latest_version, - bch2_latest_compatible_version(c->sb.version)); - unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; - unsigned new_version = 0; - bool ret = false; - - if (old_version < bcachefs_metadata_required_upgrade_below) { - if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || - latest_compatible < bcachefs_metadata_required_upgrade_below) - new_version = latest_version; - else - new_version = latest_compatible; - } else { - switch (c->opts.version_upgrade) { - case BCH_VERSION_UPGRADE_compatible: - new_version = latest_compatible; - break; - case BCH_VERSION_UPGRADE_incompatible: - new_version = latest_version; - break; - case BCH_VERSION_UPGRADE_none: - new_version = min(old_version, latest_version); - break; - } - } - - if (new_version > old_version) { - struct printbuf buf = PRINTBUF; - - if (old_version < bcachefs_metadata_required_upgrade_below) - prt_str(&buf, "Version upgrade required:\n"); - - if (old_version != c->sb.version) { - prt_str(&buf, "Version upgrade from "); - bch2_version_to_text(&buf, c->sb.version_upgrade_complete); - prt_str(&buf, " to "); - bch2_version_to_text(&buf, c->sb.version); - prt_str(&buf, " incomplete\n"); - } - - prt_printf(&buf, "Doing %s version upgrade from ", - BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version) - ? 
"incompatible" : "compatible"); - bch2_version_to_text(&buf, old_version); - prt_str(&buf, " to "); - bch2_version_to_text(&buf, new_version); - prt_newline(&buf); - - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __le64 passes = ext->recovery_passes_required[0]; - bch2_sb_set_upgrade(c, old_version, new_version); - passes = ext->recovery_passes_required[0] & ~passes; - - if (passes) { - prt_str(&buf, " running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(passes))); - } - - bch_notice(c, "%s", buf.buf); - printbuf_exit(&buf); - - ret = true; - } - - if (new_version > c->sb.version_incompat_allowed && - c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Now allowing incompatible features up to "); - bch2_version_to_text(&buf, new_version); - prt_str(&buf, ", previously allowed up to "); - bch2_version_to_text(&buf, c->sb.version_incompat_allowed); - prt_newline(&buf); - - bch_notice(c, "%s", buf.buf); - printbuf_exit(&buf); - - ret = true; - } - - if (ret) - bch2_sb_upgrade(c, new_version, - c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible); - - return ret; -} - -int bch2_fs_recovery(struct bch_fs *c) -{ - struct bch_sb_field_clean *clean = NULL; - struct jset *last_journal_entry = NULL; - u64 last_seq = 0, blacklist_seq, journal_seq; - int ret = 0; - - if (c->sb.clean) { - clean = bch2_read_superblock_clean(c); - ret = PTR_ERR_OR_ZERO(clean); - if (ret) - goto err; - - bch_info(c, "recovering from clean shutdown, journal seq %llu", - le64_to_cpu(clean->journal_seq)); - } else { - bch_info(c, "recovering from unclean shutdown"); - } - - if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { - bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); - ret = -EINVAL; - goto err; - } - - if (!c->sb.clean && - !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { - bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); - ret = -EINVAL; - goto err; - } - - if (c->opts.norecovery) { - c->opts.recovery_pass_last = c->opts.recovery_pass_last - ? 
min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) - : BCH_RECOVERY_PASS_snapshots_read; - c->opts.nochanges = true; - } - - if (c->opts.nochanges) - c->opts.read_only = true; - - if (c->opts.journal_rewind) { - bch_info(c, "rewinding journal, fsck required"); - c->opts.fsck = true; - } - - if (go_rw_in_recovery(c)) { - /* - * start workqueues/kworkers early - kthread creation checks for - * pending signals, which is _very_ annoying - */ - ret = bch2_fs_init_rw(c); - if (ret) - goto err; - } - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - bool write_sb = false; - - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { - ext->recovery_passes_required[0] |= - cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); - write_sb = true; - } - - u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - if (sb_passes) { - struct printbuf buf = PRINTBUF; - prt_str(&buf, "superblock requires following recovery passes to be run:\n "); - prt_bitflags(&buf, bch2_recovery_passes, sb_passes); - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - if (bch2_check_version_downgrade(c)) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Version downgrade required:"); - - __le64 passes = ext->recovery_passes_required[0]; - bch2_sb_set_downgrade(c, - BCH_VERSION_MINOR(bcachefs_metadata_version_current), - BCH_VERSION_MINOR(c->sb.version)); - passes = ext->recovery_passes_required[0] & ~passes; - if (passes) { - prt_str(&buf, "\n running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(passes))); - } - - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - write_sb = true; - } - - if (check_version_upgrade(c)) - write_sb = true; - - c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - - if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) { - SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe); - write_sb = true; - } - - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (c->sb.clean) - set_bit(BCH_FS_clean_recovery, &c->flags); - if (c->opts.fsck) - set_bit(BCH_FS_in_fsck, &c->flags); - set_bit(BCH_FS_in_recovery, &c->flags); - - ret = bch2_blacklist_table_initialize(c); - if (ret) { - bch_err(c, "error initializing blacklist table"); - goto err; - } - - bch2_journal_pos_from_member_info_resume(c); - - if (!c->sb.clean || c->opts.retain_recovery_info) { - struct genradix_iter iter; - struct journal_replay **i; - - bch_verbose(c, "starting journal read"); - ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq); - if (ret) - goto err; - - /* - * note: cmd_list_journal needs the blacklist table fully up to date so - * it can asterisk ignored journal entries: - */ - if (c->opts.read_journal_only) - goto out; - - genradix_for_each_reverse(&c->journal_entries, iter, i) - if (!journal_replay_ignore(*i)) { - last_journal_entry = &(*i)->j; - break; - } - - if (mustfix_fsck_err_on(c->sb.clean && - last_journal_entry && - !journal_entry_empty(last_journal_entry), c, - clean_but_journal_not_empty, - "filesystem marked clean but journal not empty")) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = false; - } - - if (!last_journal_entry) { - fsck_err_on(!c->sb.clean, c, - dirty_but_no_journal_entries, - "no journal entries 
found"); - if (clean) - goto use_clean; - - genradix_for_each_reverse(&c->journal_entries, iter, i) - if (*i) { - last_journal_entry = &(*i)->j; - (*i)->ignore_blacklisted = false; - (*i)->ignore_not_dirty= false; - /* - * This was probably a NO_FLUSH entry, - * so last_seq was garbage - but we know - * we're only using a single journal - * entry, set it here: - */ - (*i)->j.last_seq = (*i)->j.seq; - break; - } - } - - ret = bch2_journal_keys_sort(c); - if (ret) - goto err; - - if (c->sb.clean && last_journal_entry) { - ret = bch2_verify_superblock_clean(c, &clean, - last_journal_entry); - if (ret) - goto err; - } - } else { -use_clean: - if (!clean) { - bch_err(c, "no superblock clean section found"); - ret = bch_err_throw(c, fsck_repair_impossible); - goto err; - - } - blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; - } - - c->journal_replay_seq_start = last_seq; - c->journal_replay_seq_end = blacklist_seq - 1; - - zero_out_btree_mem_ptr(&c->journal_keys); - - ret = journal_replay_early(c, clean); - if (ret) - goto err; - - ret = bch2_fs_resize_on_mount(c); - if (ret) { - up_write(&c->state_lock); - goto err; - } - - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { - bch_info(c, "filesystem is an unresized image file, mounting ro"); - c->opts.read_only = true; - } - - if (!c->opts.read_only && - (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) { - bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); - - bch2_reconstruct_alloc(c); - } else if (c->opts.reconstruct_alloc) { - bch2_journal_log_msg(c, "dropping alloc info"); - bch_info(c, "dropping and reconstructing all alloc info"); - - bch2_reconstruct_alloc(c); - } - - if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { - /* We can't go RW to fix errors without alloc info */ - if (c->opts.fix_errors == FSCK_FIX_yes || - c->opts.fix_errors == FSCK_FIX_ask) - c->opts.fix_errors = FSCK_FIX_no; - if (c->opts.errors == BCH_ON_ERROR_fix_safe) - c->opts.errors = BCH_ON_ERROR_continue; - } - - /* - * After an unclean shutdown, skip then next few journal sequence - * numbers as they may have been referenced by btree writes that - * happened before their corresponding journal writes - those btree - * writes need to be ignored, by skipping and blacklisting the next few - * journal sequence numbers: - */ - if (!c->sb.clean) - journal_seq += JOURNAL_BUF_NR * 4; - - if (blacklist_seq != journal_seq) { - ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", - blacklist_seq, journal_seq) ?: - bch2_journal_seq_blacklist_add(c, - blacklist_seq, journal_seq); - if (ret) { - bch_err_msg(c, ret, "error creating new journal seq blacklist entry"); - goto err; - } - } - - ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", - journal_seq, last_seq, blacklist_seq - 1) ?: - bch2_fs_journal_start(&c->journal, last_seq, journal_seq); - if (ret) - goto err; - - /* - * Skip past versions that might have possibly been used (as nonces), - * but hadn't had their pointers written: - */ - if (c->sb.encryption_type && !c->sb.clean) - atomic64_add(1 << 16, &c->key_version); - - ret = read_btree_roots(c); - if (ret) - goto err; - - set_bit(BCH_FS_btree_running, &c->flags); - - ret = bch2_sb_set_upgrade_extra(c); - if (ret) - goto err; - - ret = bch2_run_recovery_passes(c, 0); - if (ret) - goto err; - - /* - * Normally set by the appropriate recovery pass: when cleared, this - * indicates we're in early recovery and btree updates should be done by - * being applied 
to the journal replay keys. _Must_ be set before - * multithreaded use: - */ - set_bit(BCH_FS_may_go_rw, &c->flags); - clear_bit(BCH_FS_in_fsck, &c->flags); - - /* in case we don't run journal replay, i.e. norecovery mode */ - set_bit(BCH_FS_accounting_replay_done, &c->flags); - - bch2_async_btree_node_rewrites_flush(c); - - /* fsync if we fixed errors */ - if (test_bit(BCH_FS_errors_fixed, &c->flags)) { - bch2_journal_flush_all_pins(&c->journal); - bch2_journal_meta(&c->journal); - } - - /* If we fixed errors, verify that fs is actually clean now: */ - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - test_bit(BCH_FS_errors_fixed, &c->flags) && - !test_bit(BCH_FS_errors_not_fixed, &c->flags) && - !test_bit(BCH_FS_error, &c->flags)) { - bch2_flush_fsck_errs(c); - - bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); - clear_bit(BCH_FS_errors_fixed, &c->flags); - - ret = bch2_run_recovery_passes(c, - BCH_RECOVERY_PASS_check_alloc_info); - if (ret) - goto err; - - if (test_bit(BCH_FS_errors_fixed, &c->flags) || - test_bit(BCH_FS_errors_not_fixed, &c->flags)) { - bch_err(c, "Second fsck run was not clean"); - set_bit(BCH_FS_errors_not_fixed, &c->flags); - } - - set_bit(BCH_FS_errors_fixed, &c->flags); - } - - if (enabled_qtypes(c)) { - bch_verbose(c, "reading quotas"); - ret = bch2_fs_quota_read(c); - if (ret) - goto err; - bch_verbose(c, "quotas done"); - } - - mutex_lock(&c->sb_lock); - ext = bch2_sb_field_get(c->disk_sb.sb, ext); - write_sb = false; - - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version)); - write_sb = true; - } - - if (!test_bit(BCH_FS_error, &c->flags) && - !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) { - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); - write_sb = true; - } - - if (!test_bit(BCH_FS_error, &c->flags) && - !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) { - memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); - write_sb = true; - } - - if (c->opts.fsck && - !test_bit(BCH_FS_error, &c->flags) && - c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 && - ext->btrees_lost_data) { - ext->btrees_lost_data = 0; - write_sb = true; - } - - if (c->opts.fsck && - !test_bit(BCH_FS_error, &c->flags) && - !test_bit(BCH_FS_errors_not_fixed, &c->flags)) { - SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); - SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); - write_sb = true; - } - - if (bch2_blacklist_entries_gc(c)) - write_sb = true; - - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || - c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { - struct bch_move_stats stats; - - bch2_move_stats_init(&stats, "recovery"); - - struct printbuf buf = PRINTBUF; - bch2_version_to_text(&buf, c->sb.version_min); - bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf); - printbuf_exit(&buf); - - ret = bch2_fs_read_write_early(c) ?: - bch2_scan_old_btree_nodes(c, &stats); - if (ret) - goto err; - bch_info(c, "scanning for old btree nodes done"); - } - - ret = 0; -out: - bch2_flush_fsck_errs(c); - - if (!ret && - test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && - !c->opts.nochanges) { - bch2_fs_read_write_early(c); - bch2_delete_dead_snapshots_async(c); - } - - bch_err_fn(c, ret); -final_out: - if (!IS_ERR(clean)) - 
kfree(clean); - return ret; -err: -fsck_err: - { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret)); - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - goto final_out; -} - -int bch2_fs_initialize(struct bch_fs *c) -{ - struct bch_inode_unpacked root_inode, lostfound_inode; - struct bkey_inode_buf packed_inode; - struct qstr lostfound = QSTR("lost+found"); - struct bch_member *m; - int ret; - - bch_notice(c, "initializing new filesystem"); - set_bit(BCH_FS_new_fs, &c->flags); - - mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); - - bch2_check_version_downgrade(c); - - if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { - bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - bch2_write_super(c); - } - - for_each_member_device(c, ca) { - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false); - ca->mi = bch2_mi_to_cpu(m); - } - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - set_bit(BCH_FS_btree_running, &c->flags); - set_bit(BCH_FS_may_go_rw, &c->flags); - - for (unsigned i = 0; i < BTREE_ID_NR; i++) - bch2_btree_root_alloc_fake(c, i, 0); - - ret = bch2_fs_journal_alloc(c); - if (ret) - goto err; - - /* - * journal_res_get() will crash if called before this has - * set up the journal.pin FIFO and journal.cur pointer: - */ - ret = bch2_fs_journal_start(&c->journal, 1, 1); - if (ret) - goto err; - - ret = bch2_fs_read_write_early(c); - if (ret) - goto err; - - set_bit(BCH_FS_accounting_replay_done, &c->flags); - bch2_journal_set_replay_done(&c->journal); - - for_each_member_device(c, ca) { - ret = bch2_dev_usage_init(ca, false); - if (ret) { - bch2_dev_put(ca); - goto err; - } - } - - /* - * Write out the superblock and journal buckets, now that we can do - * btree updates - */ - bch_verbose(c, "marking superblocks"); - ret = bch2_trans_mark_dev_sbs(c); - bch_err_msg(c, ret, "marking superblocks"); - if (ret) - goto err; - - ret = bch2_fs_freespace_init(c); - if (ret) - goto err; - - ret = bch2_initialize_subvolumes(c); - if (ret) - goto err; - - bch_verbose(c, "reading snapshots table"); - ret = bch2_snapshots_read(c); - if (ret) - goto err; - bch_verbose(c, "reading snapshots done"); - - bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); - root_inode.bi_inum = BCACHEFS_ROOT_INO; - root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - bch2_inode_pack(&packed_inode, &root_inode); - packed_inode.inode.k.p.snapshot = U32_MAX; - - ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "creating root directory"); - if (ret) - goto err; - - bch2_inode_init_early(c, &lostfound_inode); - - ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_create_trans(trans, - BCACHEFS_ROOT_SUBVOL_INUM, - &root_inode, &lostfound_inode, - &lostfound, - 0, 0, S_IFDIR|0700, 0, - NULL, NULL, (subvol_inum) { 0 }, 0)); - bch_err_msg(c, ret, "creating lost+found"); - if (ret) - goto err; - - c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1; - - bch2_copygc_wakeup(c); - bch2_rebalance_wakeup(c); - - if (enabled_qtypes(c)) { - ret = bch2_fs_quota_read(c); - if (ret) - goto err; - } - - ret = 
bch2_journal_flush(&c->journal); - bch_err_msg(c, ret, "writing first journal entry"); - if (ret) - goto err; - - mutex_lock(&c->sb_lock); - SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - c->recovery.curr_pass = BCH_RECOVERY_PASS_NR; - return 0; -err: - bch_err_fn(c, ret); - return ret; -} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h deleted file mode 100644 index c023f52fc2d6..000000000000 --- a/fs/bcachefs/recovery.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_H -#define _BCACHEFS_RECOVERY_H - -int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id); -void bch2_reconstruct_alloc(struct bch_fs *); - -int bch2_journal_replay(struct bch_fs *); - -int bch2_fs_recovery(struct bch_fs *); -int bch2_fs_initialize(struct bch_fs *); - -#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c deleted file mode 100644 index 6a039e011064..000000000000 --- a/fs/bcachefs/recovery_passes.c +++ /dev/null @@ -1,646 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "backpointers.h" -#include "btree_gc.h" -#include "btree_node_scan.h" -#include "disk_accounting.h" -#include "ec.h" -#include "fsck.h" -#include "inode.h" -#include "journal.h" -#include "lru.h" -#include "logged_ops.h" -#include "movinggc.h" -#include "rebalance.h" -#include "recovery.h" -#include "recovery_passes.h" -#include "snapshot.h" -#include "subvolume.h" -#include "super.h" -#include "super-io.h" - -const char * const bch2_recovery_passes[] = { -#define x(_fn, ...) #_fn, - BCH_RECOVERY_PASSES() -#undef x - NULL -}; - -static const u8 passes_to_stable_map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, - BCH_RECOVERY_PASSES() -#undef x -}; - -static const u8 passes_from_stable_map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x -}; - -static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) -{ - return passes_to_stable_map[pass]; -} - -u64 bch2_recovery_passes_to_stable(u64 v) -{ - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(passes_to_stable_map[i]); - return ret; -} - -static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass) -{ - return pass < ARRAY_SIZE(passes_from_stable_map) - ? 
passes_from_stable_map[pass] - : 0; -} - -u64 bch2_recovery_passes_from_stable(u64 v) -{ - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(passes_from_stable_map[i]); - return ret; -} - -static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - return 0; -} - -static void bch2_sb_recovery_passes_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_recovery_passes *r = - field_to_type(f, recovery_passes); - unsigned nr = recovery_passes_nr_entries(r); - - if (out->nr_tabstops < 1) - printbuf_tabstop_push(out, 32); - if (out->nr_tabstops < 2) - printbuf_tabstop_push(out, 16); - - prt_printf(out, "Pass\tLast run\tLast runtime\n"); - - for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) { - if (!i->last_run) - continue; - - unsigned idx = i - r->start; - - prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]); - - bch2_prt_datetime(out, le64_to_cpu(i->last_run)); - prt_tab(out); - - bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC); - - if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) - prt_str(out, " (no ratelimit)"); - - prt_newline(out); - } -} - -static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); - - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_recovery_passes *r = - bch2_sb_field_get(c->disk_sb.sb, recovery_passes); - - if (stable >= recovery_passes_nr_entries(r)) { - unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64); - - r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s); - if (!r) { - bch_err(c, "error creating recovery_passes sb section"); - return NULL; - } - } - - return r->start + stable; -} - -static void bch2_sb_recovery_pass_complete(struct bch_fs *c, - enum bch_recovery_pass pass, - s64 start_time) -{ - guard(mutex)(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __clear_bit_le64(bch2_recovery_pass_to_stable(pass), - ext->recovery_passes_required); - - struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); - if (e) { - s64 end_time = ktime_get_real_seconds(); - e->last_run = cpu_to_le64(end_time); - e->last_runtime = cpu_to_le32(max(0, end_time - start_time)); - SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); - } - - bch2_write_super(c); -} - -void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - guard(mutex)(&c->sb_lock); - - struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); - if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) { - SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true); - bch2_write_super(c); - } -} - -static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) -{ - enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); - bool ret = false; - - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_recovery_passes *r = - bch2_sb_field_get(c->disk_sb.sb, recovery_passes); - - if (stable < recovery_passes_nr_entries(r)) { - struct recovery_pass_entry *i = r->start + stable; - - /* - * Ratelimit if the last runtime was more than 1% of the time - * since we last ran - */ - ret = (u64) le32_to_cpu(i->last_runtime) * 100 > - ktime_get_real_seconds() - le64_to_cpu(i->last_run); - - 
if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) - ret = false; - } - - return ret; -} - -const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = { - .validate = bch2_sb_recovery_passes_validate, - .to_text = bch2_sb_recovery_passes_to_text -}; - -/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */ -static int bch2_recovery_pass_empty(struct bch_fs *c) -{ - return 0; -} - -static int bch2_set_may_go_rw(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - /* - * After we go RW, the journal keys buffer can't be modified (except for - * setting journal_key->overwritten: it will be accessed by multiple - * threads - */ - move_gap(keys, keys->nr); - - set_bit(BCH_FS_may_go_rw, &c->flags); - - if (go_rw_in_recovery(c)) { - if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { - bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); - bch2_reconstruct_alloc(c); - } - - return bch2_fs_read_write_early(c); - } - return 0; -} - -/* - * Make sure root inode is readable while we're still in recovery and can rewind - * for repair: - */ -static int bch2_lookup_root_inode(struct bch_fs *c) -{ - subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM; - struct bch_inode_unpacked inode_u; - struct bch_subvolume subvol; - - return bch2_trans_do(c, - bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); -} - -struct recovery_pass_fn { - int (*fn)(struct bch_fs *); - unsigned when; -}; - -static struct recovery_pass_fn recovery_pass_fns[] = { -#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, - BCH_RECOVERY_PASSES() -#undef x -}; - -static u64 bch2_recovery_passes_match(unsigned flags) -{ - u64 ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) - if (recovery_pass_fns[i].when & flags) - ret |= BIT_ULL(i); - return ret; -} - -u64 bch2_fsck_recovery_passes(void) -{ - return bch2_recovery_passes_match(PASS_FSCK); -} - -static void bch2_run_async_recovery_passes(struct bch_fs *c) -{ - if (down_trylock(&c->recovery.run_lock)) - return; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes)) - goto unlock; - - if (queue_work(system_long_wq, &c->recovery.work)) - return; - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); -unlock: - up(&c->recovery.run_lock); -} - -static bool recovery_pass_needs_set(struct bch_fs *c, - enum bch_recovery_pass pass, - enum bch_run_recovery_pass_flags *flags) -{ - struct bch_fs_recovery *r = &c->recovery; - - /* - * Never run scan_for_btree_nodes persistently: check_topology will run - * it if required - */ - if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) - *flags |= RUN_RECOVERY_PASS_nopersistent; - - if ((*flags & RUN_RECOVERY_PASS_ratelimit) && - !bch2_recovery_pass_want_ratelimit(c, pass)) - *flags &= ~RUN_RECOVERY_PASS_ratelimit; - - /* - * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do - * anything if the pass has already run: these mean we need a prior pass - * to run before we continue to repair, we don't expect that pass to fix - * the damage we encountered. - * - * Otherwise, we run run_explicit_recovery_pass when we find damage, so - * it should run again even if it's already run: - */ - bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); - bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); - bool rewind = in_recovery && - r->curr_pass > pass && - !(r->passes_complete & BIT_ULL(pass)); - - if (persistent - ? 
!(c->sb.recovery_passes_required & BIT_ULL(pass)) - : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass))) - return true; - - if (!(*flags & RUN_RECOVERY_PASS_ratelimit) && - (r->passes_ratelimiting & BIT_ULL(pass))) - return true; - - if (rewind) - return true; - - return false; -} - -/* - * For when we need to rewind recovery passes and run a pass we skipped: - */ -int __bch2_run_explicit_recovery_pass(struct bch_fs *c, - struct printbuf *out, - enum bch_recovery_pass pass, - enum bch_run_recovery_pass_flags flags) -{ - struct bch_fs_recovery *r = &c->recovery; - int ret = 0; - - lockdep_assert_held(&c->sb_lock); - - bch2_printbuf_make_room(out, 1024); - out->atomic++; - - unsigned long lockflags; - spin_lock_irqsave(&r->lock, lockflags); - - if (!recovery_pass_needs_set(c, pass, &flags)) - goto out; - - bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); - bool rewind = in_recovery && - r->curr_pass > pass && - !(r->passes_complete & BIT_ULL(pass)); - bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; - - if (!(flags & RUN_RECOVERY_PASS_nopersistent)) { - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); - } - - if (pass < BCH_RECOVERY_PASS_set_may_go_rw && - (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) { - prt_printf(out, "need recovery pass %s (%u), but already rw\n", - bch2_recovery_passes[pass], pass); - ret = bch_err_throw(c, cannot_rewind_recovery); - goto out; - } - - if (ratelimit) - r->passes_ratelimiting |= BIT_ULL(pass); - else - r->passes_ratelimiting &= ~BIT_ULL(pass); - - if (in_recovery && !ratelimit) { - prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[r->curr_pass], r->curr_pass, - rewind ? " - rewinding" : ""); - - r->passes_to_run |= BIT_ULL(pass); - - if (rewind) { - r->next_pass = pass; - r->passes_complete &= (1ULL << pass) - 1; - ret = bch_err_throw(c, restart_recovery); - } - } else { - prt_printf(out, "scheduling recovery pass %s (%u)%s\n", - bch2_recovery_passes[pass], pass, - ratelimit ? 
" - ratelimiting" : ""); - - struct recovery_pass_fn *p = recovery_pass_fns + pass; - if (p->when & PASS_ONLINE) - bch2_run_async_recovery_passes(c); - } -out: - spin_unlock_irqrestore(&r->lock, lockflags); - --out->atomic; - return ret; -} - -int bch2_run_explicit_recovery_pass(struct bch_fs *c, - struct printbuf *out, - enum bch_recovery_pass pass, - enum bch_run_recovery_pass_flags flags) -{ - int ret = 0; - - if (recovery_pass_needs_set(c, pass, &flags)) { - guard(mutex)(&c->sb_lock); - ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); - bch2_write_super(c); - } - - return ret; -} - -/* - * Returns 0 if @pass has run recently, otherwise one of - * -BCH_ERR_restart_recovery - * -BCH_ERR_recovery_pass_will_run - */ -int bch2_require_recovery_pass(struct bch_fs *c, - struct printbuf *out, - enum bch_recovery_pass pass) -{ - if (test_bit(BCH_FS_in_recovery, &c->flags) && - c->recovery.passes_complete & BIT_ULL(pass)) - return 0; - - guard(mutex)(&c->sb_lock); - - if (bch2_recovery_pass_want_ratelimit(c, pass)) - return 0; - - enum bch_run_recovery_pass_flags flags = 0; - int ret = 0; - - if (recovery_pass_needs_set(c, pass, &flags)) { - ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); - bch2_write_super(c); - } - - return ret ?: bch_err_throw(c, recovery_pass_will_run); -} - -int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - enum bch_run_recovery_pass_flags flags = 0; - - if (!recovery_pass_needs_set(c, pass, &flags)) - return 0; - - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - mutex_lock(&c->sb_lock); - int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass, - RUN_RECOVERY_PASS_nopersistent); - mutex_unlock(&c->sb_lock); - - bch2_print_str(c, KERN_NOTICE, buf.buf); - printbuf_exit(&buf); - return ret; -} - -static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct bch_fs_recovery *r = &c->recovery; - struct recovery_pass_fn *p = recovery_pass_fns + pass; - - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); - - s64 start_time = ktime_get_real_seconds(); - int ret = p->fn(c); - - r->passes_to_run &= ~BIT_ULL(pass); - - if (ret) { - r->passes_failing |= BIT_ULL(pass); - return ret; - } - - r->passes_failing = 0; - - if (!test_bit(BCH_FS_error, &c->flags)) - bch2_sb_recovery_pass_complete(c, pass, start_time); - - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_CONT " done\n"); - - return 0; -} - -static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, - bool online) -{ - struct bch_fs_recovery *r = &c->recovery; - int ret = 0; - - spin_lock_irq(&r->lock); - - if (online) - orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE); - - if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) - orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC); - - /* - * A failed recovery pass will be retried after another pass succeeds - - * but not this iteration. - * - * This is because some passes depend on repair done by other passes: we - * may want to retry, but we don't want to loop on failing passes. 
- */ - - orig_passes_to_run &= ~r->passes_failing; - - r->passes_to_run = orig_passes_to_run; - - while (r->passes_to_run) { - unsigned prev_done = r->pass_done; - unsigned pass = __ffs64(r->passes_to_run); - r->curr_pass = pass; - r->next_pass = r->curr_pass + 1; - r->passes_to_run &= ~BIT_ULL(pass); - - spin_unlock_irq(&r->lock); - - int ret2 = bch2_run_recovery_pass(c, pass) ?: - bch2_journal_flush(&c->journal); - - spin_lock_irq(&r->lock); - - if (r->next_pass < r->curr_pass) { - /* Rewind: */ - r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass); - } else if (!ret2) { - r->pass_done = max(r->pass_done, pass); - r->passes_complete |= BIT_ULL(pass); - } else { - ret = ret2; - } - - if (ret && !online) - break; - - if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && - r->pass_done > BCH_RECOVERY_PASS_check_snapshots) { - bch2_copygc_wakeup(c); - bch2_rebalance_wakeup(c); - } - } - - clear_bit(BCH_FS_in_recovery, &c->flags); - spin_unlock_irq(&r->lock); - - return ret; -} - -static void bch2_async_recovery_passes_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, recovery.work); - struct bch_fs_recovery *r = &c->recovery; - - __bch2_run_recovery_passes(c, - c->sb.recovery_passes_required & ~r->passes_ratelimiting, - true); - - up(&r->run_lock); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); -} - -int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes) -{ - return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true); -} - -int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) -{ - u64 passes = - bch2_recovery_passes_match(PASS_ALWAYS) | - (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) | - (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) | - c->opts.recovery_passes | - c->sb.recovery_passes_required; - - if (c->opts.recovery_pass_last) - passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1; - - /* - * We can't allow set_may_go_rw to be excluded; that would cause us to - * use the journal replay keys for updates where it's not expected. 
- */ - c->opts.recovery_passes_exclude &= ~BIT_ULL(BCH_RECOVERY_PASS_set_may_go_rw); - passes &= ~c->opts.recovery_passes_exclude; - - passes &= ~(BIT_ULL(from) - 1); - - down(&c->recovery.run_lock); - int ret = __bch2_run_recovery_passes(c, passes, false); - up(&c->recovery.run_lock); - - return ret; -} - -static void prt_passes(struct printbuf *out, const char *msg, u64 passes) -{ - prt_printf(out, "%s:\t", msg); - prt_bitflags(out, bch2_recovery_passes, passes); - prt_newline(out); -} - -void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_fs_recovery *r = &c->recovery; - - printbuf_tabstop_push(out, 32); - prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required); - prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required & - bch2_recovery_passes_match(PASS_ONLINE)); - prt_passes(out, "Complete passes", r->passes_complete); - prt_passes(out, "Failing passes", r->passes_failing); - - if (r->curr_pass) { - prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]); - prt_passes(out, "Current passes", r->passes_to_run); - } -} - -void bch2_fs_recovery_passes_init(struct bch_fs *c) -{ - spin_lock_init(&c->recovery.lock); - sema_init(&c->recovery.run_lock, 1); - - INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work); -} diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h deleted file mode 100644 index 2117f0ce1922..000000000000 --- a/fs/bcachefs/recovery_passes.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef _BCACHEFS_RECOVERY_PASSES_H -#define _BCACHEFS_RECOVERY_PASSES_H - -extern const char * const bch2_recovery_passes[]; - -extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes; - -u64 bch2_recovery_passes_to_stable(u64 v); -u64 bch2_recovery_passes_from_stable(u64 v); - -u64 bch2_fsck_recovery_passes(void); - -void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass); - -enum bch_run_recovery_pass_flags { - RUN_RECOVERY_PASS_nopersistent = BIT(0), - RUN_RECOVERY_PASS_ratelimit = BIT(1), -}; - -static inline bool go_rw_in_recovery(struct bch_fs *c) -{ - return (c->journal_keys.nr || - !c->opts.read_only || - !c->sb.clean || - c->opts.recovery_passes || - (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))); -} - -int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); - -int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, - enum bch_recovery_pass, - enum bch_run_recovery_pass_flags); -int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, - enum bch_recovery_pass, - enum bch_run_recovery_pass_flags); - -int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *, - enum bch_recovery_pass); - -int bch2_run_online_recovery_passes(struct bch_fs *, u64); -int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); - -void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *); - -void bch2_fs_recovery_passes_init(struct bch_fs *); - -#endif /* _BCACHEFS_RECOVERY_PASSES_H */ diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h deleted file mode 100644 index b63c20558d3d..000000000000 --- a/fs/bcachefs/recovery_passes_format.h +++ /dev/null @@ -1,106 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H -#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H - -#define PASS_SILENT BIT(0) -#define PASS_FSCK BIT(1) -#define PASS_UNCLEAN BIT(2) -#define PASS_ALWAYS BIT(3) 
-#define PASS_ONLINE BIT(4) -#define PASS_ALLOC BIT(5) -#define PASS_FSCK_ALLOC (PASS_FSCK|PASS_ALLOC) - -#ifdef CONFIG_BCACHEFS_DEBUG -#define PASS_FSCK_DEBUG BIT(1) -#else -#define PASS_FSCK_DEBUG 0 -#endif - -/* - * Passes may be reordered, but the second field is a persistent identifier and - * must never change: - */ -#define BCH_RECOVERY_PASSES() \ - x(recovery_pass_empty, 41, PASS_SILENT) \ - x(scan_for_btree_nodes, 37, 0) \ - x(check_topology, 4, 0) \ - x(accounting_read, 39, PASS_ALWAYS) \ - x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, 0) \ - x(initialize_subvolumes, 2, 0) \ - x(snapshots_read, 3, PASS_ALWAYS) \ - x(check_allocations, 5, PASS_FSCK_ALLOC) \ - x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ - x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ - x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ - x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ - x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, 17, 0) \ - x(reconstruct_snapshots, 38, 0) \ - x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ - x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ - x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ - x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ - x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ - x(fs_upgrade_for_subvolumes, 22, 0) \ - x(check_inodes, 24, PASS_FSCK) \ - x(check_extents, 25, PASS_FSCK) \ - x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ - x(check_dirents, 27, PASS_FSCK) \ - x(check_xattrs, 28, PASS_FSCK) \ - x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ - x(check_unreachable_inodes, 40, PASS_FSCK) \ - x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ - x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ - x(check_nlinks, 31, PASS_FSCK) \ - x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ - x(delete_dead_inodes, 32, PASS_ALWAYS) \ - x(fix_reflink_p, 33, 0) \ - x(set_fs_needs_rebalance, 34, 0) \ - x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT) - -/* We normally enumerate recovery passes in the order we run them: */ -enum bch_recovery_pass { -#define x(n, id, when) BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - BCH_RECOVERY_PASS_NR -}; - -/* But we also need stable identifiers that can be used in the superblock */ -enum bch_recovery_pass_stable { -#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, - BCH_RECOVERY_PASSES() -#undef x -}; - -struct recovery_pass_entry { - __le64 last_run; - __le32 last_runtime; - __le32 flags; -}; - -LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT, struct recovery_pass_entry, flags, 0, 1) - -struct bch_sb_field_recovery_passes { - struct bch_sb_field field; - struct recovery_pass_entry start[]; -}; - -static inline unsigned -recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r) -{ - return r - ? 
((vstruct_end(&r->field) - (void *) &r->start[0]) / - sizeof(struct recovery_pass_entry)) - : 0; -} - -#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */ diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h deleted file mode 100644 index aa9526938cc3..000000000000 --- a/fs/bcachefs/recovery_passes_types.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H -#define _BCACHEFS_RECOVERY_PASSES_TYPES_H - -struct bch_fs_recovery { - /* - * Two different uses: - * "Has this fsck pass?" - i.e. should this type of error be an - * emergency read-only - * And, in certain situations fsck will rewind to an earlier pass: used - * for signaling to the toplevel code which pass we want to run now. - */ - enum bch_recovery_pass curr_pass; - enum bch_recovery_pass next_pass; - /* never rewinds version of curr_pass */ - enum bch_recovery_pass pass_done; - u64 passes_to_run; - /* bitmask of recovery passes that we actually ran */ - u64 passes_complete; - u64 passes_failing; - u64 passes_ratelimiting; - spinlock_t lock; - struct semaphore run_lock; - struct work_struct work; -}; - -#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c deleted file mode 100644 index 92b90cfe622b..000000000000 --- a/fs/bcachefs/reflink.c +++ /dev/null @@ -1,865 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "inode.h" -#include "io_misc.h" -#include "io_write.h" -#include "rebalance.h" -#include "reflink.h" -#include "subvolume.h" -#include "super-io.h" - -#include <linux/sched/signal.h> - -static inline bool bkey_extent_is_reflink_data(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_reflink_v: - case KEY_TYPE_indirect_inline_data: - return true; - default: - return false; - } -} - -static inline unsigned bkey_type_to_indirect(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_extent: - return KEY_TYPE_reflink_v; - case KEY_TYPE_inline_data: - return KEY_TYPE_indirect_inline_data; - default: - return 0; - } -} - -/* reflink pointers */ - -int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - int ret = 0; - - bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad), - c, reflink_p_front_pad_bad, - "idx < front_pad (%llu < %u)", - REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad)); -fsck_err: - return ret; -} - -void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - - prt_printf(out, "idx %llu front_pad %u back_pad %u", - REFLINK_P_IDX(p.v), - le32_to_cpu(p.v->front_pad), - le32_to_cpu(p.v->back_pad)); - - if (REFLINK_P_ERROR(p.v)) - prt_str(out, " error"); -} - -bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -{ - struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); - struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r); - - /* - * Disabled for now, the triggers code needs to be reworked for merging - * of reflink pointers to work: - */ - return false; - - if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v)) - return false; - - if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v)) - return false; - - bch2_key_resize(l.k, l.k->size + r.k->size); 
- return true; -} - -/* indirect extents */ - -int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)), - c, reflink_v_pos_bad, - "indirect extent above maximum position 0:%llu", - REFLINK_P_IDX_MAX); - - ret = bch2_bkey_ptrs_validate(c, k, from); -fsck_err: - return ret; -} - -void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); - - bch2_bkey_ptrs_to_text(out, c, k); -} - -#if 0 -Currently disabled, needs to be debugged: - -bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -{ - struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); - - return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); -} -#endif - -/* indirect inline data */ - -int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return 0; -} - -void bch2_indirect_inline_data_to_text(struct printbuf *out, - struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); - unsigned datalen = bkey_inline_data_bytes(k.k); - - prt_printf(out, "refcount %llu datalen %u: %*phN", - le64_to_cpu(d.v->refcount), datalen, - min(datalen, 32U), d.v->data); -} - -/* lookup */ - -static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p, - bool should_commit) -{ - struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - SET_REFLINK_P_ERROR(&new->v, false); - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); - if (ret) - return ret; - - if (!should_commit) - return 0; - - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; -} - -static int bch2_indirect_extent_missing_error(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 missing_start, u64 missing_end, - bool should_commit) -{ - if (REFLINK_P_ERROR(p.v)) - return 0; - - struct bch_fs *c = trans->c; - u64 live_start = REFLINK_P_IDX(p.v); - u64 live_end = REFLINK_P_IDX(p.v) + p.k->size; - u64 refd_start = live_start - le32_to_cpu(p.v->front_pad); - u64 refd_end = live_end + le32_to_cpu(p.v->back_pad); - struct printbuf buf = PRINTBUF; - int ret = 0; - - BUG_ON(missing_start < refd_start); - BUG_ON(missing_end > refd_end); - - struct bpos missing_pos = bkey_start_pos(p.k); - missing_pos.offset += missing_start - live_start; - - prt_printf(&buf, "pointer to missing indirect extent in "); - ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos); - if (ret) - goto err; - - prt_printf(&buf, "-%llu\n", (missing_pos.offset + (missing_end - missing_start)) << 9); - bch2_bkey_val_to_text(&buf, c, p.s_c); - - prt_printf(&buf, "\nmissing reflink btree range %llu-%llu", - missing_start, missing_end); - - if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) { - struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - /* - * Is the missing range not actually needed? 
- * - * p.v->idx refers to the data that we actually want, but if the - * indirect extent we point to was bigger, front_pad and back_pad - * indicate the range we took a reference on. - */ - - if (missing_end <= live_start) { - new->v.front_pad = cpu_to_le32(live_start - missing_end); - } else if (missing_start >= live_end) { - new->v.back_pad = cpu_to_le32(missing_start - live_end); - } else { - struct bpos new_start = bkey_start_pos(&new->k); - struct bpos new_end = new->k.p; - - if (missing_start > live_start) - new_start.offset += missing_start - live_start; - if (missing_end < live_end) - new_end.offset -= live_end - missing_end; - - bch2_cut_front(new_start, &new->k_i); - bch2_cut_back(new_end, &new->k_i); - - SET_REFLINK_P_ERROR(&new->v, true); - } - - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); - if (ret) - goto err; - - if (should_commit) - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* - * This is used from the read path, which doesn't expect to have to do a - * transaction commit, and from triggers, which should not be doing a commit: - */ -struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, - struct btree_iter *iter, - s64 *offset_into_extent, - struct bkey_s_c_reflink_p p, - bool should_commit, - unsigned iter_flags) -{ - BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad))); - BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad)); - - u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent; - - struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink, - POS(0, reflink_offset), iter_flags); - if (bkey_err(k)) - return k; - - if (unlikely(!bkey_extent_is_reflink_data(k.k))) { - u64 missing_end = min(k.k->p.offset, - REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad)); - BUG_ON(reflink_offset == missing_end); - - int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, - missing_end, should_commit); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return bkey_s_c_err(ret); - } - } else if (unlikely(REFLINK_P_ERROR(p.v))) { - int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return bkey_s_c_err(ret); - } - } - - *offset_into_extent = reflink_offset - bkey_start_offset(k.k); - return k; -} - -/* reflink pointer trigger */ - -static int trans_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, u64 *idx, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v); - struct btree_iter iter; - struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false, - BTREE_ITER_intent| - BTREE_ITER_with_updates); - int ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_refcount_c(k)) { - if (!(flags & BTREE_TRIGGER_overwrite)) - ret = bch_err_throw(c, missing_indirect_extent); - goto next; - } - - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - __le64 *refcount = bkey_refcount(bkey_i_to_s(new)); - if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - log_fsck_err(trans, reflink_refcount_underflow, - "indirect extent 
refcount underflow while marking\n%s", - buf.buf); - goto next; - } - - if (flags & BTREE_TRIGGER_insert) { - struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; - u64 pad; - - pad = max_t(s64, le32_to_cpu(v->front_pad), - REFLINK_P_IDX(v) - bkey_start_offset(&new->k)); - BUG_ON(pad > U32_MAX); - v->front_pad = cpu_to_le32(pad); - - pad = max_t(s64, le32_to_cpu(v->back_pad), - new->k.p.offset - p.k->size - REFLINK_P_IDX(v)); - BUG_ON(pad > U32_MAX); - v->back_pad = cpu_to_le32(pad); - } - - le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1); - - bch2_btree_iter_set_pos_to_extent_start(&iter); - ret = bch2_trans_update(trans, &iter, new, 0); - if (ret) - goto err; -next: - *idx = k.k->p.offset; -err: -fsck_err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -} - -static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, u64 *idx, - enum btree_iter_update_trigger_flags flags, - size_t r_idx) -{ - struct bch_fs *c = trans->c; - struct reflink_gc *r; - int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1; - u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); - s64 ret = 0; - struct printbuf buf = PRINTBUF; - - if (r_idx >= c->reflink_gc_nr) - goto not_found; - - r = genradix_ptr(&c->reflink_gc_table, r_idx); - next_idx = min(next_idx, r->offset - r->size); - if (*idx < next_idx) - goto not_found; - - BUG_ON((s64) r->refcount + add < 0); - - if (flags & BTREE_TRIGGER_gc) - r->refcount += add; - *idx = r->offset; - return 0; -not_found: - if (flags & BTREE_TRIGGER_check_repair) { - ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); - if (ret) - goto err; - } - - *idx = next_idx; -err: - printbuf_exit(&buf); - return ret; -} - -static int __trigger_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - int ret = 0; - - u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); - u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); - - if (flags & BTREE_TRIGGER_transactional) { - while (idx < end && !ret) - ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags); - } - - if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) { - size_t l = 0, r = c->reflink_gc_nr; - - while (l < r) { - size_t m = l + (r - l) / 2; - struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m); - if (ref->offset <= idx) - l = m + 1; - else - r = m; - } - - while (idx < end && !ret) - ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++); - } - - return ret; -} - -int bch2_trigger_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - if ((flags & BTREE_TRIGGER_transactional) && - (flags & BTREE_TRIGGER_insert)) { - struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v; - - v->front_pad = v->back_pad = 0; - } - - return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags); -} - -/* indirect extent trigger */ - -static inline void -check_indirect_extent_deleting(struct bkey_s new, - enum btree_iter_update_trigger_flags *flags) -{ - if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) { - new.k->type = KEY_TYPE_deleted; - new.k->size = 0; - set_bkey_val_u64s(new.k, 0); - *flags &= 
~BTREE_TRIGGER_insert; - } -} - -int bch2_trigger_reflink_v(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - if ((flags & BTREE_TRIGGER_transactional) && - (flags & BTREE_TRIGGER_insert)) - check_indirect_extent_deleting(new, &flags); - - return bch2_trigger_extent(trans, btree_id, level, old, new, flags); -} - -int bch2_trigger_indirect_inline_data(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - check_indirect_extent_deleting(new, &flags); - - return 0; -} - -/* create */ - -static int bch2_make_extent_indirect(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *orig, - bool reflink_p_may_update_opts_field) -{ - struct bch_fs *c = trans->c; - struct btree_iter reflink_iter = {}; - struct bkey_s_c k; - struct bkey_i *r_v; - struct bkey_i_reflink_p *r_p; - __le64 *refcount; - int ret; - - if (orig->k.type == KEY_TYPE_inline_data) - bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); - - bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, - BTREE_ITER_intent); - k = bch2_btree_iter_peek_prev(trans, &reflink_iter); - ret = bkey_err(k); - if (ret) - goto err; - - /* - * XXX: we're assuming that 56 bits will be enough for the life of the - * filesystem: we need to implement wraparound, with a cursor in the - * logged ops btree: - */ - if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size))) - return -ENOSPC; - - r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); - ret = PTR_ERR_OR_ZERO(r_v); - if (ret) - goto err; - - bkey_init(&r_v->k); - r_v->k.type = bkey_type_to_indirect(&orig->k); - r_v->k.p = reflink_iter.pos; - bch2_key_resize(&r_v->k, orig->k.size); - r_v->k.bversion = orig->k.bversion; - - set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); - - refcount = bkey_refcount(bkey_i_to_s(r_v)); - *refcount = 0; - memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); - - ret = bch2_trans_update(trans, &reflink_iter, r_v, 0); - if (ret) - goto err; - - /* - * orig is in a bkey_buf which statically allocates 5 64s for the val, - * so we know it will be big enough: - */ - orig->k.type = KEY_TYPE_reflink_p; - r_p = bkey_i_to_reflink_p(orig); - set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); - - /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */ -#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) - __underlying_memset(&r_p->v, 0, sizeof(r_p->v)); -#else - memset(&r_p->v, 0, sizeof(r_p->v)); -#endif - - SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k)); - - if (reflink_p_may_update_opts_field) - SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true); - - ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, - BTREE_UPDATE_internal_snapshot_node); -err: - bch2_trans_iter_exit(trans, &reflink_iter); - - return ret; -} - -static struct bkey_s_c get_next_src(struct btree_trans *trans, - struct btree_iter *iter, struct bpos end) -{ - struct bkey_s_c k; - int ret; - - for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) { - if (bkey_extent_is_unwritten(k)) - continue; - - if (bkey_extent_is_data(k.k)) - return k; - } - - if (bkey_ge(iter->pos, end)) - bch2_btree_iter_set_pos(trans, iter, end); - return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; -} - -s64 bch2_remap_range(struct bch_fs *c, - subvol_inum dst_inum, u64 dst_offset, - subvol_inum src_inum, u64 src_offset, - u64 remap_sectors, - u64 new_i_size, s64 *i_sectors_delta, - bool may_change_src_io_path_opts) -{ - struct btree_trans *trans; - struct btree_iter dst_iter, src_iter; - struct bkey_s_c src_k; - struct bkey_buf new_dst, new_src; - struct bpos dst_start = POS(dst_inum.inum, dst_offset); - struct bpos src_start = POS(src_inum.inum, src_offset); - struct bpos dst_end = dst_start, src_end = src_start; - struct bch_io_opts opts; - struct bpos src_want; - u64 dst_done = 0; - u32 dst_snapshot, src_snapshot; - bool reflink_p_may_update_opts_field = - !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); - int ret = 0, ret2 = 0; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink)) - return bch_err_throw(c, erofs_no_writes); - - bch2_check_set_feature(c, BCH_FEATURE_reflink); - - dst_end.offset += remap_sectors; - src_end.offset += remap_sectors; - - bch2_bkey_buf_init(&new_dst); - bch2_bkey_buf_init(&new_src); - trans = bch2_trans_get(c); - - ret = bch2_inum_opts_get(trans, src_inum, &opts); - if (ret) - goto err; - - bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, - BTREE_ITER_intent); - bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, - BTREE_ITER_intent); - - while ((ret == 0 || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) && - bkey_lt(dst_iter.pos, dst_end)) { - struct disk_reservation disk_res = { 0 }; - - bch2_trans_begin(trans); - - if (fatal_signal_pending(current)) { - ret = -EINTR; - break; - } - - ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol, - &src_snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot); - - ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol, - &dst_snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot); - - if (dst_inum.inum < src_inum.inum) { - /* Avoid some lock cycle transaction restarts */ - ret = bch2_btree_iter_traverse(trans, &dst_iter); - if (ret) - continue; - } - - dst_done = dst_iter.pos.offset - dst_start.offset; - src_want = POS(src_start.inode, src_start.offset + dst_done); - bch2_btree_iter_set_pos(trans, &src_iter, src_want); - - src_k = get_next_src(trans, &src_iter, src_end); - ret = bkey_err(src_k); - if (ret) - continue; - - if (bkey_lt(src_want, src_iter.pos)) { - ret = bch2_fpunch_at(trans, &dst_iter, dst_inum, - min(dst_end.offset, - dst_iter.pos.offset + - src_iter.pos.offset - src_want.offset), - i_sectors_delta); - continue; - } - - if (src_k.k->type != KEY_TYPE_reflink_p) { - bch2_btree_iter_set_pos_to_extent_start(&src_iter); - - bch2_bkey_buf_reassemble(&new_src, c, src_k); - src_k = bkey_i_to_s_c(new_src.k); - - ret = bch2_make_extent_indirect(trans, &src_iter, - new_src.k, - reflink_p_may_update_opts_field); - if (ret) - continue; - - BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); - } - - if (src_k.k->type == KEY_TYPE_reflink_p) { - struct bkey_s_c_reflink_p src_p = - bkey_s_c_to_reflink_p(src_k); - struct bkey_i_reflink_p *dst_p = - bkey_reflink_p_init(new_dst.k); - - u64 offset = REFLINK_P_IDX(src_p.v) + - (src_want.offset - - bkey_start_offset(src_k.k)); - - SET_REFLINK_P_IDX(&dst_p->v, offset); - - if (reflink_p_may_update_opts_field && - may_change_src_io_path_opts && - REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v)) - SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); - } else { - BUG(); 
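The bch2_remap_range() loop above follows the bcachefs optimistic-retry idiom: each iteration begins a fresh transaction attempt, and an error that only means "locking changed underneath us, try again" loops back rather than failing the whole operation. A condensed, self-contained sketch of just that control flow — ERESTART here is a stand-in value, not the real BCH_ERR_transaction_restart machinery:

	#include <stdbool.h>
	#include <stdio.h>

	#define ERESTART 512	/* stand-in for BCH_ERR_transaction_restart */

	static int attempts;

	/* Pretend the first two attempts race with another locker: */
	static int do_one_unit_of_work(void)
	{
		return ++attempts <= 2 ? -ERESTART : 0;
	}

	static bool done(void) { return attempts >= 5; }

	int main(void)
	{
		int ret = 0;

		while ((ret == 0 || ret == -ERESTART) && !done()) {
			/* trans_begin(): discard state from the failed attempt */
			ret = do_one_unit_of_work();
			if (ret == -ERESTART)
				continue;	/* harmless restart: retry this unit */
			if (ret)
				break;		/* real error: give up */
		}
		printf("ret %d after %d attempts\n", ret, attempts);
		return 0;
	}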
- } - - new_dst.k->k.p = dst_iter.pos; - bch2_key_resize(&new_dst.k->k, - min(src_k.k->p.offset - src_want.offset, - dst_end.offset - dst_iter.pos.offset)); - - ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?: - bch2_extent_update(trans, dst_inum, &dst_iter, - new_dst.k, &disk_res, - new_i_size, i_sectors_delta, - true); - bch2_disk_reservation_put(c, &disk_res); - } - bch2_trans_iter_exit(trans, &dst_iter); - bch2_trans_iter_exit(trans, &src_iter); - - BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end)); - BUG_ON(bkey_gt(dst_iter.pos, dst_end)); - - dst_done = dst_iter.pos.offset - dst_start.offset; - new_i_size = min(dst_iter.pos.offset << 9, new_i_size); - - do { - struct bch_inode_unpacked inode_u; - struct btree_iter inode_iter = {}; - - bch2_trans_begin(trans); - - ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u, - dst_inum, BTREE_ITER_intent); - - if (!ret2 && - inode_u.bi_size < new_i_size) { - inode_u.bi_size = new_i_size; - ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - } - - bch2_trans_iter_exit(trans, &inode_iter); - } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); -err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&new_src, c); - bch2_bkey_buf_exit(&new_dst, c); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink); - - return dst_done ?: ret ?: ret2; -} - -/* fsck */ - -static int bch2_gc_write_reflink_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - size_t *idx) -{ - struct bch_fs *c = trans->c; - const __le64 *refcount = bkey_refcount_c(k); - struct printbuf buf = PRINTBUF; - struct reflink_gc *r; - int ret = 0; - - if (!refcount) - return 0; - - while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && - r->offset < k.k->p.offset) - ++*idx; - - if (!r || - r->offset != k.k->p.offset || - r->size != k.k->size) { - bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); - return -EINVAL; - } - - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), - trans, reflink_v_refcount_wrong, - "reflink key has wrong refcount:\n" - "%s\n" - "should be %u", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf), - r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - if (!r->refcount) - new->k.type = KEY_TYPE_deleted; - else - *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); - ret = bch2_trans_update(trans, iter, new, 0); - } -out: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_gc_reflink_done(struct bch_fs *c) -{ - size_t idx = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_gc_write_reflink_key(trans, &iter, k, &idx))); - c->reflink_gc_nr = 0; - return ret; -} - -int bch2_gc_reflink_start(struct bch_fs *c) -{ - c->reflink_gc_nr = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch, k, ({ - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - continue; - - struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, - c->reflink_gc_nr++, GFP_KERNEL); - if (!r) { - ret = bch_err_throw(c, ENOMEM_gc_reflink_start); - break; - } - - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - 0; - }))); - - bch_err_fn(c, ret); - return ret; -} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h deleted 
file mode 100644 index 1632780bdf18..000000000000 --- a/fs/bcachefs/reflink.h +++ /dev/null @@ -1,87 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REFLINK_H -#define _BCACHEFS_REFLINK_H - -int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ - .key_validate = bch2_reflink_p_validate, \ - .val_to_text = bch2_reflink_p_to_text, \ - .key_merge = bch2_reflink_p_merge, \ - .trigger = bch2_trigger_reflink_p, \ - .min_val_size = 16, \ -}) - -int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ - .key_validate = bch2_reflink_v_validate, \ - .val_to_text = bch2_reflink_v_to_text, \ - .swab = bch2_ptr_swab, \ - .trigger = bch2_trigger_reflink_v, \ - .min_val_size = 8, \ -}) - -int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_indirect_inline_data_to_text(struct printbuf *, - struct bch_fs *, struct bkey_s_c); -int bch2_trigger_indirect_inline_data(struct btree_trans *, - enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ - .key_validate = bch2_indirect_inline_data_validate, \ - .val_to_text = bch2_indirect_inline_data_to_text, \ - .trigger = bch2_trigger_indirect_inline_data, \ - .min_val_size = 8, \ -}) - -static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_reflink_v: - return &bkey_s_c_to_reflink_v(k).v->refcount; - case KEY_TYPE_indirect_inline_data: - return &bkey_s_c_to_indirect_inline_data(k).v->refcount; - default: - return NULL; - } -} - -static inline __le64 *bkey_refcount(struct bkey_s k) -{ - switch (k.k->type) { - case KEY_TYPE_reflink_v: - return &bkey_s_to_reflink_v(k).v->refcount; - case KEY_TYPE_indirect_inline_data: - return &bkey_s_to_indirect_inline_data(k).v->refcount; - default: - return NULL; - } -} - -struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *, - s64 *, struct bkey_s_c_reflink_p, - bool, unsigned); - -s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, - subvol_inum, u64, u64, u64, s64 *, - bool); - -int bch2_gc_reflink_done(struct bch_fs *); -int bch2_gc_reflink_start(struct bch_fs *); - -#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h deleted file mode 100644 index 92995e4f898e..000000000000 --- a/fs/bcachefs/reflink_format.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REFLINK_FORMAT_H -#define _BCACHEFS_REFLINK_FORMAT_H - -struct bch_reflink_p { - struct bch_val v; - __le64 idx_flags; - /* - * A reflink pointer might point to an indirect extent which is then - * later split (by copygc or rebalance). 
If we only pointed to part of - * the original indirect extent, and then one of the fragments is - * outside the range we point to, we'd leak a refcount: so when creating - * reflink pointers, we need to store pad values to remember the full - * range we were taking a reference on. - */ - __le32 front_pad; - __le32 back_pad; -} __packed __aligned(8); - -LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56); -LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57); -LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS, - struct bch_reflink_p, idx_flags, 57, 58); - -struct bch_reflink_v { - struct bch_val v; - __le64 refcount; - union bch_extent_entry start[0]; - __u64 _data[]; -} __packed __aligned(8); - -struct bch_indirect_inline_data { - struct bch_val v; - __le64 refcount; - u8 data[]; -}; - -#endif /* _BCACHEFS_REFLINK_FORMAT_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c deleted file mode 100644 index 8383bd7fdb3f..000000000000 --- a/fs/bcachefs/replicas.c +++ /dev/null @@ -1,918 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "buckets.h" -#include "disk_accounting.h" -#include "journal.h" -#include "replicas.h" -#include "super-io.h" - -#include <linux/sort.h> - -static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, - struct bch_replicas_cpu *); - -/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ -static int bch2_memcmp(const void *l, const void *r, const void *priv) -{ - size_t size = (size_t) priv; - return memcmp(l, r, size); -} - -/* Replicas tracking - in memory: */ - -static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - BUG_ON(!e->nr_devs); - BUG_ON(e->nr_required > 1 && - e->nr_required >= e->nr_devs); - - for (unsigned i = 0; i + 1 < e->nr_devs; i++) - BUG_ON(e->devs[i] >= e->devs[i + 1]); -#endif -} - -void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) -{ - bubble_sort(e->devs, e->nr_devs, u8_cmp); -} - -static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) -{ - eytzinger0_sort_r(r->entries, r->nr, r->entry_size, - bch2_memcmp, NULL, (void *)(size_t)r->entry_size); -} - -static void bch2_replicas_entry_v0_to_text(struct printbuf *out, - struct bch_replicas_entry_v0 *e) -{ - bch2_prt_data_type(out, e->data_type); - - prt_printf(out, ": %u [", e->nr_devs); - for (unsigned i = 0; i < e->nr_devs; i++) - prt_printf(out, i ? " %u" : "%u", e->devs[i]); - prt_printf(out, "]"); -} - -void bch2_replicas_entry_to_text(struct printbuf *out, - struct bch_replicas_entry_v1 *e) -{ - bch2_prt_data_type(out, e->data_type); - - prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); - for (unsigned i = 0; i < e->nr_devs; i++) - prt_printf(out, i ? 
" %u" : "%u", e->devs[i]); - prt_printf(out, "]"); -} - -static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r, - struct bch_sb *sb, - struct printbuf *err) -{ - if (!r->nr_devs) { - prt_printf(err, "no devices in entry "); - goto bad; - } - - if (r->nr_required > 1 && - r->nr_required >= r->nr_devs) { - prt_printf(err, "bad nr_required in entry "); - goto bad; - } - - for (unsigned i = 0; i < r->nr_devs; i++) - if (r->devs[i] != BCH_SB_MEMBER_INVALID && - !bch2_member_exists(sb, r->devs[i])) { - prt_printf(err, "invalid device %u in entry ", r->devs[i]); - goto bad; - } - - return 0; -bad: - bch2_replicas_entry_to_text(err, r); - return -BCH_ERR_invalid_replicas_entry; -} - -int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, - struct bch_fs *c, - struct printbuf *err) -{ - if (!r->nr_devs) { - prt_printf(err, "no devices in entry "); - goto bad; - } - - if (r->nr_required > 1 && - r->nr_required >= r->nr_devs) { - prt_printf(err, "bad nr_required in entry "); - goto bad; - } - - for (unsigned i = 0; i < r->nr_devs; i++) - if (r->devs[i] != BCH_SB_MEMBER_INVALID && - !bch2_dev_exists(c, r->devs[i])) { - prt_printf(err, "invalid device %u in entry ", r->devs[i]); - goto bad; - } - - return 0; -bad: - bch2_replicas_entry_to_text(err, r); - return bch_err_throw(c, invalid_replicas_entry); -} - -void bch2_cpu_replicas_to_text(struct printbuf *out, - struct bch_replicas_cpu *r) -{ - struct bch_replicas_entry_v1 *e; - bool first = true; - - for_each_cpu_replicas_entry(r, e) { - if (!first) - prt_printf(out, " "); - first = false; - - bch2_replicas_entry_to_text(out, e); - } -} - -static void extent_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry_v1 *r) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - r->nr_required = 1; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.ptr.cached) - continue; - - if (!p.has_ec) - replicas_entry_add_dev(r, p.ptr.dev); - else - r->nr_required = 0; - } -} - -static void stripe_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry_v1 *r) -{ - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - const struct bch_extent_ptr *ptr; - - r->nr_required = s.v->nr_blocks - s.v->nr_redundant; - - for (ptr = s.v->ptrs; - ptr < s.v->ptrs + s.v->nr_blocks; - ptr++) - replicas_entry_add_dev(r, ptr->dev); -} - -void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e, - struct bkey_s_c k) -{ - e->nr_devs = 0; - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - e->data_type = BCH_DATA_btree; - extent_to_replicas(k, e); - break; - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - e->data_type = BCH_DATA_user; - extent_to_replicas(k, e); - break; - case KEY_TYPE_stripe: - e->data_type = BCH_DATA_parity; - stripe_to_replicas(k, e); - break; - } - - bch2_replicas_entry_sort(e); -} - -void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, - enum bch_data_type data_type, - struct bch_devs_list devs) -{ - BUG_ON(!data_type || - data_type == BCH_DATA_sb || - data_type >= BCH_DATA_NR); - - e->data_type = data_type; - e->nr_devs = 0; - e->nr_required = 1; - - darray_for_each(devs, i) - replicas_entry_add_dev(e, *i); - - bch2_replicas_entry_sort(e); -} - -static struct bch_replicas_cpu -cpu_replicas_add_entry(struct bch_fs *c, - struct bch_replicas_cpu *old, - struct bch_replicas_entry_v1 *new_entry) -{ - struct bch_replicas_cpu new = { - .nr = old->nr + 1, - .entry_size = max_t(unsigned, old->entry_size, - 
replicas_entry_bytes(new_entry)), - }; - - new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); - if (!new.entries) - return new; - - for (unsigned i = 0; i < old->nr; i++) - memcpy(cpu_replicas_entry(&new, i), - cpu_replicas_entry(old, i), - old->entry_size); - - memcpy(cpu_replicas_entry(&new, old->nr), - new_entry, - replicas_entry_bytes(new_entry)); - - bch2_cpu_replicas_sort(&new); - return new; -} - -static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, - struct bch_replicas_entry_v1 *search) -{ - int idx, entry_size = replicas_entry_bytes(search); - - if (unlikely(entry_size > r->entry_size)) - return -1; - -#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) - idx = eytzinger0_find(r->entries, r->nr, r->entry_size, - entry_cmp, search); -#undef entry_cmp - - return idx < r->nr ? idx : -1; -} - -int bch2_replicas_entry_idx(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - bch2_replicas_entry_sort(search); - - return __replicas_entry_idx(&c->replicas, search); -} - -static bool __replicas_has_entry(struct bch_replicas_cpu *r, - struct bch_replicas_entry_v1 *search) -{ - return __replicas_entry_idx(r, search) >= 0; -} - -bool bch2_replicas_marked_locked(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - verify_replicas_entry(search); - - return !search->nr_devs || - (__replicas_has_entry(&c->replicas, search) && - (likely((!c->replicas_gc.entries)) || - __replicas_has_entry(&c->replicas_gc, search))); -} - -bool bch2_replicas_marked(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - percpu_down_read(&c->mark_lock); - bool ret = bch2_replicas_marked_locked(c, search); - percpu_up_read(&c->mark_lock); - - return ret; -} - -noinline -static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_entry_v1 *new_entry) -{ - struct bch_replicas_cpu new_r, new_gc; - int ret = 0; - - verify_replicas_entry(new_entry); - - memset(&new_r, 0, sizeof(new_r)); - memset(&new_gc, 0, sizeof(new_gc)); - - mutex_lock(&c->sb_lock); - - if (c->replicas_gc.entries && - !__replicas_has_entry(&c->replicas_gc, new_entry)) { - new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); - if (!new_gc.entries) { - ret = bch_err_throw(c, ENOMEM_cpu_replicas); - goto err; - } - } - - if (!__replicas_has_entry(&c->replicas, new_entry)) { - new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); - if (!new_r.entries) { - ret = bch_err_throw(c, ENOMEM_cpu_replicas); - goto err; - } - - ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); - if (ret) - goto err; - } - - if (!new_r.entries && - !new_gc.entries) - goto out; - - /* allocations done, now commit: */ - - if (new_r.entries) - bch2_write_super(c); - - /* don't update in memory replicas until changes are persistent */ - percpu_down_write(&c->mark_lock); - if (new_r.entries) - swap(c->replicas, new_r); - if (new_gc.entries) - swap(new_gc, c->replicas_gc); - percpu_up_write(&c->mark_lock); -out: - mutex_unlock(&c->sb_lock); - - kfree(new_r.entries); - kfree(new_gc.entries); - - return ret; -err: - bch_err_msg(c, ret, "adding replicas entry"); - goto out; -} - -int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) -{ - return likely(bch2_replicas_marked(c, r)) - ? 
0 : bch2_mark_replicas_slowpath(c, r); -} - -/* - * Old replicas_gc mechanism: only used for journal replicas entries now, should - * die at some point: - */ - -int bch2_replicas_gc_end(struct bch_fs *c, int ret) -{ - lockdep_assert_held(&c->replicas_gc_lock); - - mutex_lock(&c->sb_lock); - percpu_down_write(&c->mark_lock); - - ret = ret ?: - bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); - if (!ret) - swap(c->replicas, c->replicas_gc); - - kfree(c->replicas_gc.entries); - c->replicas_gc.entries = NULL; - - percpu_up_write(&c->mark_lock); - - if (!ret) - bch2_write_super(c); - - mutex_unlock(&c->sb_lock); - - return ret; -} - -int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) -{ - struct bch_replicas_entry_v1 *e; - unsigned i = 0; - - lockdep_assert_held(&c->replicas_gc_lock); - - mutex_lock(&c->sb_lock); - BUG_ON(c->replicas_gc.entries); - - c->replicas_gc.nr = 0; - c->replicas_gc.entry_size = 0; - - for_each_cpu_replicas_entry(&c->replicas, e) { - /* Preserve unknown data types */ - if (e->data_type >= BCH_DATA_NR || - !((1 << e->data_type) & typemask)) { - c->replicas_gc.nr++; - c->replicas_gc.entry_size = - max_t(unsigned, c->replicas_gc.entry_size, - replicas_entry_bytes(e)); - } - } - - c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, - c->replicas_gc.entry_size, - GFP_KERNEL); - if (!c->replicas_gc.entries) { - mutex_unlock(&c->sb_lock); - bch_err(c, "error allocating c->replicas_gc"); - return bch_err_throw(c, ENOMEM_replicas_gc); - } - - for_each_cpu_replicas_entry(&c->replicas, e) - if (e->data_type >= BCH_DATA_NR || - !((1 << e->data_type) & typemask)) - memcpy(cpu_replicas_entry(&c->replicas_gc, i++), - e, c->replicas_gc.entry_size); - - bch2_cpu_replicas_sort(&c->replicas_gc); - mutex_unlock(&c->sb_lock); - - return 0; -} - -/* - * New much simpler mechanism for clearing out unneeded replicas entries - drop - * replicas entries that have 0 sectors used. - * - * However, we don't track sector counts for journal usage, so this doesn't drop - * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism - * is retained for that. 
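The bch2_replicas_gc2() that follows is a copy-filter-swap: size up and allocate the replacement table outside the locks, retry from scratch if the live table was resized in the meantime, copy across only the entries still in use, then publish the new table under the write lock. A reduced sketch of the same pattern over a plain integer table — the types and pthread locking are stand-ins for the bcachefs structures and mark_lock/sb_lock:

	#include <pthread.h>
	#include <stdlib.h>
	#include <string.h>

	struct table { int *entries; unsigned nr; };

	static struct table live;
	static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

	/* Drop every zero ("no longer referenced") entry from 'live': */
	static int table_gc(void)
	{
		unsigned nr;
		int *new_entries;
	retry:
		pthread_rwlock_rdlock(&lock);
		nr = live.nr;
		pthread_rwlock_unlock(&lock);

		new_entries = calloc(nr, sizeof(*new_entries)); /* allocate unlocked */
		if (!new_entries)
			return -1;

		pthread_rwlock_wrlock(&lock);
		if (nr != live.nr) {		/* resized while we allocated: retry */
			pthread_rwlock_unlock(&lock);
			free(new_entries);
			goto retry;
		}

		unsigned dst = 0;
		for (unsigned i = 0; i < live.nr; i++)
			if (live.entries[i])
				new_entries[dst++] = live.entries[i];

		free(live.entries);
		live.entries = new_entries;
		live.nr = dst;
		pthread_rwlock_unlock(&lock);
		return 0;
	}

	int main(void)
	{
		static const int init[] = { 3, 0, 7, 0, 9 };

		live.entries = malloc(sizeof(init));
		if (!live.entries)
			return 1;
		memcpy(live.entries, init, sizeof(init));
		live.nr = 5;

		return table_gc();	/* leaves { 3, 7, 9 } */
	}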
- */ -int bch2_replicas_gc2(struct bch_fs *c) -{ - struct bch_replicas_cpu new = { 0 }; - unsigned nr; - int ret = 0; - - bch2_accounting_mem_gc(c); -retry: - nr = READ_ONCE(c->replicas.nr); - new.entry_size = READ_ONCE(c->replicas.entry_size); - new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); - if (!new.entries) { - bch_err(c, "error allocating c->replicas_gc"); - return bch_err_throw(c, ENOMEM_replicas_gc); - } - - mutex_lock(&c->sb_lock); - percpu_down_write(&c->mark_lock); - - if (nr != c->replicas.nr || - new.entry_size != c->replicas.entry_size) { - percpu_up_write(&c->mark_lock); - mutex_unlock(&c->sb_lock); - kfree(new.entries); - goto retry; - } - - for (unsigned i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - - struct disk_accounting_pos k = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; - - unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e), - "embedded variable length struct"); - - struct bpos p = disk_accounting_pos_to_bpos(&k); - - struct bch_accounting_mem *acc = &c->accounting; - bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &p) >= acc->k.nr; - - if (e->data_type == BCH_DATA_journal || !kill) - memcpy(cpu_replicas_entry(&new, new.nr++), - e, new.entry_size); - } - - bch2_cpu_replicas_sort(&new); - - ret = bch2_cpu_replicas_to_sb_replicas(c, &new); - - if (!ret) - swap(c->replicas, new); - - kfree(new.entries); - - percpu_up_write(&c->mark_lock); - - if (!ret) - bch2_write_super(c); - - mutex_unlock(&c->sb_lock); - - return ret; -} - -/* Replicas tracking - superblock: */ - -static int -__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, - struct bch_replicas_cpu *cpu_r) -{ - struct bch_replicas_entry_v1 *e, *dst; - unsigned nr = 0, entry_size = 0, idx = 0; - - for_each_replicas_entry(sb_r, e) { - entry_size = max_t(unsigned, entry_size, - replicas_entry_bytes(e)); - nr++; - } - - cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); - if (!cpu_r->entries) - return -BCH_ERR_ENOMEM_cpu_replicas; - - cpu_r->nr = nr; - cpu_r->entry_size = entry_size; - - for_each_replicas_entry(sb_r, e) { - dst = cpu_replicas_entry(cpu_r, idx++); - memcpy(dst, e, replicas_entry_bytes(e)); - bch2_replicas_entry_sort(dst); - } - - return 0; -} - -static int -__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, - struct bch_replicas_cpu *cpu_r) -{ - struct bch_replicas_entry_v0 *e; - unsigned nr = 0, entry_size = 0, idx = 0; - - for_each_replicas_entry(sb_r, e) { - entry_size = max_t(unsigned, entry_size, - replicas_entry_bytes(e)); - nr++; - } - - entry_size += sizeof(struct bch_replicas_entry_v1) - - sizeof(struct bch_replicas_entry_v0); - - cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); - if (!cpu_r->entries) - return -BCH_ERR_ENOMEM_cpu_replicas; - - cpu_r->nr = nr; - cpu_r->entry_size = entry_size; - - for_each_replicas_entry(sb_r, e) { - struct bch_replicas_entry_v1 *dst = - cpu_replicas_entry(cpu_r, idx++); - - dst->data_type = e->data_type; - dst->nr_devs = e->nr_devs; - dst->nr_required = 1; - memcpy(dst->devs, e->devs, e->nr_devs); - bch2_replicas_entry_sort(dst); - } - - return 0; -} - -int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) -{ - struct bch_sb_field_replicas *sb_v1; - struct bch_sb_field_replicas_v0 *sb_v0; - struct bch_replicas_cpu new_r = { 0, 0, NULL }; - int ret = 0; - - if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas))) - ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); - else if 
((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0))) - ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); - if (ret) - return ret; - - bch2_cpu_replicas_sort(&new_r); - - percpu_down_write(&c->mark_lock); - swap(c->replicas, new_r); - percpu_up_write(&c->mark_lock); - - kfree(new_r.entries); - - return 0; -} - -static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, - struct bch_replicas_cpu *r) -{ - struct bch_sb_field_replicas_v0 *sb_r; - struct bch_replicas_entry_v0 *dst; - struct bch_replicas_entry_v1 *src; - size_t bytes; - - bytes = sizeof(struct bch_sb_field_replicas); - - for_each_cpu_replicas_entry(r, src) - bytes += replicas_entry_bytes(src) - 1; - - sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, - DIV_ROUND_UP(bytes, sizeof(u64))); - if (!sb_r) - return bch_err_throw(c, ENOSPC_sb_replicas); - - bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); - sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0); - - memset(&sb_r->entries, 0, - vstruct_end(&sb_r->field) - - (void *) &sb_r->entries); - - dst = sb_r->entries; - for_each_cpu_replicas_entry(r, src) { - dst->data_type = src->data_type; - dst->nr_devs = src->nr_devs; - memcpy(dst->devs, src->devs, src->nr_devs); - - dst = replicas_entry_next(dst); - - BUG_ON((void *) dst > vstruct_end(&sb_r->field)); - } - - return 0; -} - -static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, - struct bch_replicas_cpu *r) -{ - struct bch_sb_field_replicas *sb_r; - struct bch_replicas_entry_v1 *dst, *src; - bool need_v1 = false; - size_t bytes; - - bytes = sizeof(struct bch_sb_field_replicas); - - for_each_cpu_replicas_entry(r, src) { - bytes += replicas_entry_bytes(src); - if (src->nr_required != 1) - need_v1 = true; - } - - if (!need_v1) - return bch2_cpu_replicas_to_sb_replicas_v0(c, r); - - sb_r = bch2_sb_field_resize(&c->disk_sb, replicas, - DIV_ROUND_UP(bytes, sizeof(u64))); - if (!sb_r) - return bch_err_throw(c, ENOSPC_sb_replicas); - - bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); - sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas); - - memset(&sb_r->entries, 0, - vstruct_end(&sb_r->field) - - (void *) &sb_r->entries); - - dst = sb_r->entries; - for_each_cpu_replicas_entry(r, src) { - memcpy(dst, src, replicas_entry_bytes(src)); - - dst = replicas_entry_next(dst); - - BUG_ON((void *) dst > vstruct_end(&sb_r->field)); - } - - return 0; -} - -static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, - struct bch_sb *sb, - struct printbuf *err) -{ - unsigned i; - - sort_r(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - bch2_memcmp, NULL, - (void *)(size_t)cpu_r->entry_size); - - for (i = 0; i < cpu_r->nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(cpu_r, i); - - int ret = bch2_replicas_entry_sb_validate(e, sb, err); - if (ret) - return ret; - - if (i + 1 < cpu_r->nr) { - struct bch_replicas_entry_v1 *n = - cpu_replicas_entry(cpu_r, i + 1); - - BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); - - if (!memcmp(e, n, cpu_r->entry_size)) { - prt_printf(err, "duplicate replicas entry "); - bch2_replicas_entry_to_text(err, e); - return -BCH_ERR_invalid_sb_replicas; - } - } - } - - return 0; -} - -static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); - struct bch_replicas_cpu cpu_r; - int ret; - - ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r); - if (ret) - return ret; - - ret = 
bch2_cpu_replicas_validate(&cpu_r, sb, err); - kfree(cpu_r.entries); - return ret; -} - -static void bch2_sb_replicas_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_replicas *r = field_to_type(f, replicas); - struct bch_replicas_entry_v1 *e; - bool first = true; - - for_each_replicas_entry(r, e) { - if (!first) - prt_printf(out, " "); - first = false; - - bch2_replicas_entry_to_text(out, e); - } - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_replicas = { - .validate = bch2_sb_replicas_validate, - .to_text = bch2_sb_replicas_to_text, -}; - -static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); - struct bch_replicas_cpu cpu_r; - int ret; - - ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r); - if (ret) - return ret; - - ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); - kfree(cpu_r.entries); - return ret; -} - -static void bch2_sb_replicas_v0_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); - struct bch_replicas_entry_v0 *e; - bool first = true; - - for_each_replicas_entry(sb_r, e) { - if (!first) - prt_printf(out, " "); - first = false; - - bch2_replicas_entry_v0_to_text(out, e); - } - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { - .validate = bch2_sb_replicas_v0_validate, - .to_text = bch2_sb_replicas_v0_to_text, -}; - -/* Query replicas: */ - -bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, - unsigned flags, bool print) -{ - struct bch_replicas_entry_v1 *e; - bool ret = true; - - percpu_down_read(&c->mark_lock); - for_each_cpu_replicas_entry(&c->replicas, e) { - unsigned nr_online = 0, nr_failed = 0, dflags = 0; - bool metadata = e->data_type < BCH_DATA_user; - - if (e->data_type == BCH_DATA_cached) - continue; - - scoped_guard(rcu) - for (unsigned i = 0; i < e->nr_devs; i++) { - if (e->devs[i] == BCH_SB_MEMBER_INVALID) { - nr_failed++; - continue; - } - - nr_online += test_bit(e->devs[i], devs.d); - - struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); - nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; - } - - if (nr_online + nr_failed == e->nr_devs) - continue; - - if (nr_online < e->nr_required) - dflags |= metadata - ? BCH_FORCE_IF_METADATA_LOST - : BCH_FORCE_IF_DATA_LOST; - - if (nr_online < e->nr_devs) - dflags |= metadata - ? 
BCH_FORCE_IF_METADATA_DEGRADED - : BCH_FORCE_IF_DATA_DEGRADED; - - if (dflags & ~flags) { - if (print) { - struct printbuf buf = PRINTBUF; - - bch2_replicas_entry_to_text(&buf, e); - bch_err(c, "insufficient devices online (%u) for replicas entry %s", - nr_online, buf.buf); - printbuf_exit(&buf); - } - ret = false; - break; - } - - } - percpu_up_read(&c->mark_lock); - - return ret; -} - -unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) -{ - struct bch_sb_field_replicas *replicas; - struct bch_sb_field_replicas_v0 *replicas_v0; - unsigned data_has = 0; - - replicas = bch2_sb_field_get(sb, replicas); - replicas_v0 = bch2_sb_field_get(sb, replicas_v0); - - if (replicas) { - struct bch_replicas_entry_v1 *r; - - for_each_replicas_entry(replicas, r) { - if (r->data_type >= sizeof(data_has) * 8) - continue; - - for (unsigned i = 0; i < r->nr_devs; i++) - if (r->devs[i] == dev) - data_has |= 1 << r->data_type; - } - - } else if (replicas_v0) { - struct bch_replicas_entry_v0 *r; - - for_each_replicas_entry_v0(replicas_v0, r) { - if (r->data_type >= sizeof(data_has) * 8) - continue; - - for (unsigned i = 0; i < r->nr_devs; i++) - if (r->devs[i] == dev) - data_has |= 1 << r->data_type; - } - } - - - return data_has; -} - -unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) -{ - mutex_lock(&c->sb_lock); - unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); - mutex_unlock(&c->sb_lock); - - return ret; -} - -void bch2_fs_replicas_exit(struct bch_fs *c) -{ - kfree(c->replicas.entries); - kfree(c->replicas_gc.entries); -} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h deleted file mode 100644 index 5aba2c1ce133..000000000000 --- a/fs/bcachefs/replicas.h +++ /dev/null @@ -1,83 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REPLICAS_H -#define _BCACHEFS_REPLICAS_H - -#include "bkey.h" -#include "eytzinger.h" -#include "replicas_types.h" - -void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); -void bch2_replicas_entry_to_text(struct printbuf *, - struct bch_replicas_entry_v1 *); -int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *, - struct bch_fs *, struct printbuf *); -void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); - -static inline struct bch_replicas_entry_v1 * -cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) -{ - return (void *) r->entries + r->entry_size * i; -} - -int bch2_replicas_entry_idx(struct bch_fs *, - struct bch_replicas_entry_v1 *); - -void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *, - enum bch_data_type, - struct bch_devs_list); - -bool bch2_replicas_marked_locked(struct bch_fs *, - struct bch_replicas_entry_v1 *); -bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *); -int bch2_mark_replicas(struct bch_fs *, - struct bch_replicas_entry_v1 *); - -void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c); - -static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e, - unsigned dev) -{ - e->data_type = BCH_DATA_cached; - e->nr_devs = 1; - e->nr_required = 1; - e->devs[0] = dev; -} - -bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, - unsigned, bool); - -unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); -unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); - -int bch2_replicas_gc_end(struct bch_fs *, int); -int bch2_replicas_gc_start(struct bch_fs *, unsigned); -int bch2_replicas_gc2(struct bch_fs *); - -#define for_each_cpu_replicas_entry(_r, _i) \ - 
for (_i = (_r)->entries; \ - (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ - _i = (void *) (_i) + (_r)->entry_size) - -/* iterate over superblock replicas - used by userspace tools: */ - -#define replicas_entry_next(_i) \ - ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) - -#define for_each_replicas_entry(_r, _i) \ - for (_i = (_r)->entries; \ - (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ - (_i) = replicas_entry_next(_i)) - -#define for_each_replicas_entry_v0(_r, _i) \ - for (_i = (_r)->entries; \ - (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ - (_i) = replicas_entry_next(_i)) - -int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); - -extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; -extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; - -void bch2_fs_replicas_exit(struct bch_fs *); - -#endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h deleted file mode 100644 index b7eff904acdb..000000000000 --- a/fs/bcachefs/replicas_format.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REPLICAS_FORMAT_H -#define _BCACHEFS_REPLICAS_FORMAT_H - -struct bch_replicas_entry_v0 { - __u8 data_type; - __u8 nr_devs; - __u8 devs[] __counted_by(nr_devs); -} __packed; - -struct bch_sb_field_replicas_v0 { - struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[]; -} __packed __aligned(8); - -struct bch_replicas_entry_v1 { - __u8 data_type; - __u8 nr_devs; - __u8 nr_required; - __u8 devs[] __counted_by(nr_devs); -} __packed; - -struct bch_sb_field_replicas { - struct bch_sb_field field; - struct bch_replicas_entry_v1 entries[]; -} __packed __aligned(8); - -#define replicas_entry_bytes(_i) \ - (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) - -#define replicas_entry_add_dev(e, d) ({ \ - (e)->nr_devs++; \ - (e)->devs[(e)->nr_devs - 1] = (d); \ -}) - -#endif /* _BCACHEFS_REPLICAS_FORMAT_H */ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h deleted file mode 100644 index fed71c861fe7..000000000000 --- a/fs/bcachefs/replicas_types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REPLICAS_TYPES_H -#define _BCACHEFS_REPLICAS_TYPES_H - -struct bch_replicas_cpu { - unsigned nr; - unsigned entry_size; - struct bch_replicas_entry_v1 *entries; -}; - -#endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c deleted file mode 100644 index 59c8770e4a0e..000000000000 --- a/fs/bcachefs/sb-clean.c +++ /dev/null @@ -1,340 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_update_interior.h" -#include "buckets.h" -#include "error.h" -#include "journal_io.h" -#include "replicas.h" -#include "sb-clean.h" -#include "super-io.h" - -/* - * BCH_SB_FIELD_clean: - * - * Btree roots, and a few other things, are recovered from the journal after an - * unclean shutdown - but after a clean shutdown, to avoid having to read the - * journal, we can store them in the superblock. 
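The validation helpers below walk this clean section with the usual vstruct idiom: each journal entry records its own length, the cursor advances by that length, and every step first checks that the entry's claimed end stays inside the containing field. A self-contained miniature of that traversal — the record layout is invented for illustration, not the real jset_entry:

	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical variable-length record; u64s counts 8-byte payload words. */
	struct rec {
		uint16_t u64s;
		uint16_t type;
		uint32_t pad;
		uint64_t data[];
	};

	static inline struct rec *rec_next(struct rec *r)
	{
		return (struct rec *) (r->data + r->u64s);
	}

	static int walk(void *buf, size_t bytes)
	{
		char *end = (char *) buf + bytes;
		struct rec *r = buf;

		while ((char *) r + sizeof(*r) <= end) {
			struct rec *next = rec_next(r);

			if ((char *) next > end) {	/* length overruns the section */
				fprintf(stderr, "record overruns end\n");
				return -1;
			}
			printf("record type %u, %u words\n", r->type, r->u64s);
			r = next;
		}
		return 0;
	}

	int main(void)
	{
		uint64_t buf[5] = { 0 };
		struct rec *r = (struct rec *) buf;

		r->u64s = 2;		/* record 1: header + 2 payload words */
		r->type = 1;
		rec_next(r)->u64s = 1;	/* record 2: header + 1 payload word */
		rec_next(r)->type = 2;

		return walk(buf, sizeof(buf));
	}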
- * - * bch_sb_field_clean simply contains a list of journal entries, stored exactly - * as they would be in the journal: - */ - -int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, - int write) -{ - struct bkey_validate_context from = { - .flags = write, - .from = BKEY_VALIDATE_superblock, - }; - struct jset_entry *entry; - int ret; - - for (entry = clean->start; - entry < (struct jset_entry *) vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - if (vstruct_end(entry) > vstruct_end(&clean->field)) { - bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu", - le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s), - (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field)); - bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun); - return -BCH_ERR_fsck_repair_unimplemented; - } - - ret = bch2_journal_entry_validate(c, NULL, entry, - le16_to_cpu(c->disk_sb.sb->version), - BCH_SB_BIG_ENDIAN(c->disk_sb.sb), - from); - if (ret) - return ret; - } - - return 0; -} - -static struct bkey_i *btree_root_find(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct jset *j, - enum btree_id id, unsigned *level) -{ - struct bkey_i *k; - struct jset_entry *entry, *start, *end; - - if (clean) { - start = clean->start; - end = vstruct_end(&clean->field); - } else { - start = j->start; - end = vstruct_last(j); - } - - for (entry = start; entry < end; entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_btree_root && - entry->btree_id == id) - goto found; - - return NULL; -found: - if (!entry->u64s) - return ERR_PTR(-EINVAL); - - k = entry->start; - *level = entry->level; - return k; -} - -int bch2_verify_superblock_clean(struct bch_fs *c, - struct bch_sb_field_clean **cleanp, - struct jset *j) -{ - unsigned i; - struct bch_sb_field_clean *clean = *cleanp; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - int ret = 0; - - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, - sb_clean_journal_seq_mismatch, - "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", - le64_to_cpu(clean->journal_seq), - le64_to_cpu(j->seq))) { - kfree(clean); - *cleanp = NULL; - return 0; - } - - for (i = 0; i < BTREE_ID_NR; i++) { - struct bkey_i *k1, *k2; - unsigned l1 = 0, l2 = 0; - - k1 = btree_root_find(c, clean, NULL, i, &l1); - k2 = btree_root_find(c, NULL, j, i, &l2); - - if (!k1 && !k2) - continue; - - printbuf_reset(&buf1); - printbuf_reset(&buf2); - - if (k1) - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); - else - prt_printf(&buf1, "(none)"); - - if (k2) - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); - else - prt_printf(&buf2, "(none)"); - - mustfix_fsck_err_on(!k1 || !k2 || - IS_ERR(k1) || - IS_ERR(k2) || - k1->k.u64s != k2->k.u64s || - memcmp(k1, k2, bkey_bytes(&k1->k)) || - l1 != l2, c, - sb_clean_btree_root_mismatch, - "superblock btree root %u doesn't match journal after clean shutdown\n" - "sb: l=%u %s\n" - "journal: l=%u %s\n", i, - l1, buf1.buf, - l2, buf2.buf); - } -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - -struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *clean, *sb_clean; - int ret; - - mutex_lock(&c->sb_lock); - sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean); - - if (fsck_err_on(!sb_clean, c, - sb_clean_missing, - "superblock marked clean but clean section not present")) { - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = 
false; - mutex_unlock(&c->sb_lock); - return ERR_PTR(-BCH_ERR_invalid_sb_clean); - } - - clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), - GFP_KERNEL); - if (!clean) { - mutex_unlock(&c->sb_lock); - return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); - } - - ret = bch2_sb_clean_validate_late(c, clean, READ); - if (ret) { - kfree(clean); - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); - } - - mutex_unlock(&c->sb_lock); - - return clean; -fsck_err: - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); -} - -void bch2_journal_super_entries_add_common(struct bch_fs *c, - struct jset_entry **end, - u64 journal_seq) -{ - { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_key_version; - u->v = cpu_to_le64(atomic64_read(&c->key_version)); - } - - for (unsigned i = 0; i < 2; i++) { - struct jset_entry_clock *clock = - container_of(jset_entry_init(end, sizeof(*clock)), - struct jset_entry_clock, entry); - - clock->entry.type = BCH_JSET_ENTRY_clock; - clock->rw = i; - clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); - } -} - -static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_clean *clean = field_to_type(f, clean); - - if (vstruct_bytes(&clean->field) < sizeof(*clean)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&clean->field), sizeof(*clean)); - return -BCH_ERR_invalid_sb_clean; - } - - for (struct jset_entry *entry = clean->start; - entry != vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) { - prt_str(err, "entry type "); - bch2_prt_jset_entry_type(err, entry->type); - prt_str(err, " overruns end of section"); - return -BCH_ERR_invalid_sb_clean; - } - } - - return 0; -} - -static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_clean *clean = field_to_type(f, clean); - struct jset_entry *entry; - - prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags)); - prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq)); - - for (entry = clean->start; - entry != vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) - break; - - if (entry->type == BCH_JSET_ENTRY_btree_keys && - !entry->u64s) - continue; - - bch2_journal_entry_to_text(out, NULL, entry); - prt_newline(out); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_clean = { - .validate = bch2_sb_clean_validate, - .to_text = bch2_sb_clean_to_text, -}; - -int bch2_fs_mark_dirty(struct bch_fs *c) -{ - int ret; - - /* - * Unconditionally write superblock, to verify it hasn't changed before - * we go rw: - */ - - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); - - ret = bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; -} - -void bch2_fs_mark_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *sb_clean; - struct jset_entry *entry; - unsigned u64s; - int ret; - - mutex_lock(&c->sb_lock); - if (BCH_SB_CLEAN(c->disk_sb.sb)) - goto out; - - SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << 
BCH_COMPAT_alloc_metadata);
-	c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
-	c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
-
-	u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
-
-	sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s);
-	if (!sb_clean) {
-		bch_err(c, "error resizing superblock while setting filesystem clean");
-		goto out;
-	}
-
-	sb_clean->flags = 0;
-	sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq));
-
-	/* Trying to catch outstanding bug: */
-	BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
-
-	entry = sb_clean->start;
-	bch2_journal_super_entries_add_common(c, &entry, 0);
-	entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
-	BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
-
-	memset(entry, 0,
-	       vstruct_end(&sb_clean->field) - (void *) entry);
-
-	/*
-	 * this should be in the write path, and we should be validating every
-	 * superblock section:
-	 */
-	ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
-	if (ret) {
-		bch_err(c, "error marking filesystem clean: validate error");
-		goto out;
-	}
-
-	bch2_journal_pos_from_member_info_set(c);
-
-	bch2_write_super(c);
-out:
-	mutex_unlock(&c->sb_lock);
-}
diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h
deleted file mode 100644
index 71caef281239..000000000000
--- a/fs/bcachefs/sb-clean.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_CLEAN_H
-#define _BCACHEFS_SB_CLEAN_H
-
-int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
-int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **,
-	struct jset *);
-struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *);
-void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
-
-int bch2_fs_mark_dirty(struct bch_fs *);
-void bch2_fs_mark_clean(struct bch_fs *);
-
-#endif /* _BCACHEFS_SB_CLEAN_H */
diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c
deleted file mode 100644
index 2b4b8445d418..000000000000
--- a/fs/bcachefs/sb-counters.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "super-io.h"
-#include "sb-counters.h"
-
-/* BCH_SB_FIELD_counters */
-
-static const u8 counters_to_stable_map[] = {
-#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n,
-	BCH_PERSISTENT_COUNTERS()
-#undef x
-};
-
-const char * const bch2_counter_names[] = {
-#define x(t, n, ...)
(#t), - BCH_PERSISTENT_COUNTERS() -#undef x - NULL -}; - -static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) -{ - if (!ctrs) - return 0; - - return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; -} - -static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - return 0; -} - -static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_counters *ctrs = field_to_type(f, counters); - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { - unsigned stable = counters_to_stable_map[i]; - if (stable < nr) - prt_printf(out, "%s \t%llu\n", - bch2_counter_names[i], - le64_to_cpu(ctrs->d[stable])); - } -} - -int bch2_sb_counters_to_cpu(struct bch_fs *c) -{ - struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) - c->counters_on_mount[i] = 0; - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { - unsigned stable = counters_to_stable_map[i]; - if (stable < nr) { - u64 v = le64_to_cpu(ctrs->d[stable]); - percpu_u64_set(&c->counters[i], v); - c->counters_on_mount[i] = v; - } - } - - return 0; -} - -int bch2_sb_counters_from_cpu(struct bch_fs *c) -{ - struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); - struct bch_sb_field_counters *ret; - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - - if (nr < BCH_COUNTER_NR) { - ret = bch2_sb_field_resize(&c->disk_sb, counters, - sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); - if (ret) { - ctrs = ret; - nr = bch2_sb_counter_nr_entries(ctrs); - } - } - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { - unsigned stable = counters_to_stable_map[i]; - if (stable < nr) - ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i])); - } - - return 0; -} - -void bch2_fs_counters_exit(struct bch_fs *c) -{ - free_percpu(c->counters); -} - -int bch2_fs_counters_init(struct bch_fs *c) -{ - c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); - if (!c->counters) - return -BCH_ERR_ENOMEM_fs_counters_init; - - return bch2_sb_counters_to_cpu(c); -} - -const struct bch_sb_field_ops bch_sb_field_ops_counters = { - .validate = bch2_sb_counters_validate, - .to_text = bch2_sb_counters_to_text, -}; - -#ifndef NO_BCACHEFS_CHARDEV -long bch2_ioctl_query_counters(struct bch_fs *c, - struct bch_ioctl_query_counters __user *user_arg) -{ - struct bch_ioctl_query_counters arg; - int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)); - if (ret) - return ret; - - if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) || - arg.pad) - return -EINVAL; - - arg.nr = min(arg.nr, BCH_COUNTER_NR); - ret = put_user(arg.nr, &user_arg->nr); - if (ret) - return ret; - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { - unsigned stable = counters_to_stable_map[i]; - - if (stable < arg.nr) { - u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT) - ? 
percpu_u64_get(&c->counters[i]) - : c->counters_on_mount[i]; - - ret = put_user(v, &user_arg->d[stable]); - if (ret) - return ret; - } - } - - return 0; -} -#endif diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h deleted file mode 100644 index a4329ad8dd1b..000000000000 --- a/fs/bcachefs/sb-counters.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_COUNTERS_H -#define _BCACHEFS_SB_COUNTERS_H - -#include "bcachefs.h" -#include "super-io.h" - -int bch2_sb_counters_to_cpu(struct bch_fs *); -int bch2_sb_counters_from_cpu(struct bch_fs *); - -void bch2_fs_counters_exit(struct bch_fs *); -int bch2_fs_counters_init(struct bch_fs *); - -extern const char * const bch2_counter_names[]; -extern const struct bch_sb_field_ops bch_sb_field_ops_counters; - -long bch2_ioctl_query_counters(struct bch_fs *, - struct bch_ioctl_query_counters __user *); - -#endif // _BCACHEFS_SB_COUNTERS_H diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h deleted file mode 100644 index b868702a431a..000000000000 --- a/fs/bcachefs/sb-counters_format.h +++ /dev/null @@ -1,117 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H -#define _BCACHEFS_SB_COUNTERS_FORMAT_H - -enum counters_flags { - TYPE_COUNTER = BIT(0), /* event counters */ - TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */ -}; - -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0, TYPE_SECTORS) \ - x(io_read_inline, 80, TYPE_SECTORS) \ - x(io_read_hole, 81, TYPE_SECTORS) \ - x(io_read_promote, 30, TYPE_COUNTER) \ - x(io_read_bounce, 31, TYPE_COUNTER) \ - x(io_read_split, 33, TYPE_COUNTER) \ - x(io_read_reuse_race, 34, TYPE_COUNTER) \ - x(io_read_retry, 32, TYPE_COUNTER) \ - x(io_read_fail_and_poison, 82, TYPE_COUNTER) \ - x(io_write, 1, TYPE_SECTORS) \ - x(io_move, 2, TYPE_SECTORS) \ - x(io_move_read, 35, TYPE_SECTORS) \ - x(io_move_write, 36, TYPE_SECTORS) \ - x(io_move_finish, 37, TYPE_SECTORS) \ - x(io_move_fail, 38, TYPE_COUNTER) \ - x(io_move_write_fail, 82, TYPE_COUNTER) \ - x(io_move_start_fail, 39, TYPE_COUNTER) \ - x(io_move_created_rebalance, 83, TYPE_COUNTER) \ - x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \ - x(bucket_invalidate, 3, TYPE_COUNTER) \ - x(bucket_discard, 4, TYPE_COUNTER) \ - x(bucket_discard_fast, 79, TYPE_COUNTER) \ - x(bucket_alloc, 5, TYPE_COUNTER) \ - x(bucket_alloc_fail, 6, TYPE_COUNTER) \ - x(btree_cache_scan, 7, TYPE_COUNTER) \ - x(btree_cache_reap, 8, TYPE_COUNTER) \ - x(btree_cache_cannibalize, 9, TYPE_COUNTER) \ - x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \ - x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \ - x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \ - x(btree_node_write, 13, TYPE_COUNTER) \ - x(btree_node_read, 14, TYPE_COUNTER) \ - x(btree_node_compact, 15, TYPE_COUNTER) \ - x(btree_node_merge, 16, TYPE_COUNTER) \ - x(btree_node_split, 17, TYPE_COUNTER) \ - x(btree_node_rewrite, 18, TYPE_COUNTER) \ - x(btree_node_alloc, 19, TYPE_COUNTER) \ - x(btree_node_free, 20, TYPE_COUNTER) \ - x(btree_node_set_root, 21, TYPE_COUNTER) \ - x(btree_path_relock_fail, 22, TYPE_COUNTER) \ - x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \ - x(btree_reserve_get_fail, 24, TYPE_COUNTER) \ - x(journal_entry_full, 25, TYPE_COUNTER) \ - x(journal_full, 26, TYPE_COUNTER) \ - x(journal_reclaim_finish, 27, TYPE_COUNTER) \ - x(journal_reclaim_start, 28, TYPE_COUNTER) \ - x(journal_write, 29, TYPE_COUNTER) \ - x(copygc, 40, TYPE_COUNTER) \ - x(copygc_wait, 41, TYPE_COUNTER) \ - 
x(gc_gens_end, 42, TYPE_COUNTER) \ - x(gc_gens_start, 43, TYPE_COUNTER) \ - x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \ - x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \ - x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \ - x(trans_restart_fault_inject, 47, TYPE_COUNTER) \ - x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \ - x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \ - x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \ - x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \ - x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \ - x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \ - x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \ - x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \ - x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \ - x(trans_restart_relock, 57, TYPE_COUNTER) \ - x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \ - x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \ - x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \ - x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \ - x(trans_restart_relock_path, 62, TYPE_COUNTER) \ - x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \ - x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \ - x(trans_restart_traverse, 65, TYPE_COUNTER) \ - x(trans_restart_upgrade, 66, TYPE_COUNTER) \ - x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \ - x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \ - x(trans_restart_injected, 69, TYPE_COUNTER) \ - x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \ - x(trans_traverse_all, 71, TYPE_COUNTER) \ - x(transaction_commit, 72, TYPE_COUNTER) \ - x(write_super, 73, TYPE_COUNTER) \ - x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \ - x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \ - x(trans_restart_split_race, 76, TYPE_COUNTER) \ - x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \ - x(write_buffer_flush_sync, 78, TYPE_COUNTER) - -enum bch_persistent_counters { -#define x(t, n, ...) BCH_COUNTER_##t, - BCH_PERSISTENT_COUNTERS() -#undef x - BCH_COUNTER_NR -}; - -enum bch_persistent_counters_stable { -#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n, - BCH_PERSISTENT_COUNTERS() -#undef x - BCH_COUNTER_STABLE_NR -}; - -struct bch_sb_field_counters { - struct bch_sb_field field; - __le64 d[]; -}; - -#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c deleted file mode 100644 index 1506d05e0665..000000000000 --- a/fs/bcachefs/sb-downgrade.c +++ /dev/null @@ -1,457 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* - * Superblock section that contains a list of recovery passes to run when - * downgrading past a given version - */ - -#include "bcachefs.h" -#include "darray.h" -#include "recovery_passes.h" -#include "sb-downgrade.h" -#include "sb-errors.h" -#include "super-io.h" - -#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63) - -/* - * Upgrade, downgrade tables - run certain recovery passes, fix certain errors - * - * x(version, recovery_passes, errors...) 
- */ -#define UPGRADE_TABLE() \ - x(snapshot_2, \ - RECOVERY_PASS_ALL_FSCK, \ - BCH_FSCK_ERR_subvol_root_wrong_bi_subvol, \ - BCH_FSCK_ERR_subvol_not_master_and_not_snapshot) \ - x(backpointers, \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_v3, \ - RECOVERY_PASS_ALL_FSCK) \ - x(unwritten_extents, \ - RECOVERY_PASS_ALL_FSCK) \ - x(bucket_gens, \ - BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ - RECOVERY_PASS_ALL_FSCK) \ - x(lru_v2, \ - RECOVERY_PASS_ALL_FSCK) \ - x(fragmentation_lru, \ - RECOVERY_PASS_ALL_FSCK) \ - x(no_bps_in_alloc_keys, \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot_trees, \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot_skiplists, \ - BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \ - BCH_FSCK_ERR_snapshot_bad_depth, \ - BCH_FSCK_ERR_snapshot_bad_skiplist) \ - x(deleted_inodes, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ - BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \ - x(rebalance_work, \ - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \ - x(subvolume_fs_parent, \ - BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ - BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ - x(btree_subvolume_children, \ - BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ - BCH_FSCK_ERR_subvol_children_not_set) \ - x(mi_btree_bitmap, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_btree_bitmap_not_marked) \ - x(disk_accounting_v2, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_bkey_version_in_future, \ - BCH_FSCK_ERR_dev_usage_buckets_wrong, \ - BCH_FSCK_ERR_dev_usage_sectors_wrong, \ - BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_accounting_mismatch) \ - x(disk_accounting_v3, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_bkey_version_in_future, \ - BCH_FSCK_ERR_dev_usage_buckets_wrong, \ - BCH_FSCK_ERR_dev_usage_sectors_wrong, \ - BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_accounting_mismatch, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \ - BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) \ - x(disk_accounting_inum, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) \ - x(rebalance_work_acct_fix, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) \ - x(inode_has_child_snapshots, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ - BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \ - x(backpointer_bucket_gen, \ - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_backpointer_to_missing_ptr, \ - BCH_FSCK_ERR_ptr_to_missing_backpointer) \ - x(disk_accounting_big_endian, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) \ - x(cached_backpointers, \ - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_ptr_to_missing_backpointer) \ - x(stripe_backpointers, \ - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_ptr_to_missing_backpointer) \ - x(inode_has_case_insensitive, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ - BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \ - BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set) - -#define DOWNGRADE_TABLE() \ - x(bucket_stripe_sectors, \ - 0) \ - x(disk_accounting_v2, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_dev_usage_buckets_wrong, \ - BCH_FSCK_ERR_dev_usage_sectors_wrong, \ - 
BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_fs_usage_hidden_wrong, \ - BCH_FSCK_ERR_fs_usage_btree_wrong, \ - BCH_FSCK_ERR_fs_usage_data_wrong, \ - BCH_FSCK_ERR_fs_usage_cached_wrong, \ - BCH_FSCK_ERR_fs_usage_reserved_wrong, \ - BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ - BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ - BCH_FSCK_ERR_fs_usage_replicas_wrong, \ - BCH_FSCK_ERR_bkey_version_in_future) \ - x(disk_accounting_v3, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_dev_usage_buckets_wrong, \ - BCH_FSCK_ERR_dev_usage_sectors_wrong, \ - BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_fs_usage_hidden_wrong, \ - BCH_FSCK_ERR_fs_usage_btree_wrong, \ - BCH_FSCK_ERR_fs_usage_data_wrong, \ - BCH_FSCK_ERR_fs_usage_cached_wrong, \ - BCH_FSCK_ERR_fs_usage_reserved_wrong, \ - BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ - BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ - BCH_FSCK_ERR_fs_usage_replicas_wrong, \ - BCH_FSCK_ERR_accounting_replicas_not_marked, \ - BCH_FSCK_ERR_bkey_version_in_future) \ - x(rebalance_work_acct_fix, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) \ - x(backpointer_bucket_gen, \ - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \ - BCH_FSCK_ERR_backpointer_to_missing_ptr, \ - BCH_FSCK_ERR_ptr_to_missing_backpointer) \ - x(disk_accounting_big_endian, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) - -struct upgrade_downgrade_entry { - u64 recovery_passes; - u16 version; - u16 nr_errors; - const u16 *errors; -}; - -#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ }; -UPGRADE_TABLE() -#undef x - -static const struct upgrade_downgrade_entry upgrade_table[] = { -#define x(ver, passes, ...) { \ - .recovery_passes = passes, \ - .version = bcachefs_metadata_version_##ver,\ - .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \ - .errors = upgrade_##ver##_errors, \ -}, -UPGRADE_TABLE() -#undef x -}; - -static int have_stripes(struct bch_fs *c) -{ - if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b)) - return 0; - - return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b); -} - -int bch2_sb_set_upgrade_extra(struct bch_fs *c) -{ - unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; - unsigned new_version = c->sb.version; - bool write_sb = false; - int ret = 0; - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - if (old_version < bcachefs_metadata_version_bucket_stripe_sectors && - new_version >= bcachefs_metadata_version_bucket_stripe_sectors && - (ret = have_stripes(c) > 0)) { - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent); - write_sb = true; - } - - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret < 0 ? 
ret : 0; -} - -void bch2_sb_set_upgrade(struct bch_fs *c, - unsigned old_version, - unsigned new_version) -{ - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - for (const struct upgrade_downgrade_entry *i = upgrade_table; - i < upgrade_table + ARRAY_SIZE(upgrade_table); - i++) - if (i->version > old_version && i->version <= new_version) { - u64 passes = i->recovery_passes; - - if (passes & RECOVERY_PASS_ALL_FSCK) - passes |= bch2_fsck_recovery_passes(); - passes &= ~RECOVERY_PASS_ALL_FSCK; - - ext->recovery_passes_required[0] |= - cpu_to_le64(bch2_recovery_passes_to_stable(passes)); - - for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++) - __set_bit_le64(*e, ext->errors_silent); - } -} - -#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ }; -DOWNGRADE_TABLE() -#undef x - -static const struct upgrade_downgrade_entry downgrade_table[] = { -#define x(ver, passes, ...) { \ - .recovery_passes = passes, \ - .version = bcachefs_metadata_version_##ver,\ - .nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \ - .errors = downgrade_##ver##_errors, \ -}, -DOWNGRADE_TABLE() -#undef x -}; - -static int downgrade_table_extra(struct bch_fs *c, darray_char *table) -{ - unsigned dst_offset = table->nr; - struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table); - unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); - int ret = 0; - - unsigned nr_errors = le16_to_cpu(dst->nr_errors); - - switch (le16_to_cpu(dst->version)) { - case bcachefs_metadata_version_bucket_stripe_sectors: - if (have_stripes(c)) { - bytes += sizeof(dst->errors[0]) * 2; - - ret = darray_make_room(table, bytes); - if (ret) - return ret; - - dst = (void *) &table->data[dst_offset]; - dst->nr_errors = cpu_to_le16(nr_errors + 1); - - /* open coded __set_bit_le64, as dst is packed and - * dst->recovery_passes is misaligned */ - unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations; - dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64)); - - dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong); - } - break; - } - - return ret; -} - -static inline const struct bch_sb_field_downgrade_entry * -downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e) -{ - return (void *) &e->errors[le16_to_cpu(e->nr_errors)]; -} - -#define for_each_downgrade_entry(_d, _i) \ - for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \ - (void *) _i < vstruct_end(&(_d)->field) && \ - (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \ - (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \ - _i = downgrade_entry_next_c(_i)) - -static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_downgrade *e = field_to_type(f, downgrade); - - for (const struct bch_sb_field_downgrade_entry *i = e->entries; - (void *) i < vstruct_end(&e->field); - i = downgrade_entry_next_c(i)) { - /* - * Careful: sb_field_downgrade_entry is only 2 byte aligned, but - * section sizes are 8 byte aligned - an empty entry spanning - * the end of the section is allowed (and ignored): - */ - if ((void *) &i->errors[0] > vstruct_end(&e->field)) - break; - - if (flags & BCH_VALIDATE_write && - (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) { - prt_printf(err, "downgrade entry overruns end of superblock section"); - return 
-BCH_ERR_invalid_sb_downgrade; - } - - if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) != - BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) { - prt_printf(err, "downgrade entry with mismatched major version (%u != %u)", - BCH_VERSION_MAJOR(le16_to_cpu(i->version)), - BCH_VERSION_MAJOR(le16_to_cpu(sb->version))); - return -BCH_ERR_invalid_sb_downgrade; - } - } - - return 0; -} - -static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_downgrade *e = field_to_type(f, downgrade); - - if (out->nr_tabstops <= 1) - printbuf_tabstop_push(out, 16); - - for_each_downgrade_entry(e, i) { - prt_str(out, "version:\t"); - bch2_version_to_text(out, le16_to_cpu(i->version)); - prt_newline(out); - - prt_str(out, "recovery passes:\t"); - prt_bitflags(out, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0]))); - prt_newline(out); - - prt_str(out, "errors:\t"); - bool first = true; - for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { - if (!first) - prt_char(out, ','); - first = false; - bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j])); - } - prt_newline(out); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_downgrade = { - .validate = bch2_sb_downgrade_validate, - .to_text = bch2_sb_downgrade_to_text, -}; - -int bch2_sb_downgrade_update(struct bch_fs *c) -{ - if (!test_bit(BCH_FS_btree_running, &c->flags)) - return 0; - - darray_char table = {}; - int ret = 0; - - for (const struct upgrade_downgrade_entry *src = downgrade_table; - src < downgrade_table + ARRAY_SIZE(downgrade_table); - src++) { - if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) - continue; - - if (src->version < c->sb.version_incompat) - continue; - - struct bch_sb_field_downgrade_entry *dst; - unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors; - - ret = darray_make_room(&table, bytes); - if (ret) - goto out; - - dst = (void *) &darray_top(table); - dst->version = cpu_to_le16(src->version); - dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes)); - dst->recovery_passes[1] = 0; - dst->nr_errors = cpu_to_le16(src->nr_errors); - for (unsigned i = 0; i < src->nr_errors; i++) - dst->errors[i] = cpu_to_le16(src->errors[i]); - - ret = downgrade_table_extra(c, &table); - if (ret) - goto out; - - if (!dst->recovery_passes[0] && - !dst->recovery_passes[1] && - !dst->nr_errors) - continue; - - table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); - } - - struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade); - - unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64)); - - if (d && le32_to_cpu(d->field.u64s) > sb_u64s) - goto out; - - d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s); - if (!d) { - ret = bch_err_throw(c, ENOSPC_sb_downgrade); - goto out; - } - - memcpy(d->entries, table.data, table.nr); - memset_u64s_tail(d->entries, 0, table.nr); -out: - darray_exit(&table); - return ret; -} - -void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_minor) -{ - struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade); - if (!d) - return; - - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - for_each_downgrade_entry(d, i) { - unsigned minor = BCH_VERSION_MINOR(le16_to_cpu(i->version)); - if (new_minor < minor && minor <= old_minor) { - ext->recovery_passes_required[0] |= 
i->recovery_passes[0]; - ext->recovery_passes_required[1] |= i->recovery_passes[1]; - - for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { - unsigned e = le16_to_cpu(i->errors[j]); - if (e < BCH_FSCK_ERR_MAX) - __set_bit(e, c->sb.errors_silent); - if (e < sizeof(ext->errors_silent) * 8) - __set_bit_le64(e, ext->errors_silent); - } - } - } -} diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h deleted file mode 100644 index 095b7cc9bb47..000000000000 --- a/fs/bcachefs/sb-downgrade.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_DOWNGRADE_H -#define _BCACHEFS_SB_DOWNGRADE_H - -extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade; - -int bch2_sb_downgrade_update(struct bch_fs *); -void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned); -int bch2_sb_set_upgrade_extra(struct bch_fs *); -void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned); - -#endif /* _BCACHEFS_SB_DOWNGRADE_H */ diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h deleted file mode 100644 index cffd932be3ec..000000000000 --- a/fs/bcachefs/sb-downgrade_format.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H -#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H - -struct bch_sb_field_downgrade_entry { - __le16 version; - __le64 recovery_passes[2]; - __le16 nr_errors; - __le16 errors[] __counted_by(nr_errors); -} __packed __aligned(2); - -struct bch_sb_field_downgrade { - struct bch_sb_field field; - struct bch_sb_field_downgrade_entry entries[]; -}; - -#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c deleted file mode 100644 index 48853efdc105..000000000000 --- a/fs/bcachefs/sb-errors.c +++ /dev/null @@ -1,198 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "sb-errors.h" -#include "super-io.h" - -const char * const bch2_sb_error_strs[] = { -#define x(t, n, ...) 
[n] = #t, - BCH_SB_ERRS() -#undef x -}; - -void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) -{ - if (id < BCH_FSCK_ERR_MAX) - prt_str(out, bch2_sb_error_strs[id]); - else - prt_printf(out, "(unknown error %u)", id); -} - -static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e) -{ - return bch2_sb_field_nr_entries(e); -} - -static inline unsigned bch2_sb_field_errors_u64s(unsigned nr) -{ - return (sizeof(struct bch_sb_field_errors) + - sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64); -} - -static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_errors *e = field_to_type(f, errors); - unsigned i, nr = bch2_sb_field_errors_nr_entries(e); - - for (i = 0; i < nr; i++) { - if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) { - prt_printf(err, "entry with count 0 (id "); - bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); - prt_printf(err, ")"); - return -BCH_ERR_invalid_sb_errors; - } - - if (i + 1 < nr && - BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >= - BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) { - prt_printf(err, "entries out of order"); - return -BCH_ERR_invalid_sb_errors; - } - } - - return 0; -} - -static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_errors *e = field_to_type(f, errors); - unsigned i, nr = bch2_sb_field_errors_nr_entries(e); - - if (out->nr_tabstops <= 1) - printbuf_tabstop_push(out, 16); - - for (i = 0; i < nr; i++) { - bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); - prt_tab(out); - prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i])); - prt_tab(out); - bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time)); - prt_newline(out); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_errors = { - .validate = bch2_sb_errors_validate, - .to_text = bch2_sb_errors_to_text, -}; - -void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c) -{ - if (out->nr_tabstops < 1) - printbuf_tabstop_push(out, 48); - if (out->nr_tabstops < 2) - printbuf_tabstop_push(out, 8); - if (out->nr_tabstops < 3) - printbuf_tabstop_push(out, 16); - - guard(mutex)(&c->fsck_error_counts_lock); - - bch_sb_errors_cpu *e = &c->fsck_error_counts; - darray_for_each(*e, i) { - bch2_sb_error_id_to_text(out, i->id); - prt_tab(out); - prt_u64(out, i->nr); - prt_tab(out); - bch2_prt_datetime(out, i->last_error_time); - prt_newline(out); - } -} - -void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) -{ - bch_sb_errors_cpu *e = &c->fsck_error_counts; - struct bch_sb_error_entry_cpu n = { - .id = err, - .nr = 1, - .last_error_time = ktime_get_real_seconds() - }; - unsigned i; - - mutex_lock(&c->fsck_error_counts_lock); - for (i = 0; i < e->nr; i++) { - if (err == e->data[i].id) { - e->data[i].nr++; - e->data[i].last_error_time = n.last_error_time; - goto out; - } - if (err < e->data[i].id) - break; - } - - if (darray_make_room(e, 1)) - goto out; - - darray_insert_item(e, i, n); -out: - mutex_unlock(&c->fsck_error_counts_lock); -} - -void bch2_sb_errors_from_cpu(struct bch_fs *c) -{ - bch_sb_errors_cpu *src = &c->fsck_error_counts; - struct bch_sb_field_errors *dst; - unsigned i; - - mutex_lock(&c->fsck_error_counts_lock); - - dst = bch2_sb_field_resize(&c->disk_sb, errors, - bch2_sb_field_errors_u64s(src->nr)); - - if (!dst) - goto err; - - for (i = 0; i < src->nr; i++) { - 
SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); - SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); - dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); - } - -err: - mutex_unlock(&c->fsck_error_counts_lock); -} - -static int bch2_sb_errors_to_cpu(struct bch_fs *c) -{ - struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors); - bch_sb_errors_cpu *dst = &c->fsck_error_counts; - unsigned i, nr = bch2_sb_field_errors_nr_entries(src); - int ret; - - if (!nr) - return 0; - - mutex_lock(&c->fsck_error_counts_lock); - ret = darray_make_room(dst, nr); - if (ret) - goto err; - - dst->nr = nr; - - for (i = 0; i < nr; i++) { - dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]); - dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]); - dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time); - } -err: - mutex_unlock(&c->fsck_error_counts_lock); - - return ret; -} - -void bch2_fs_sb_errors_exit(struct bch_fs *c) -{ - darray_exit(&c->fsck_error_counts); -} - -void bch2_fs_sb_errors_init_early(struct bch_fs *c) -{ - mutex_init(&c->fsck_error_counts_lock); - darray_init(&c->fsck_error_counts); -} - -int bch2_fs_sb_errors_init(struct bch_fs *c) -{ - return bch2_sb_errors_to_cpu(c); -} diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h deleted file mode 100644 index e86267264692..000000000000 --- a/fs/bcachefs/sb-errors.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_ERRORS_H -#define _BCACHEFS_SB_ERRORS_H - -#include "sb-errors_types.h" - -extern const char * const bch2_sb_error_strs[]; - -void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id); -void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *); - -extern const struct bch_sb_field_ops bch_sb_field_ops_errors; - -void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id); - -void bch2_sb_errors_from_cpu(struct bch_fs *); - -void bch2_fs_sb_errors_exit(struct bch_fs *); -void bch2_fs_sb_errors_init_early(struct bch_fs *); -int bch2_fs_sb_errors_init(struct bch_fs *); - -#endif /* _BCACHEFS_SB_ERRORS_H */ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h deleted file mode 100644 index d154b7651d28..000000000000 --- a/fs/bcachefs/sb-errors_format.h +++ /dev/null @@ -1,353 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H -#define _BCACHEFS_SB_ERRORS_FORMAT_H - -enum bch_fsck_flags { - FSCK_CAN_FIX = BIT(0), - FSCK_CAN_IGNORE = BIT(1), - FSCK_AUTOFIX = BIT(2), - FSCK_ERR_NO_LOG = BIT(3), -}; - -#define BCH_SB_ERRS() \ - x(clean_but_journal_not_empty, 0, 0) \ - x(dirty_but_no_journal_entries, 1, 0) \ - x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \ - x(sb_clean_journal_seq_mismatch, 3, 0) \ - x(sb_clean_btree_root_mismatch, 4, 0) \ - x(sb_clean_missing, 5, 0) \ - x(jset_unsupported_version, 6, 0) \ - x(jset_unknown_csum, 7, 0) \ - x(jset_last_seq_newer_than_seq, 8, 0) \ - x(jset_past_bucket_end, 9, 0) \ - x(jset_seq_blacklisted, 10, 0) \ - x(journal_entries_missing, 11, 0) \ - x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \ - x(journal_entry_past_jset_end, 13, 0) \ - x(journal_entry_replicas_data_mismatch, 14, 0) \ - x(journal_entry_bkey_u64s_0, 15, 0) \ - x(journal_entry_bkey_past_end, 16, 0) \ - x(journal_entry_bkey_bad_format, 17, 0) \ - x(journal_entry_bkey_invalid, 18, 0) \ - x(journal_entry_btree_root_bad_size, 19, 0) \ - x(journal_entry_blacklist_bad_size, 20, 0) \ - 
x(journal_entry_blacklist_v2_bad_size, 21, 0) \ - x(journal_entry_blacklist_v2_start_past_end, 22, 0) \ - x(journal_entry_usage_bad_size, 23, 0) \ - x(journal_entry_data_usage_bad_size, 24, 0) \ - x(journal_entry_clock_bad_size, 25, 0) \ - x(journal_entry_clock_bad_rw, 26, 0) \ - x(journal_entry_dev_usage_bad_size, 27, 0) \ - x(journal_entry_dev_usage_bad_dev, 28, 0) \ - x(journal_entry_dev_usage_bad_pad, 29, 0) \ - x(btree_node_unreadable, 30, 0) \ - x(btree_node_fault_injected, 31, 0) \ - x(btree_node_bad_magic, 32, 0) \ - x(btree_node_bad_seq, 33, 0) \ - x(btree_node_unsupported_version, 34, 0) \ - x(btree_node_bset_older_than_sb_min, 35, 0) \ - x(btree_node_bset_newer_than_sb, 36, 0) \ - x(btree_node_data_missing, 37, FSCK_AUTOFIX) \ - x(btree_node_bset_after_end, 38, 0) \ - x(btree_node_replicas_sectors_written_mismatch, 39, 0) \ - x(btree_node_replicas_data_mismatch, 40, 0) \ - x(bset_unknown_csum, 41, 0) \ - x(bset_bad_csum, 42, 0) \ - x(bset_past_end_of_btree_node, 43, 0) \ - x(bset_wrong_sector_offset, 44, 0) \ - x(bset_empty, 45, 0) \ - x(bset_bad_seq, 46, 0) \ - x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \ - x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ - x(btree_node_bad_btree, 49, 0) \ - x(btree_node_bad_level, 50, 0) \ - x(btree_node_bad_min_key, 51, 0) \ - x(btree_node_bad_max_key, 52, 0) \ - x(btree_node_bad_format, 53, 0) \ - x(btree_node_bkey_past_bset_end, 54, 0) \ - x(btree_node_bkey_bad_format, 55, 0) \ - x(btree_node_bad_bkey, 56, 0) \ - x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \ - x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \ - x(btree_root_read_error, 59, FSCK_AUTOFIX) \ - x(btree_root_bad_min_key, 60, 0) \ - x(btree_root_bad_max_key, 61, 0) \ - x(btree_node_read_error, 62, FSCK_AUTOFIX) \ - x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \ - x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \ - x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \ - x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \ - x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \ - x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ - x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ - x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ - x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \ - x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \ - x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \ - x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \ - x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \ - x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \ - x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \ - x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \ - x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \ - x(bkey_version_in_future, 80, 0) \ - x(bkey_u64s_too_small, 81, 0) \ - x(bkey_invalid_type_for_btree, 82, 0) \ - x(bkey_extent_size_zero, 83, 0) \ - x(bkey_extent_size_greater_than_offset, 84, 0) \ - x(bkey_size_nonzero, 85, 0) \ - x(bkey_snapshot_nonzero, 86, 0) \ - x(bkey_snapshot_zero, 87, 0) \ - x(bkey_at_pos_max, 88, 0) \ - x(bkey_before_start_of_btree_node, 89, 0) \ - x(bkey_after_end_of_btree_node, 90, 0) \ - x(bkey_val_size_nonzero, 91, 0) \ - x(bkey_val_size_too_small, 92, 0) \ - x(alloc_v1_val_size_bad, 93, 0) \ - x(alloc_v2_unpack_error, 94, 0) \ - x(alloc_v3_unpack_error, 95, 0) \ - x(alloc_v4_val_size_bad, 96, 0) \ - x(alloc_v4_backpointers_start_bad, 97, 0) \ - x(alloc_key_data_type_bad, 98, 0) \ - x(alloc_key_empty_but_have_data, 99, 0) \ - x(alloc_key_dirty_sectors_0, 100, 0) \ - x(alloc_key_data_type_inconsistency, 101, 0) \ - 
x(alloc_key_to_missing_dev_bucket, 102, 0) \ - x(alloc_key_cached_inconsistency, 103, 0) \ - x(alloc_key_cached_but_read_time_zero, 104, FSCK_AUTOFIX) \ - x(alloc_key_to_missing_lru_entry, 105, FSCK_AUTOFIX) \ - x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \ - x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \ - x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \ - x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \ - x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ - x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ - x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \ - x(bucket_sector_count_overflow, 112, 0) \ - x(bucket_metadata_type_mismatch, 113, 0) \ - x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \ - x(freespace_key_wrong, 115, FSCK_AUTOFIX) \ - x(freespace_hole_missing, 116, FSCK_AUTOFIX) \ - x(bucket_gens_val_size_bad, 117, 0) \ - x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \ - x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \ - x(bucket_gens_to_invalid_dev, 120, FSCK_AUTOFIX) \ - x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \ - x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \ - x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ - x(need_discard_freespace_key_bad, 124, FSCK_AUTOFIX) \ - x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ - x(backpointer_bucket_offset_wrong, 125, 0) \ - x(backpointer_level_bad, 294, 0) \ - x(backpointer_dev_bad, 297, 0) \ - x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \ - x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \ - x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \ - x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \ - x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \ - x(lru_entry_bad, 131, FSCK_AUTOFIX) \ - x(btree_ptr_val_too_big, 132, 0) \ - x(btree_ptr_v2_val_too_big, 133, 0) \ - x(btree_ptr_has_non_ptr, 134, 0) \ - x(extent_ptrs_invalid_entry, 135, 0) \ - x(extent_ptrs_no_ptrs, 136, 0) \ - x(extent_ptrs_too_many_ptrs, 137, 0) \ - x(extent_ptrs_redundant_crc, 138, 0) \ - x(extent_ptrs_redundant_stripe, 139, 0) \ - x(extent_ptrs_unwritten, 140, 0) \ - x(extent_ptrs_written_and_unwritten, 141, 0) \ - x(ptr_to_invalid_device, 142, 0) \ - x(ptr_to_duplicate_device, 143, 0) \ - x(ptr_after_last_bucket, 144, 0) \ - x(ptr_before_first_bucket, 145, 0) \ - x(ptr_spans_multiple_buckets, 146, 0) \ - x(ptr_to_missing_backpointer, 147, FSCK_AUTOFIX) \ - x(ptr_to_missing_alloc_key, 148, FSCK_AUTOFIX) \ - x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \ - x(ptr_to_missing_stripe, 150, 0) \ - x(ptr_to_incorrect_stripe, 151, 0) \ - x(ptr_gen_newer_than_bucket_gen, 152, FSCK_AUTOFIX) \ - x(ptr_too_stale, 153, 0) \ - x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ - x(ptr_bucket_data_type_mismatch, 155, 0) \ - x(ptr_cached_and_erasure_coded, 156, 0) \ - x(ptr_crc_uncompressed_size_too_small, 157, 0) \ - x(ptr_crc_uncompressed_size_too_big, 161, 0) \ - x(ptr_crc_uncompressed_size_mismatch, 300, 0) \ - x(ptr_crc_csum_type_unknown, 158, 0) \ - x(ptr_crc_compression_type_unknown, 159, 0) \ - x(ptr_crc_redundant, 160, 0) \ - x(ptr_crc_nonce_mismatch, 162, 0) \ - x(ptr_stripe_redundant, 163, 0) \ - x(extent_flags_not_at_start, 306, 0) \ - x(reservation_key_nr_replicas_invalid, 164, 0) \ - x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ - x(reflink_v_pos_bad, 292, 0) \ - x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \ - x(reflink_refcount_underflow, 293, 0) \ - x(stripe_pos_bad, 167, 0) \ - x(stripe_val_size_bad, 168, 0) \ - x(stripe_csum_granularity_bad, 290, 0) \ - x(stripe_sector_count_wrong, 169, 0) \ 
- x(snapshot_tree_pos_bad, 170, 0) \ - x(snapshot_tree_to_missing_snapshot, 171, 0) \ - x(snapshot_tree_to_missing_subvol, 172, 0) \ - x(snapshot_tree_to_wrong_subvol, 173, 0) \ - x(snapshot_tree_to_snapshot_subvol, 174, 0) \ - x(snapshot_pos_bad, 175, 0) \ - x(snapshot_parent_bad, 176, 0) \ - x(snapshot_children_not_normalized, 177, 0) \ - x(snapshot_child_duplicate, 178, 0) \ - x(snapshot_child_bad, 179, 0) \ - x(snapshot_skiplist_not_normalized, 180, 0) \ - x(snapshot_skiplist_bad, 181, 0) \ - x(snapshot_should_not_have_subvol, 182, 0) \ - x(snapshot_to_bad_snapshot_tree, 183, FSCK_AUTOFIX) \ - x(snapshot_bad_depth, 184, 0) \ - x(snapshot_bad_skiplist, 185, 0) \ - x(subvol_pos_bad, 186, 0) \ - x(subvol_not_master_and_not_snapshot, 187, FSCK_AUTOFIX) \ - x(subvol_to_missing_root, 188, 0) \ - x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \ - x(bkey_in_missing_snapshot, 190, 0) \ - x(bkey_in_deleted_snapshot, 315, FSCK_AUTOFIX) \ - x(inode_pos_inode_nonzero, 191, 0) \ - x(inode_pos_blockdev_range, 192, 0) \ - x(inode_alloc_cursor_inode_bad, 301, 0) \ - x(inode_unpack_error, 193, 0) \ - x(inode_str_hash_invalid, 194, 0) \ - x(inode_v3_fields_start_bad, 195, 0) \ - x(inode_snapshot_mismatch, 196, 0) \ - x(snapshot_key_missing_inode_snapshot, 314, FSCK_AUTOFIX) \ - x(inode_unlinked_but_clean, 197, 0) \ - x(inode_unlinked_but_nlink_nonzero, 198, 0) \ - x(inode_unlinked_and_not_open, 281, 0) \ - x(inode_unlinked_but_has_dirent, 285, 0) \ - x(inode_checksum_type_invalid, 199, 0) \ - x(inode_compression_type_invalid, 200, 0) \ - x(inode_subvol_root_but_not_dir, 201, 0) \ - x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \ - x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \ - x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \ - x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \ - x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \ - x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \ - x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \ - x(inode_dir_has_nonzero_i_size, 319, FSCK_AUTOFIX) \ - x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ - x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ - x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ - x(inode_has_child_snapshots_wrong, 287, FSCK_AUTOFIX) \ - x(inode_unreachable, 210, FSCK_AUTOFIX) \ - x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ - x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \ - x(inode_has_case_insensitive_not_set, 316, FSCK_AUTOFIX) \ - x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \ - x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \ - x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \ - x(vfs_bad_inode_rm, 320, 0) \ - x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ - x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ - x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ - x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ - x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \ - x(extent_overlapping, 215, 0) \ - x(key_in_missing_inode, 216, FSCK_AUTOFIX) \ - x(key_in_wrong_inode_type, 217, 0) \ - x(extent_past_end_of_inode, 218, FSCK_AUTOFIX) \ - x(dirent_empty_name, 219, 0) \ - x(dirent_val_too_big, 220, 0) \ - x(dirent_name_too_long, 221, 0) \ - x(dirent_name_embedded_nul, 222, 0) \ - x(dirent_name_dot_or_dotdot, 223, 0) \ - x(dirent_name_has_slash, 224, 0) \ - x(dirent_d_type_wrong, 225, FSCK_AUTOFIX) \ - x(inode_bi_parent_wrong, 226, 0) \ - x(dirent_in_missing_dir_inode, 227, 0) \ - x(dirent_in_non_dir_inode, 228, 0) \ - x(dirent_to_missing_inode, 229, FSCK_AUTOFIX) \ - 
x(dirent_to_overwritten_inode, 302, 0) \ - x(dirent_to_missing_subvol, 230, 0) \ - x(dirent_to_itself, 231, 0) \ - x(dirent_casefold_mismatch, 318, FSCK_AUTOFIX) \ - x(quota_type_invalid, 232, 0) \ - x(xattr_val_size_too_small, 233, 0) \ - x(xattr_val_size_too_big, 234, 0) \ - x(xattr_invalid_type, 235, 0) \ - x(xattr_name_invalid_chars, 236, 0) \ - x(xattr_in_missing_inode, 237, 0) \ - x(root_subvol_missing, 238, 0) \ - x(root_dir_missing, 239, 0) \ - x(root_inode_not_dir, 240, 0) \ - x(dir_loop, 241, 0) \ - x(hash_table_key_duplicate, 242, FSCK_AUTOFIX) \ - x(hash_table_key_wrong_offset, 243, FSCK_AUTOFIX) \ - x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \ - x(reflink_p_front_pad_bad, 245, 0) \ - x(journal_entry_dup_same_device, 246, 0) \ - x(inode_bi_subvol_missing, 247, 0) \ - x(inode_bi_subvol_wrong, 248, 0) \ - x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \ - x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \ - x(inode_bi_parent_nonzero, 251, 0) \ - x(dirent_to_missing_parent_subvol, 252, 0) \ - x(dirent_not_visible_in_parent_subvol, 253, 0) \ - x(subvol_fs_path_parent_wrong, 254, 0) \ - x(subvol_root_fs_path_parent_nonzero, 255, 0) \ - x(subvol_children_not_set, 256, 0) \ - x(subvol_children_bad, 257, 0) \ - x(subvol_loop, 258, 0) \ - x(subvol_unreachable, 259, FSCK_AUTOFIX) \ - x(btree_node_bkey_bad_u64s, 260, 0) \ - x(btree_node_topology_empty_interior_node, 261, 0) \ - x(btree_ptr_v2_min_key_bad, 262, 0) \ - x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ - x(snapshot_node_missing, 264, FSCK_AUTOFIX) \ - x(dup_backpointer_to_bad_csum_extent, 265, 0) \ - x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ - x(sb_clean_entry_overrun, 267, 0) \ - x(btree_ptr_v2_written_0, 268, 0) \ - x(subvol_snapshot_bad, 269, 0) \ - x(subvol_inode_bad, 270, 0) \ - x(subvol_missing, 308, FSCK_AUTOFIX) \ - x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ - x(accounting_mismatch, 272, FSCK_AUTOFIX) \ - x(accounting_replicas_not_marked, 273, 0) \ - x(accounting_to_invalid_device, 289, 0) \ - x(invalid_btree_id, 274, FSCK_AUTOFIX) \ - x(alloc_key_io_time_bad, 275, 0) \ - x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ - x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ - x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ - x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ - x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ - x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ - x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \ - x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ - x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ - x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ - x(dirent_cf_name_too_big, 304, 0) \ - x(dirent_stray_data_after_cf_name, 305, 0) \ - x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ - x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 321, 0) - -enum bch_sb_error_id { -#define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, - BCH_SB_ERRS() -#undef x -}; - -struct bch_sb_field_errors { - struct bch_sb_field field; - struct bch_sb_field_error_entry { - __le64 v; - __le64 last_error_time; - } entries[]; -}; - -LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); -LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); - -#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h deleted file mode 100644 index 40325239c3b0..000000000000 --- a/fs/bcachefs/sb-errors_types.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_ERRORS_TYPES_H -#define _BCACHEFS_SB_ERRORS_TYPES_H - -#include "darray.h" - -struct bch_sb_error_entry_cpu { - u64 id:16, - nr:48; - u64 last_error_time; -}; - -typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu; - -#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */ diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c deleted file mode 100644 index 6245e342a8a8..000000000000 --- a/fs/bcachefs/sb-members.c +++ /dev/null @@ -1,606 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "disk_groups.h" -#include "error.h" -#include "opts.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-members.h" -#include "super-io.h" - -int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) -{ - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev); - bch2_bkey_val_to_text(&buf, c, k); - - bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); - - int ret = bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_allocations, 0); - - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return ret; -} - -void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev) -{ - if (dev != BCH_SB_MEMBER_INVALID) - bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); -} - -void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket) -{ - bch2_fs_inconsistent(ca->fs, - "pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)", - bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets); -} - -#define x(t, n, ...) 
[n] = #t, -static const char * const bch2_iops_measurements[] = { - BCH_IOPS_MEASUREMENTS() - NULL -}; - -char * const bch2_member_error_strs[] = { - BCH_MEMBER_ERROR_TYPES() - NULL -}; -#undef x - -/* Code for bch_sb_field_members_v1: */ - -struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i) -{ - return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); -} - -static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i) -{ - struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i); - memset(&ret, 0, sizeof(ret)); - memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret))); - return ret; -} - -static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i) -{ - return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES); -} - -static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i) -{ - struct bch_member ret, *p = members_v1_get_mut(mi, i); - memset(&ret, 0, sizeof(ret)); - memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret))); - return ret; -} - -struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i) -{ - struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2); - if (mi2) - return members_v2_get(mi2, i); - struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1); - return members_v1_get(mi1, i); -} - -static int sb_members_v2_resize_entries(struct bch_fs *c) -{ - struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - - if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) { - unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) * - c->disk_sb.sb->nr_devices), 8); - - mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); - if (!mi) - return bch_err_throw(c, ENOSPC_sb_members_v2); - - for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { - void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); - memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes)); - memset(dst + le16_to_cpu(mi->member_bytes), - 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes))); - } - mi->member_bytes = cpu_to_le16(sizeof(struct bch_member)); - } - return 0; -} - -int bch2_sb_members_v2_init(struct bch_fs *c) -{ - struct bch_sb_field_members_v1 *mi1; - struct bch_sb_field_members_v2 *mi2; - - if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) { - mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2, - DIV_ROUND_UP(sizeof(*mi2) + - sizeof(struct bch_member) * c->sb.nr_devices, - sizeof(u64))); - mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1); - memcpy(&mi2->_members[0], &mi1->_members[0], - BCH_MEMBER_V1_BYTES * c->sb.nr_devices); - memset(&mi2->pad[0], 0, sizeof(mi2->pad)); - mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES); - } - - return sb_members_v2_resize_entries(c); -} - -int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) -{ - struct bch_sb_field_members_v1 *mi1; - struct bch_sb_field_members_v2 *mi2; - - if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) { - bch2_sb_field_resize(disk_sb, members_v1, 0); - return 0; - } - - mi1 = bch2_sb_field_resize(disk_sb, members_v1, - DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * - disk_sb->sb->nr_devices, sizeof(u64))); - if (!mi1) - return -BCH_ERR_ENOSPC_sb_members; - - mi2 = bch2_sb_field_get(disk_sb->sb, members_v2); - - for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++) - memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, 
i), BCH_MEMBER_V1_BYTES); - - return 0; -} - -static int validate_member(struct printbuf *err, - struct bch_member m, - struct bch_sb *sb, - int i) -{ - if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) { - prt_printf(err, "device %u: too many buckets (got %llu, max %u)", - i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX); - return -BCH_ERR_invalid_sb_members; - } - - if (le64_to_cpu(m.nbuckets) - - le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) { - prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", - i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS); - return -BCH_ERR_invalid_sb_members; - } - - if (le16_to_cpu(m.bucket_size) < - le16_to_cpu(sb->block_size)) { - prt_printf(err, "device %u: bucket size %u smaller than block size %u", - i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size)); - return -BCH_ERR_invalid_sb_members; - } - - if (le16_to_cpu(m.bucket_size) < - BCH_SB_BTREE_NODE_SIZE(sb)) { - prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", - i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); - return -BCH_ERR_invalid_sb_members; - } - - if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) { - prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift); - return -BCH_ERR_invalid_sb_members; - } - - if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) && - sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) { - prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i); - return -BCH_ERR_invalid_sb_members; - } - - return 0; -} - -static void member_to_text(struct printbuf *out, - struct bch_member m, - struct bch_sb_field_disk_groups *gi, - struct bch_sb *sb, - int i) -{ - unsigned data_have = bch2_sb_dev_has_data(sb, i); - u64 bucket_size = le16_to_cpu(m.bucket_size); - u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; - - if (!bch2_member_alive(&m)) - return; - - prt_printf(out, "Device:\t%u\n", i); - - printbuf_indent_add(out, 2); - - prt_printf(out, "Label:\t"); - if (BCH_MEMBER_GROUP(&m)) - bch2_disk_path_to_text_sb(out, sb, - BCH_MEMBER_GROUP(&m) - 1); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "UUID:\t"); - pr_uuid(out, m.uuid.b); - prt_newline(out); - - prt_printf(out, "Size:\t"); - prt_units_u64(out, device_size << 9); - prt_newline(out); - - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) - prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i])); - - for (unsigned i = 0; i < BCH_IOPS_NR; i++) - prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i])); - - prt_printf(out, "Bucket size:\t"); - prt_units_u64(out, bucket_size << 9); - prt_newline(out); - - prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket)); - prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets)); - - prt_printf(out, "Last mount:\t"); - if (m.last_mount) - bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); - else - prt_printf(out, "(never)"); - prt_newline(out); - - prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq)); - - prt_printf(out, "State:\t%s\n", - BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR - ? 
bch2_member_states[BCH_MEMBER_STATE(&m)] - : "unknown"); - - prt_printf(out, "Data allowed:\t"); - if (BCH_MEMBER_DATA_ALLOWED(&m)) - prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "Has data:\t"); - if (data_have) - prt_bitflags(out, __bch2_data_types, data_have); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "Btree allocated bitmap blocksize:\t"); - if (m.btree_bitmap_shift < 64) - prt_units_u64(out, 1ULL << m.btree_bitmap_shift); - else - prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift); - prt_newline(out); - - prt_printf(out, "Btree allocated bitmap:\t"); - bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64); - prt_newline(out); - - prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); - - prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); - prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); - prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m)); - - printbuf_indent_sub(out, 2); -} - -static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); - unsigned i; - - if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) { - prt_printf(err, "too many devices for section size"); - return -BCH_ERR_invalid_sb_members; - } - - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member m = members_v1_get(mi, i); - - int ret = validate_member(err, m, sb, i); - if (ret) - return ret; - } - - return 0; -} - -static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); - struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - - if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { - prt_printf(out, "field ends before start of entries"); - return; - } - - unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]); - if (nr != sb->nr_devices) - prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); - - for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) - member_to_text(out, members_v1_get(mi, i), gi, sb, i); -} - -const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = { - .validate = bch2_sb_members_v1_validate, - .to_text = bch2_sb_members_v1_to_text, -}; - -static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); - struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - - if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { - prt_printf(out, "field ends before start of entries"); - return; - } - - if (!le16_to_cpu(mi->member_bytes)) { - prt_printf(out, "member_bytes 0"); - return; - } - - unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes); - if (nr != sb->nr_devices) - prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); - - /* - * We call to_text() on superblock sections that haven't passed - * validate, so we can't trust sb->nr_devices. 
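The defensive pattern in the to_text() helpers above — deriving the entry count from the section's byte size instead of trusting sb->nr_devices — generalizes to any variable-stride superblock section. A minimal standalone sketch (the helper name and parameters are hypothetical, not part of the original file):

/* Sketch: compute a trustworthy entry count from the section's byte size
 * rather than from sb->nr_devices, clamping to whichever is smaller. */
static unsigned members_nr_from_size(size_t field_bytes, size_t hdr_bytes,
				     size_t stride, unsigned nr_claimed)
{
	if (field_bytes < hdr_bytes || !stride)
		return 0;

	unsigned nr = (field_bytes - hdr_bytes) / stride;

	/* print at most the entries that are actually present */
	return min(nr, nr_claimed);
}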
- */ - - for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) - member_to_text(out, members_v2_get(mi, i), gi, sb, i); -} - -static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); - size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) - - (void *) mi; - - if (mi_bytes > vstruct_bytes(&mi->field)) { - prt_printf(err, "section too small (%zu > %zu)", - mi_bytes, vstruct_bytes(&mi->field)); - return -BCH_ERR_invalid_sb_members; - } - - for (unsigned i = 0; i < sb->nr_devices; i++) { - int ret = validate_member(err, members_v2_get(mi, i), sb, i); - if (ret) - return ret; - } - - return 0; -} - -const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = { - .validate = bch2_sb_members_v2_validate, - .to_text = bch2_sb_members_v2_to_text, -}; - -void bch2_sb_members_from_cpu(struct bch_fs *c) -{ - struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - - guard(rcu)(); - for_each_member_device_rcu(c, ca, NULL) { - struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); - - for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++) - m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); - } -} - -void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_member m; - - mutex_lock(&ca->fs->sb_lock); - m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); - mutex_unlock(&ca->fs->sb_lock); - - printbuf_tabstop_push(out, 12); - - prt_str(out, "IO errors since filesystem creation"); - prt_newline(out); - - printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) - prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i])); - printbuf_indent_sub(out, 2); - - prt_str(out, "IO errors since "); - bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC); - prt_str(out, " ago"); - prt_newline(out); - - printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) - prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], - atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); - printbuf_indent_sub(out, 2); -} - -void bch2_dev_errors_reset(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_member *m; - - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++) - m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i])); - m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds()); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); -} - -/* - * Per member "range has btree nodes" bitmap: - * - * This is so that if we ever have to run the btree node scan to repair we don't - * have to scan full devices: - */ - -bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) -{ - guard(rcu)(); - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (ca && - !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) - return false; - } - return true; -} - -static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, - u64 start, unsigned sectors) -{ - struct bch_member *m = __bch2_members_v2_get_mut(mi, dev); - u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap); - - u64 end = start + sectors; - - int 
resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6); - if (resize > 0) { - u64 new_bitmap = 0; - - for (unsigned i = 0; i < 64; i++) - if (bitmap & BIT_ULL(i)) - new_bitmap |= BIT_ULL(i >> resize); - bitmap = new_bitmap; - m->btree_bitmap_shift += resize; - } - - BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX); - BUG_ON(end > 64ULL << m->btree_bitmap_shift); - - for (unsigned bit = start >> m->btree_bitmap_shift; - (u64) bit << m->btree_bitmap_shift < end; - bit++) - bitmap |= BIT_ULL(bit); - - m->btree_allocated_bitmap = cpu_to_le64(bitmap); -} - -void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) -{ - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (!bch2_member_exists(c->disk_sb.sb, ptr->dev)) - continue; - - __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); - } -} - -unsigned bch2_sb_nr_devices(const struct bch_sb *sb) -{ - unsigned nr = 0; - - for (unsigned i = 0; i < sb->nr_devices; i++) - nr += bch2_member_exists((struct bch_sb *) sb, i); - return nr; -} - -int bch2_sb_member_alloc(struct bch_fs *c) -{ - unsigned dev_idx = c->sb.nr_devices; - struct bch_sb_field_members_v2 *mi; - unsigned nr_devices; - unsigned u64s; - int best = -1; - u64 best_last_mount = 0; - unsigned nr_deleted = 0; - - if (dev_idx < BCH_SB_MEMBERS_MAX) - goto have_slot; - - for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) { - /* eventually BCH_SB_MEMBERS_MAX will be raised */ - if (dev_idx == BCH_SB_MEMBER_INVALID) - continue; - - struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); - - nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); - - if (!bch2_is_zero(&m.uuid, sizeof(m.uuid))) - continue; - - u64 last_mount = le64_to_cpu(m.last_mount); - if (best < 0 || last_mount < best_last_mount) { - best = dev_idx; - best_last_mount = last_mount; - } - } - if (best >= 0) { - dev_idx = best; - goto have_slot; - } - - if (nr_deleted) - bch_err(c, "unable to allocate new member, but have %u deleted: run fsck", - nr_deleted); - - return -BCH_ERR_ENOSPC_sb_members; -have_slot: - nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); - - mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + - le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); - - mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); - if (!mi) - return -BCH_ERR_ENOSPC_sb_members; - - c->disk_sb.sb->nr_devices = nr_devices; - return dev_idx; -} - -void bch2_sb_members_clean_deleted(struct bch_fs *c) -{ - mutex_lock(&c->sb_lock); - bool write_sb = false; - - for (unsigned i = 0; i < c->sb.nr_devices; i++) { - struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i); - - if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) { - memset(&m->uuid, 0, sizeof(m->uuid)); - write_sb = true; - } - } - - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); -} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h deleted file mode 100644 index 8d8a8a857648..000000000000 --- a/fs/bcachefs/sb-members.h +++ /dev/null @@ -1,377 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_MEMBERS_H -#define _BCACHEFS_SB_MEMBERS_H - -#include "darray.h" -#include "bkey_types.h" -#include "enumerated_ref.h" - -extern char * const bch2_member_error_strs[]; - -static inline struct bch_member * -__bch2_members_v2_get_mut(struct 
bch_sb_field_members_v2 *mi, unsigned i) -{ - return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); -} - -int bch2_sb_members_v2_init(struct bch_fs *c); -int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); -struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i); -struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); - -static inline bool bch2_dev_is_online(struct bch_dev *ca) -{ - return !enumerated_ref_is_zero(&ca->io_ref[READ]); -} - -static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); - -static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) -{ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu(c, dev); - return ca && bch2_dev_is_online(ca); -} - -static inline bool bch2_dev_is_healthy(struct bch_dev *ca) -{ - return bch2_dev_is_online(ca) && - ca->mi.state != BCH_MEMBER_STATE_failed; -} - -static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) -{ - return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); -} - -static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, - unsigned dev) -{ - darray_for_each(devs, i) - if (*i == dev) - return true; - return false; -} - -static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, - unsigned dev) -{ - darray_for_each(*devs, i) - if (*i == dev) { - darray_remove_item(devs, i); - return; - } -} - -static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, - unsigned dev) -{ - if (!bch2_dev_list_has_dev(*devs, dev)) { - BUG_ON(devs->nr >= ARRAY_SIZE(devs->data)); - devs->data[devs->nr++] = dev; - } -} - -static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) -{ - return (struct bch_devs_list) { .nr = 1, .data[0] = dev }; -} - -static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx, - const struct bch_devs_mask *mask) -{ - struct bch_dev *ca = NULL; - - while ((idx = mask - ? find_next_bit(mask->d, c->sb.nr_devices, idx) - : idx) < c->sb.nr_devices && - !(ca = rcu_dereference_check(c->devs[idx], - lockdep_is_held(&c->state_lock)))) - idx++; - - return ca; -} - -static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca, - const struct bch_devs_mask *mask) -{ - return __bch2_next_dev_idx(c, ca ? 
ca->dev_idx + 1 : 0, mask); -} - -#define for_each_member_device_rcu(_c, _ca, _mask) \ - for (struct bch_dev *_ca = NULL; \ - (_ca = __bch2_next_dev((_c), _ca, (_mask)));) - -#define for_each_online_member_rcu(_c, _ca) \ - for_each_member_device_rcu(_c, _ca, &(_c)->online_devs) - -#define for_each_rw_member_rcu(_c, _ca) \ - for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free]) - -static inline void bch2_dev_get(struct bch_dev *ca) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L); -#else - percpu_ref_get(&ca->ref); -#endif -} - -static inline void __bch2_dev_put(struct bch_dev *ca) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - long r = atomic_long_dec_return(&ca->ref); - if (r < (long) !ca->dying) - panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put); - ca->last_put = _THIS_IP_; - if (!r) - complete(&ca->ref_completion); -#else - percpu_ref_put(&ca->ref); -#endif -} - -static inline void bch2_dev_put(struct bch_dev *ca) -{ - if (ca) - __bch2_dev_put(ca); -} - -static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) -{ - guard(rcu)(); - bch2_dev_put(ca); - if ((ca = __bch2_next_dev(c, ca, NULL))) - bch2_dev_get(ca); - return ca; -} - -/* - * If you break early, you must drop your ref on the current device - */ -#define __for_each_member_device(_c, _ca) \ - for (; (_ca = bch2_get_next_dev(_c, _ca));) - -#define for_each_member_device(_c, _ca) \ - for (struct bch_dev *_ca = NULL; \ - (_ca = bch2_get_next_dev(_c, _ca));) - -static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, - struct bch_dev *ca, - unsigned state_mask, - int rw, unsigned ref_idx) -{ - guard(rcu)(); - if (ca) - enumerated_ref_put(&ca->io_ref[rw], ref_idx); - - while ((ca = __bch2_next_dev(c, ca, NULL)) && - (!((1 << ca->mi.state) & state_mask) || - !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))) - ; - - return ca; -} - -#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \ - for (struct bch_dev *_ca = NULL; \ - (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));) - -#define for_each_online_member(c, ca, ref_idx) \ - __for_each_online_member(c, ca, ~0, READ, ref_idx) - -#define for_each_rw_member(c, ca, ref_idx) \ - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx) - -#define for_each_readable_member(c, ca, ref_idx) \ - __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx) - -static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) -{ - return dev < c->sb.nr_devices && c->devs[dev]; -} - -static inline bool bucket_valid(const struct bch_dev *ca, u64 b) -{ - return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first; -} - -static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev) -{ - EBUG_ON(!bch2_dev_exists(c, dev)); - - return rcu_dereference_check(c->devs[dev], 1); -} - -static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev) -{ - EBUG_ON(!bch2_dev_exists(c, dev)); - - return rcu_dereference_protected(c->devs[dev], - lockdep_is_held(&c->sb_lock) || - lockdep_is_held(&c->state_lock)); -} - -static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev) -{ - return c && dev < c->sb.nr_devices - ? 
rcu_dereference(c->devs[dev]) - : NULL; -} - -int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned); - -void bch2_dev_missing_atomic(struct bch_fs *, unsigned); - -static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) -{ - struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); - if (unlikely(!ca)) - bch2_dev_missing_atomic(c, dev); - return ca; -} - -static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) -{ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); - if (ca) - bch2_dev_get(ca); - return ca; -} - -static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) -{ - struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); - if (unlikely(!ca)) - bch2_dev_missing_atomic(c, dev); - return ca; -} - -static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket) -{ - struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode); - if (ca && unlikely(!bucket_valid(ca, bucket.offset))) { - bch2_dev_put(ca); - ca = NULL; - } - return ca; -} - -void bch2_dev_bucket_missing(struct bch_dev *, u64); - -static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket) -{ - struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); - if (ca && unlikely(!bucket_valid(ca, bucket.offset))) { - bch2_dev_bucket_missing(ca, bucket.offset); - bch2_dev_put(ca); - ca = NULL; - } - return ca; -} - -static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) -{ - if (ca && ca->dev_idx == dev_idx) - return ca; - bch2_dev_put(ca); - return bch2_dev_tryget_noerror(c, dev_idx); -} - -static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) -{ - if (ca && ca->dev_idx == dev_idx) - return ca; - bch2_dev_put(ca); - return bch2_dev_tryget(c, dev_idx); -} - -static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, - int rw, unsigned ref_idx) -{ - might_sleep(); - - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu(c, dev); - if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)) - return NULL; - - if (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) - return ca; - - enumerated_ref_put(&ca->io_ref[rw], ref_idx); - return NULL; -} - -extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; -extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; - -static inline bool bch2_member_alive(struct bch_member *m) -{ - return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) && - !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID); -} - -static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) -{ - if (dev < sb->nr_devices) { - struct bch_member m = bch2_sb_member_get(sb, dev); - return bch2_member_alive(&m); - } - return false; -} - -unsigned bch2_sb_nr_devices(const struct bch_sb *); - -static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) -{ - return (struct bch_member_cpu) { - .nbuckets = le64_to_cpu(mi->nbuckets), - .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) - - le16_to_cpu(mi->first_bucket), - .first_bucket = le16_to_cpu(mi->first_bucket), - .bucket_size = le16_to_cpu(mi->bucket_size), - .group = BCH_MEMBER_GROUP(mi), - .state = BCH_MEMBER_STATE(mi), - .discard = BCH_MEMBER_DISCARD(mi), - .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), - .durability = BCH_MEMBER_DURABILITY(mi) - ? 
BCH_MEMBER_DURABILITY(mi) - 1 - : 1, - .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), - .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi), - .valid = bch2_member_alive(mi), - .btree_bitmap_shift = mi->btree_bitmap_shift, - .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), - }; -} - -void bch2_sb_members_from_cpu(struct bch_fs *); - -void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); -void bch2_dev_errors_reset(struct bch_dev *); - -static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors) -{ - u64 end = start + sectors; - - if (end > 64ULL << ca->mi.btree_bitmap_shift) - return false; - - for (unsigned bit = start >> ca->mi.btree_bitmap_shift; - (u64) bit << ca->mi.btree_bitmap_shift < end; - bit++) - if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit))) - return false; - return true; -} - -bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); -void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); - -int bch2_sb_member_alloc(struct bch_fs *); -void bch2_sb_members_clean_deleted(struct bch_fs *); - -#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h deleted file mode 100644 index fb72ad730518..000000000000 --- a/fs/bcachefs/sb-members_format.h +++ /dev/null @@ -1,128 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H -#define _BCACHEFS_SB_MEMBERS_FORMAT_H - -/* - * We refer to members with bitmasks in various places - but we need to get rid - * of this limit: - */ -#define BCH_SB_MEMBERS_MAX 64 - -/* - * Sentinel value - indicates a device that does not exist - */ -#define BCH_SB_MEMBER_INVALID 255 - -#define BCH_SB_MEMBER_DELETED_UUID \ - UUID_INIT(0xffffffff, 0xffff, 0xffff, \ - 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) - -#define BCH_MIN_NR_NBUCKETS (1 << 6) - -#define BCH_IOPS_MEASUREMENTS() \ - x(seqread, 0) \ - x(seqwrite, 1) \ - x(randread, 2) \ - x(randwrite, 3) - -enum bch_iops_measurement { -#define x(t, n) BCH_IOPS_##t = n, - BCH_IOPS_MEASUREMENTS() -#undef x - BCH_IOPS_NR -}; - -#define BCH_MEMBER_ERROR_TYPES() \ - x(read, 0) \ - x(write, 1) \ - x(checksum, 2) - -enum bch_member_error_type { -#define x(t, n) BCH_MEMBER_ERROR_##t = n, - BCH_MEMBER_ERROR_TYPES() -#undef x - BCH_MEMBER_ERROR_NR -}; - -struct bch_member { - __uuid_t uuid; - __le64 nbuckets; /* device size */ - __le16 first_bucket; /* index of first bucket used */ - __le16 bucket_size; /* sectors */ - __u8 btree_bitmap_shift; - __u8 pad[3]; - __le64 last_mount; /* time_t */ - - __le64 flags; - __le32 iops[4]; - __le64 errors[BCH_MEMBER_ERROR_NR]; - __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; - __le64 errors_reset_time; - __le64 seq; - __le64 btree_allocated_bitmap; - /* - * On recovery from a clean shutdown we don't normally read the journal, - * but we still want to resume writing from where we left off so we - * don't overwrite more than is necessary, for list journal debugging: - */ - __le32 last_journal_bucket; - __le32 last_journal_bucket_offset; -}; - -/* - * btree_allocated_bitmap can represent sector addresses of a u64: it itself has - * 64 elements, so 64 - ilog2(64) - */ -#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58 - -/* - * This limit comes from the bucket_gens array - it's a single allocation, and - * kernel allocations are limited to INT_MAX - */ -#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64) - -#define BCH_MEMBER_V1_BYTES 56 - -LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct 
bch_member, bucket_size, 0, 16) -LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) -/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ -LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) -LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) -LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) -LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) -LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, - struct bch_member, flags, 30, 31) -LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT, - struct bch_member, flags, 31, 32) - -#if 0 -LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -#endif - -#define BCH_MEMBER_STATES() \ - x(rw, 0) \ - x(ro, 1) \ - x(failed, 2) \ - x(spare, 3) - -enum bch_member_state { -#define x(t, n) BCH_MEMBER_STATE_##t = n, - BCH_MEMBER_STATES() -#undef x - BCH_MEMBER_STATE_NR -}; - -struct bch_sb_field_members_v1 { - struct bch_sb_field field; - struct bch_member _members[]; //Members are now variable size -}; - -struct bch_sb_field_members_v2 { - struct bch_sb_field field; - __le16 member_bytes; //size of single member entry - u8 pad[6]; - struct bch_member _members[]; -}; - -#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */ diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h deleted file mode 100644 index d6443e186872..000000000000 --- a/fs/bcachefs/sb-members_types.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H -#define _BCACHEFS_SB_MEMBERS_TYPES_H - -struct bch_member_cpu { - u64 nbuckets; /* device size */ - u64 nbuckets_minus_first; - u16 first_bucket; /* index of first bucket used */ - u16 bucket_size; /* sectors */ - u16 group; - u8 state; - u8 discard; - u8 data_allowed; - u8 durability; - u8 freespace_initialized; - u8 resize_on_mount; - u8 valid; - u8 btree_bitmap_shift; - u64 btree_allocated_bitmap; -}; - -#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h deleted file mode 100644 index c4b3d8d3f414..000000000000 --- a/fs/bcachefs/seqmutex.h +++ /dev/null @@ -1,45 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SEQMUTEX_H -#define _BCACHEFS_SEQMUTEX_H - -#include <linux/mutex.h> - -struct seqmutex { - struct mutex lock; - u32 seq; -}; - -#define seqmutex_init(_lock) mutex_init(&(_lock)->lock) - -static inline bool seqmutex_trylock(struct seqmutex *lock) -{ - return mutex_trylock(&lock->lock); -} - -static inline void seqmutex_lock(struct seqmutex *lock) -{ - mutex_lock(&lock->lock); - lock->seq++; -} - -static inline u32 seqmutex_unlock(struct seqmutex *lock) -{ - u32 seq = lock->seq; - mutex_unlock(&lock->lock); - return seq; -} - -static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) -{ - if (lock->seq != seq || !mutex_trylock(&lock->lock)) - return false; - - if (lock->seq != seq) { - mutex_unlock(&lock->lock); - return false; - } - - return true; -} - -#endif /* _BCACHEFS_SEQMUTEX_H */ diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c deleted file mode 100644 index a1cc44e66c7e..000000000000 --- a/fs/bcachefs/siphash.c +++ /dev/null @@ -1,173 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ - -/*- - * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org> - * All rights reserved. 
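The seqmutex defined above exists so that an iterator can drop the lock to sleep and later detect whether anyone else took it in the meantime. A minimal usage sketch, assuming hypothetical obj, obj_needs_work(), and do_blocking_work() names:

/* Sketch of the intended seqmutex pattern: walk a list, drop the lock to
 * do blocking work, then resume only if seqmutex_relock() confirms the
 * mutex was not retaken in between; otherwise restart the walk. */
struct obj {				/* hypothetical element type */
	struct list_head	link;
};

static void walk_objects(struct seqmutex *lock, struct list_head *list)
{
	struct obj *o;
	u32 seq;
restart:
	seqmutex_lock(lock);
	list_for_each_entry(o, list, link) {
		if (!obj_needs_work(o))		/* hypothetical predicate */
			continue;

		seq = seqmutex_unlock(lock);
		do_blocking_work(o);		/* safe: lock not held */

		if (!seqmutex_relock(lock, seq))
			goto restart;		/* list may have changed */
	}
	seqmutex_unlock(lock);
}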
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d - * are the number of compression rounds and the number of finalization rounds. - * A compression round is identical to a finalization round and this round - * function is called SipRound. Given a 128-bit key k and a (possibly empty) - * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). - * - * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, - * by Jean-Philippe Aumasson and Daniel J. 
Bernstein, - * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa - * https://131002.net/siphash/siphash.pdf - * https://131002.net/siphash/ - */ - -#include <asm/byteorder.h> -#include <linux/unaligned.h> -#include <linux/bitops.h> -#include <linux/string.h> - -#include "siphash.h" - -static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) -{ - while (rounds--) { - ctx->v[0] += ctx->v[1]; - ctx->v[2] += ctx->v[3]; - ctx->v[1] = rol64(ctx->v[1], 13); - ctx->v[3] = rol64(ctx->v[3], 16); - - ctx->v[1] ^= ctx->v[0]; - ctx->v[3] ^= ctx->v[2]; - ctx->v[0] = rol64(ctx->v[0], 32); - - ctx->v[2] += ctx->v[1]; - ctx->v[0] += ctx->v[3]; - ctx->v[1] = rol64(ctx->v[1], 17); - ctx->v[3] = rol64(ctx->v[3], 21); - - ctx->v[1] ^= ctx->v[2]; - ctx->v[3] ^= ctx->v[0]; - ctx->v[2] = rol64(ctx->v[2], 32); - } -} - -static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) -{ - u64 m = get_unaligned_le64(ptr); - - ctx->v[3] ^= m; - SipHash_Rounds(ctx, rounds); - ctx->v[0] ^= m; -} - -void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) -{ - u64 k0, k1; - - k0 = le64_to_cpu(key->k0); - k1 = le64_to_cpu(key->k1); - - ctx->v[0] = 0x736f6d6570736575ULL ^ k0; - ctx->v[1] = 0x646f72616e646f6dULL ^ k1; - ctx->v[2] = 0x6c7967656e657261ULL ^ k0; - ctx->v[3] = 0x7465646279746573ULL ^ k1; - - memset(ctx->buf, 0, sizeof(ctx->buf)); - ctx->bytes = 0; -} - -void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, - const void *src, size_t len) -{ - const u8 *ptr = src; - size_t left, used; - - if (len == 0) - return; - - used = ctx->bytes % sizeof(ctx->buf); - ctx->bytes += len; - - if (used > 0) { - left = sizeof(ctx->buf) - used; - - if (len >= left) { - memcpy(&ctx->buf[used], ptr, left); - SipHash_CRounds(ctx, ctx->buf, rc); - len -= left; - ptr += left; - } else { - memcpy(&ctx->buf[used], ptr, len); - return; - } - } - - while (len >= sizeof(ctx->buf)) { - SipHash_CRounds(ctx, ptr, rc); - len -= sizeof(ctx->buf); - ptr += sizeof(ctx->buf); - } - - if (len > 0) - memcpy(&ctx->buf[used], ptr, len); -} - -void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) -{ - u64 r; - - r = SipHash_End(ctx, rc, rf); - - *((__le64 *) dst) = cpu_to_le64(r); -} - -u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) -{ - u64 r; - size_t left, used; - - used = ctx->bytes % sizeof(ctx->buf); - left = sizeof(ctx->buf) - used; - memset(&ctx->buf[used], 0, left - 1); - ctx->buf[7] = ctx->bytes; - - SipHash_CRounds(ctx, ctx->buf, rc); - ctx->v[2] ^= 0xff; - SipHash_Rounds(ctx, rf); - - r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); - memset(ctx, 0, sizeof(*ctx)); - return r; -} - -u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) -{ - SIPHASH_CTX ctx; - - SipHash_Init(&ctx, key); - SipHash_Update(&ctx, rc, rf, src, len); - return SipHash_End(&ctx, rc, rf); -} diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h deleted file mode 100644 index 3dfaf34a43b2..000000000000 --- a/fs/bcachefs/siphash.h +++ /dev/null @@ -1,87 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause */ -/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ -/*- - * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -/* - * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) - * optimized for speed on short messages returning a 64bit hash/digest value. - * - * The number of rounds is defined during the initialization: - * SipHash24_Init() for the fast and reasonably strong version - * SipHash48_Init() for the strong version (half as fast) - * - * struct SIPHASH_CTX ctx; - * SipHash24_Init(&ctx); - * SipHash_SetKey(&ctx, "16bytes long key"); - * SipHash_Update(&ctx, pointer_to_string, length_of_string); - * SipHash_Final(output, &ctx); - */ - -#ifndef _SIPHASH_H_ -#define _SIPHASH_H_ - -#include <linux/types.h> - -#define SIPHASH_BLOCK_LENGTH 8 -#define SIPHASH_KEY_LENGTH 16 -#define SIPHASH_DIGEST_LENGTH 8 - -typedef struct _SIPHASH_CTX { - u64 v[4]; - u8 buf[SIPHASH_BLOCK_LENGTH]; - u32 bytes; -} SIPHASH_CTX; - -typedef struct { - __le64 k0; - __le64 k1; -} SIPHASH_KEY; - -void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); -void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); -u64 SipHash_End(SIPHASH_CTX *, int, int); -void SipHash_Final(void *, SIPHASH_CTX *, int, int); -u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); - -#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) -#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) -#define SipHash24_End(_d) SipHash_End((_d), 2, 4) -#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) -#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) - -#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) -#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) -#define SipHash48_End(_d) SipHash_End((_d), 4, 8) -#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) -#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) - -#endif /* _SIPHASH_H_ */ diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c deleted file mode 100644 index 538c324f4765..000000000000 --- a/fs/bcachefs/six.c +++ /dev/null @@ -1,878 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/export.h> -#include <linux/log2.h> -#include <linux/percpu.h> -#include <linux/preempt.h> -#include <linux/rcupdate.h> -#include <linux/sched.h> -#include <linux/sched/clock.h> -#include <linux/sched/rt.h> -#include <linux/sched/task.h> -#include <linux/slab.h> - -#include <trace/events/lock.h> - -#include 
"six.h" - -#ifdef DEBUG -#define EBUG_ON(cond) BUG_ON(cond) -#else -#define EBUG_ON(cond) do {} while (0) -#endif - -#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) -#define six_release(l, ip) lock_release(l, ip) - -static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); - -#define SIX_LOCK_HELD_read_OFFSET 0 -#define SIX_LOCK_HELD_read ~(~0U << 26) -#define SIX_LOCK_HELD_intent (1U << 26) -#define SIX_LOCK_HELD_write (1U << 27) -#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) -#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) -#define SIX_LOCK_NOSPIN (1U << 31) - -struct six_lock_vals { - /* Value we add to the lock in order to take the lock: */ - u32 lock_val; - - /* If the lock has this value (used as a mask), taking the lock fails: */ - u32 lock_fail; - - /* Mask that indicates lock is held for this type: */ - u32 held_mask; - - /* Waitlist we wakeup when releasing the lock: */ - enum six_lock_type unlock_wakeup; -}; - -static const struct six_lock_vals l[] = { - [SIX_LOCK_read] = { - .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, - .lock_fail = SIX_LOCK_HELD_write, - .held_mask = SIX_LOCK_HELD_read, - .unlock_wakeup = SIX_LOCK_write, - }, - [SIX_LOCK_intent] = { - .lock_val = SIX_LOCK_HELD_intent, - .lock_fail = SIX_LOCK_HELD_intent, - .held_mask = SIX_LOCK_HELD_intent, - .unlock_wakeup = SIX_LOCK_intent, - }, - [SIX_LOCK_write] = { - .lock_val = SIX_LOCK_HELD_write, - .lock_fail = SIX_LOCK_HELD_read, - .held_mask = SIX_LOCK_HELD_write, - .unlock_wakeup = SIX_LOCK_read, - }, -}; - -static inline void six_set_bitmask(struct six_lock *lock, u32 mask) -{ - if ((atomic_read(&lock->state) & mask) != mask) - atomic_or(mask, &lock->state); -} - -static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) -{ - if (atomic_read(&lock->state) & mask) - atomic_and(~mask, &lock->state); -} - -static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, - u32 old, struct task_struct *owner) -{ - if (type != SIX_LOCK_intent) - return; - - if (!(old & SIX_LOCK_HELD_intent)) { - EBUG_ON(lock->owner); - lock->owner = owner; - } else { - EBUG_ON(lock->owner != current); - } -} - -static inline unsigned pcpu_read_count(struct six_lock *lock) -{ - unsigned read_count = 0; - int cpu; - - for_each_possible_cpu(cpu) - read_count += *per_cpu_ptr(lock->readers, cpu); - return read_count; -} - -/* - * __do_six_trylock() - main trylock routine - * - * Returns 1 on success, 0 on failure - * - * In percpu reader mode, a failed trylock may cause a spurious trylock failure - * for anoter thread taking the competing lock type, and we may havve to do a - * wakeup: when a wakeup is required, we return -1 - wakeup_type. - */ -static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, - struct task_struct *task, bool try) -{ - int ret; - u32 old; - - EBUG_ON(type == SIX_LOCK_write && lock->owner != task); - EBUG_ON(type == SIX_LOCK_write && - (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); - - /* - * Percpu reader mode: - * - * The basic idea behind this algorithm is that you can implement a lock - * between two threads without any atomics, just memory barriers: - * - * For two threads you'll need two variables, one variable for "thread a - * has the lock" and another for "thread b has the lock". 
- * - * To take the lock, a thread sets its variable indicating that it holds - * the lock, then issues a full memory barrier, then reads from the - * other thread's variable to check if the other thread thinks it has - * the lock. If we raced, we back off and retry/sleep. - * - * Failure to take the lock may cause a spurious trylock failure in - * another thread, because we temporarily set the lock to indicate that - * we held it. This would be a problem for a thread in six_lock(), when - * they are calling trylock after adding themselves to the waitlist and - * prior to sleeping. - * - * Therefore, if we fail to get the lock, and there were waiters of the - * type we conflict with, we will have to issue a wakeup. - * - * Since we may be called under wait_lock (and by the wakeup code - * itself), we return that the wakeup has to be done instead of doing it - * here. - */ - if (type == SIX_LOCK_read && lock->readers) { - preempt_disable(); - this_cpu_inc(*lock->readers); /* signal that we own lock */ - - smp_mb(); - - old = atomic_read(&lock->state); - ret = !(old & l[type].lock_fail); - - this_cpu_sub(*lock->readers, !ret); - preempt_enable(); - - if (!ret) { - smp_mb(); - if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write) - ret = -1 - SIX_LOCK_write; - } - } else if (type == SIX_LOCK_write && lock->readers) { - if (try) - atomic_add(SIX_LOCK_HELD_write, &lock->state); - - /* - * Make sure atomic_add happens before pcpu_read_count and - * six_set_bitmask in slow path happens before pcpu_read_count. - * - * Paired with the smp_mb() in read lock fast path (per-cpu mode) - * and the one before atomic_read in read unlock path. - */ - smp_mb(); - ret = !pcpu_read_count(lock); - - if (try && !ret) { - old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); - if (old & SIX_LOCK_WAITING_read) - ret = -1 - SIX_LOCK_read; - } - } else { - old = atomic_read(&lock->state); - do { - ret = !(old & l[type].lock_fail); - if (!ret || (type == SIX_LOCK_write && !try)) { - smp_mb(); - break; - } - } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); - - EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); - } - - if (ret > 0) - six_set_owner(lock, type, old, task); - - EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && - (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); - - return ret; -} - -static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) -{ - struct six_lock_waiter *w, *next; - struct task_struct *task; - bool saw_one; - int ret; -again: - ret = 0; - saw_one = false; - raw_spin_lock(&lock->wait_lock); - - list_for_each_entry_safe(w, next, &lock->wait_list, list) { - if (w->lock_want != lock_type) - continue; - - if (saw_one && lock_type != SIX_LOCK_read) - goto unlock; - saw_one = true; - - ret = __do_six_trylock(lock, lock_type, w->task, false); - if (ret <= 0) - goto unlock; - - /* - * Similar to percpu_rwsem_wake_function(), we need to guard - * against the wakee noticing w->lock_acquired, returning, and - * then exiting before we do the wakeup: - */ - task = get_task_struct(w->task); - __list_del(w->list.prev, w->list.next); - /* - * The release barrier here ensures the ordering of the - * __list_del before setting w->lock_acquired; @w is on the - * stack of the thread doing the waiting and will be reused - * after it sees w->lock_acquired with no other locking: - * pairs with smp_load_acquire() in six_lock_slowpath() - */ - smp_store_release(&w->lock_acquired, true); - wake_up_process(task); -
put_task_struct(task); - } - - six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); -unlock: - raw_spin_unlock(&lock->wait_lock); - - if (ret < 0) { - lock_type = -ret - 1; - goto again; - } -} - -__always_inline -static void six_lock_wakeup(struct six_lock *lock, u32 state, - enum six_lock_type lock_type) -{ - if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) - return; - - if (!(state & (SIX_LOCK_WAITING_read << lock_type))) - return; - - __six_lock_wakeup(lock, lock_type); -} - -__always_inline -static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) -{ - int ret; - - ret = __do_six_trylock(lock, type, current, try); - if (ret < 0) - __six_lock_wakeup(lock, -ret - 1); - - return ret > 0; -} - -/** - * six_trylock_ip - attempt to take a six lock without blocking - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * Return: true on success, false on failure. - */ -bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -{ - if (!do_six_trylock(lock, type, true)) - return false; - - if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); - return true; -} -EXPORT_SYMBOL_GPL(six_trylock_ip); - -/** - * six_relock_ip - attempt to re-take a lock that was held previously - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @seq: lock sequence number obtained from six_lock_seq() while lock was - * held previously - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * Return: true on success, false on failure. - */ -bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, - unsigned seq, unsigned long ip) -{ - if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) - return false; - - if (six_lock_seq(lock) != seq) { - six_unlock_ip(lock, type, ip); - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(six_relock_ip); - -#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN - -static inline bool six_owner_running(struct six_lock *lock) -{ - /* - * When there's no owner, we might have preempted between the owner - * acquiring the lock and setting the owner field. If we're an RT task - * that will live-lock because we won't let the owner complete. - */ - guard(rcu)(); - struct task_struct *owner = READ_ONCE(lock->owner); - return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); -} - -static inline bool six_optimistic_spin(struct six_lock *lock, - struct six_lock_waiter *wait, - enum six_lock_type type) -{ - unsigned loop = 0; - u64 end_time; - - if (type == SIX_LOCK_write) - return false; - - if (lock->wait_list.next != &wait->list) - return false; - - if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN) - return false; - - preempt_disable(); - end_time = sched_clock() + 10 * NSEC_PER_USEC; - - while (!need_resched() && six_owner_running(lock)) { - /* - * Ensures that writes to the waitlist entry happen after we see - * wait->lock_acquired: pairs with the smp_store_release in - * __six_lock_wakeup - */ - if (smp_load_acquire(&wait->lock_acquired)) { - preempt_enable(); - return true; - } - - if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { - six_set_bitmask(lock, SIX_LOCK_NOSPIN); - break; - } - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. 
We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. - */ - cpu_relax(); - } - - preempt_enable(); - return false; -} - -#else /* CONFIG_LOCK_SPIN_ON_OWNER */ - -static inline bool six_optimistic_spin(struct six_lock *lock, - struct six_lock_waiter *wait, - enum six_lock_type type) -{ - return false; -} - -#endif - -noinline -static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - int ret = 0; - - if (type == SIX_LOCK_write) { - EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); - atomic_add(SIX_LOCK_HELD_write, &lock->state); - smp_mb__after_atomic(); - } - - trace_contention_begin(lock, 0); - lock_contended(&lock->dep_map, ip); - - wait->task = current; - wait->lock_want = type; - wait->lock_acquired = false; - - raw_spin_lock(&lock->wait_lock); - six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); - /* - * Retry taking the lock after taking waitlist lock, in case we raced - * with an unlock: - */ - ret = __do_six_trylock(lock, type, current, false); - if (ret <= 0) { - wait->start_time = local_clock(); - - if (!list_empty(&lock->wait_list)) { - struct six_lock_waiter *last = - list_last_entry(&lock->wait_list, - struct six_lock_waiter, list); - - if (time_before_eq64(wait->start_time, last->start_time)) - wait->start_time = last->start_time + 1; - } - - list_add_tail(&wait->list, &lock->wait_list); - } - raw_spin_unlock(&lock->wait_lock); - - if (unlikely(ret > 0)) { - ret = 0; - goto out; - } - - if (unlikely(ret < 0)) { - __six_lock_wakeup(lock, -ret - 1); - ret = 0; - } - - if (six_optimistic_spin(lock, wait, type)) - goto out; - - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - - /* - * Ensures that writes to the waitlist entry happen after we see - * wait->lock_acquired: pairs with the smp_store_release in - * __six_lock_wakeup - */ - if (smp_load_acquire(&wait->lock_acquired)) - break; - - ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; - if (unlikely(ret)) { - bool acquired; - - /* - * If should_sleep_fn() returns an error, we are - * required to return that error even if we already - * acquired the lock - should_sleep_fn() might have - * modified external state (e.g. when the deadlock cycle - * detector in bcachefs issued a transaction restart) - */ - raw_spin_lock(&lock->wait_lock); - acquired = wait->lock_acquired; - if (!acquired) - list_del(&wait->list); - raw_spin_unlock(&lock->wait_lock); - - if (unlikely(acquired)) { - do_six_unlock_type(lock, type); - } else if (type == SIX_LOCK_write) { - six_clear_bitmask(lock, SIX_LOCK_HELD_write); - six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); - } - break; - } - - schedule(); - } - - __set_current_state(TASK_RUNNING); -out: - trace_contention_end(lock, 0); - - return ret; -} - -/** - * six_lock_ip_waiter - take a lock, with full waitlist interface - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @wait: pointer to wait object, which will be added to lock's waitlist - * @should_sleep_fn: callback run after adding to waitlist, immediately prior - * to scheduling - * @p: passed through to @should_sleep_fn - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * This is the most general six_lock() variant, with parameters to support full - * cycle detection for deadlock avoidance. 
- * - * The code calling this function must implement tracking of held locks, and the - * @wait object should be embedded into the struct that tracks held locks - - * which must also be accessible in a thread-safe way. - * - * @should_sleep_fn should invoke the cycle detector; it should walk each - * lock's waiters, and for each waiter recursively walk their held locks. - * - * When this function must block, @wait will be added to @lock's waitlist before - * calling trylock, and before calling @should_sleep_fn, and @wait will not be - * removed from the lock waitlist until the lock has been successfully acquired, - * or we abort. - * - * @wait.start_time will be monotonically increasing for any given waitlist, and - * thus may be used as a loop cursor. - * - * Return: 0 on success, or the return code from @should_sleep_fn on failure. - */ -int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - int ret; - - wait->start_time = 0; - - if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); - - ret = do_six_trylock(lock, type, true) ? 0 - : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); - - if (ret && type != SIX_LOCK_write) - six_release(&lock->dep_map, ip); - if (!ret) - lock_acquired(&lock->dep_map, ip); - - return ret; -} -EXPORT_SYMBOL_GPL(six_lock_ip_waiter); - -__always_inline -static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) -{ - u32 state; - - if (type == SIX_LOCK_intent) - lock->owner = NULL; - - if (type == SIX_LOCK_read && - lock->readers) { - smp_mb(); /* unlock barrier */ - this_cpu_dec(*lock->readers); - smp_mb(); /* between unlocking and checking for waiters */ - state = atomic_read(&lock->state); - } else { - u32 v = l[type].lock_val; - - if (type != SIX_LOCK_read) - v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; - - EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); - state = atomic_sub_return_release(v, &lock->state); - } - - six_lock_wakeup(lock, state, l[type].unlock_wakeup); -} - -/** - * six_unlock_ip - drop a six lock - * @lock: lock to unlock - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * When a lock is held multiple times (because six_lock_increment() was used), - * this decrements the 'lock held' counter by one.
- * - * For example: - * six_lock_read(&foo->lock); read count 1 - * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 - * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 - * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 - */ -void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -{ - EBUG_ON(type == SIX_LOCK_write && - !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); - EBUG_ON((type == SIX_LOCK_write || - type == SIX_LOCK_intent) && - lock->owner != current); - - if (type != SIX_LOCK_write) - six_release(&lock->dep_map, ip); - - if (type == SIX_LOCK_intent && - lock->intent_lock_recurse) { - --lock->intent_lock_recurse; - return; - } - - if (type == SIX_LOCK_write && - lock->write_lock_recurse) { - --lock->write_lock_recurse; - return; - } - - if (type == SIX_LOCK_write) - lock->seq++; - - do_six_unlock_type(lock, type); -} -EXPORT_SYMBOL_GPL(six_unlock_ip); - -/** - * six_lock_downgrade - convert an intent lock to a read lock - * @lock: lock to downgrade - * - * @lock will have read count incremented and intent count decremented - */ -void six_lock_downgrade(struct six_lock *lock) -{ - six_lock_increment(lock, SIX_LOCK_read); - six_unlock_intent(lock); -} -EXPORT_SYMBOL_GPL(six_lock_downgrade); - -/** - * six_lock_tryupgrade - attempt to convert read lock to an intent lock - * @lock: lock to upgrade - * - * On success, @lock will have intent count incremented and read count - * decremented - * - * Return: true on success, false on failure - */ -bool six_lock_tryupgrade(struct six_lock *lock) -{ - u32 old = atomic_read(&lock->state), new; - - do { - new = old; - - if (new & SIX_LOCK_HELD_intent) - return false; - - if (!lock->readers) { - EBUG_ON(!(new & SIX_LOCK_HELD_read)); - new -= l[SIX_LOCK_read].lock_val; - } - - new |= SIX_LOCK_HELD_intent; - } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); - - if (lock->readers) - this_cpu_dec(*lock->readers); - - six_set_owner(lock, SIX_LOCK_intent, old, current); - - return true; -} -EXPORT_SYMBOL_GPL(six_lock_tryupgrade); - -/** - * six_trylock_convert - attempt to convert a held lock from one type to another - * @lock: lock to upgrade - * @from: SIX_LOCK_read or SIX_LOCK_intent - * @to: SIX_LOCK_read or SIX_LOCK_intent - * - * On success, @lock will have intent count incremented and read count - * decremented - * - * Return: true on success, false on failure - */ -bool six_trylock_convert(struct six_lock *lock, - enum six_lock_type from, - enum six_lock_type to) -{ - EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); - - if (to == from) - return true; - - if (to == SIX_LOCK_read) { - six_lock_downgrade(lock); - return true; - } else { - return six_lock_tryupgrade(lock); - } -} -EXPORT_SYMBOL_GPL(six_trylock_convert); - -/** - * six_lock_increment - increase held lock count on a lock that is already held - * @lock: lock to increment - * @type: SIX_LOCK_read or SIX_LOCK_intent - * - * @lock must already be held, with a lock type that is greater than or equal to - * @type - * - * A corresponding six_unlock_type() call will be required for @lock to be fully - * unlocked.
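six_lock_tryupgrade() above can fail when another thread holds the intent lock; a common caller-side pattern is to retry from scratch rather than block. A minimal sketch, assuming the read/intent convenience wrappers declared elsewhere in six.h:

/* Sketch: upgrade a read lock to intent, falling back to dropping the
 * read lock and re-acquiring in intent mode when the upgrade races.
 * Note that anything validated under the read lock may have changed by
 * the time six_lock_intent() returns. */
static void read_to_intent(struct six_lock *lock)
{
	if (six_lock_tryupgrade(lock))
		return;			/* read count--, intent count++ */

	six_unlock_read(lock);
	six_lock_intent(lock);		/* may sleep */
}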
- */
-void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
-{
-	six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
-
-	/* XXX: assert already locked, and that we don't overflow: */
-
-	switch (type) {
-	case SIX_LOCK_read:
-		if (lock->readers) {
-			this_cpu_inc(*lock->readers);
-		} else {
-			EBUG_ON(!(atomic_read(&lock->state) &
-				  (SIX_LOCK_HELD_read|
-				   SIX_LOCK_HELD_intent)));
-			atomic_add(l[type].lock_val, &lock->state);
-		}
-		break;
-	case SIX_LOCK_write:
-		lock->write_lock_recurse++;
-		fallthrough;
-	case SIX_LOCK_intent:
-		EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
-		lock->intent_lock_recurse++;
-		break;
-	}
-}
-EXPORT_SYMBOL_GPL(six_lock_increment);
-
-/**
- * six_lock_wakeup_all - wake up all waiters on @lock
- * @lock: lock to wake up waiters for
- *
- * Waking up waiters will cause them to re-run should_sleep_fn, which may then
- * abort the lock operation.
- *
- * This function is never needed in a bug-free program; it's only useful in
- * debug code, e.g. to determine if a cycle detector is at fault.
- */
-void six_lock_wakeup_all(struct six_lock *lock)
-{
-	u32 state = atomic_read(&lock->state);
-	struct six_lock_waiter *w;
-
-	six_lock_wakeup(lock, state, SIX_LOCK_read);
-	six_lock_wakeup(lock, state, SIX_LOCK_intent);
-	six_lock_wakeup(lock, state, SIX_LOCK_write);
-
-	raw_spin_lock(&lock->wait_lock);
-	list_for_each_entry(w, &lock->wait_list, list)
-		wake_up_process(w->task);
-	raw_spin_unlock(&lock->wait_lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
-
-/**
- * six_lock_counts - return held lock counts, for each lock type
- * @lock: lock to return counters for
- *
- * Return: the number of times a lock is held for read, intent and write.
- */
-struct six_lock_count six_lock_counts(struct six_lock *lock)
-{
-	struct six_lock_count ret;
-
-	ret.n[SIX_LOCK_read]   = !lock->readers
-		? atomic_read(&lock->state) & SIX_LOCK_HELD_read
-		: pcpu_read_count(lock);
-	ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
-		lock->intent_lock_recurse;
-	ret.n[SIX_LOCK_write]  = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_counts);
-
-/**
- * six_lock_readers_add - directly manipulate reader count of a lock
- * @lock: lock to add/subtract readers for
- * @nr: reader count to add/subtract
- *
- * When an upper layer is implementing lock reentrancy, we may have both read
- * and intent locks on the same lock.
- *
- * When we need to take a write lock, the read locks will cause self-deadlock,
- * because six locks themselves do not track which read locks are held by the
- * current thread and which are held by a different thread - they do no
- * per-thread tracking of held locks.
- *
- * The upper layer that is tracking held locks may however, if trylock() has
- * failed, count up its own read locks, subtract them, take the write lock, and
- * then re-add them.
- *
- * As in any other situation when taking a write lock, @lock must be held for
- * intent one (or more) times, so @lock will never be left unlocked.
- */
-void six_lock_readers_add(struct six_lock *lock, int nr)
-{
-	if (lock->readers) {
-		this_cpu_add(*lock->readers, nr);
-	} else {
-		EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
-		/* reader count starts at bit 0 */
-		atomic_add(nr, &lock->state);
-	}
-}
-EXPORT_SYMBOL_GPL(six_lock_readers_add);
-
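A minimal sketch of the pattern six_lock_readers_add() enables, assuming the upper layer can count the read locks it holds itself; my_count_held_read_locks() and struct my_trans are hypothetical:

	/*
	 * Take a write lock while this thread also holds read locks on @lock:
	 * temporarily subtract our own readers, then restore them once the
	 * write lock is held. @lock must already be held for intent.
	 */
	static void my_lock_write(struct six_lock *lock, struct my_trans *trans)
	{
		if (!six_trylock_write(lock)) {
			int readers = my_count_held_read_locks(trans, lock);

			six_lock_readers_add(lock, -readers);
			six_lock_write(lock, NULL, NULL);
			six_lock_readers_add(lock, readers);
		}
	}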
-/**
- * six_lock_exit - release resources held by a lock prior to freeing
- * @lock: lock to exit
- *
- * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
- * required to free the percpu read counts.
- */
-void six_lock_exit(struct six_lock *lock)
-{
-	WARN_ON(lock->readers && pcpu_read_count(lock));
-	WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
-
-	free_percpu(lock->readers);
-	lock->readers = NULL;
-}
-EXPORT_SYMBOL_GPL(six_lock_exit);
-
-void __six_lock_init(struct six_lock *lock, const char *name,
-		     struct lock_class_key *key, enum six_lock_init_flags flags,
-		     gfp_t gfp)
-{
-	atomic_set(&lock->state, 0);
-	raw_spin_lock_init(&lock->wait_lock);
-	INIT_LIST_HEAD(&lock->wait_list);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	debug_check_no_locks_freed((void *) lock, sizeof(*lock));
-	lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-
-	/*
-	 * Don't assume that we have real percpu variables available in
-	 * userspace:
-	 */
-#ifdef __KERNEL__
-	if (flags & SIX_LOCK_INIT_PCPU) {
-		/*
-		 * We don't return an error here on memory allocation failure
-		 * since percpu is an optimization, and locks will work with the
-		 * same semantics in non-percpu mode: callers can check for
-		 * failure if they wish by checking lock->readers, but generally
-		 * will not want to treat it as an error.
-		 */
-		lock->readers = alloc_percpu_gfp(unsigned, gfp);
-	}
-#endif
-}
-EXPORT_SYMBOL_GPL(__six_lock_init);
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
deleted file mode 100644
index 59b851cf8bac..000000000000
--- a/fs/bcachefs/six.h
+++ /dev/null
@@ -1,388 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _LINUX_SIX_H
-#define _LINUX_SIX_H
-
-/**
- * DOC: SIX locks overview
- *
- * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
- * but with an additional state: read/shared, intent, exclusive/write
- *
- * The purpose of the intent state is to allow for greater concurrency on tree
- * structures without deadlocking. In general, a read can't be upgraded to a
- * write lock without deadlocking, so an operation that updates multiple nodes
- * will have to take write locks for the full duration of the operation.
- *
- * But by adding an intent state, which is exclusive with other intent locks but
- * not with readers, we can take intent locks at the start of the operation,
- * and then take write locks only for the actual update to each individual
- * node, without deadlocking.
- *
- * Example usage:
- *   six_lock_read(&foo->lock);
- *   six_unlock_read(&foo->lock);
- *
- * An intent lock must be held before taking a write lock:
- *   six_lock_intent(&foo->lock);
- *   six_lock_write(&foo->lock);
- *   six_unlock_write(&foo->lock);
- *   six_unlock_intent(&foo->lock);
- *
- * Other operations:
- *   six_trylock_read()
- *   six_trylock_intent()
- *   six_trylock_write()
- *
- *   six_lock_downgrade()	convert from intent to read
- *   six_lock_tryupgrade()	attempt to convert from read to intent, may fail
- *
- * There are also interfaces that take the lock type as an enum:
- *
- *   six_lock_type(&foo->lock, SIX_LOCK_read);
- *   six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
- *   six_lock_type(&foo->lock, SIX_LOCK_write);
- *   six_unlock_type(&foo->lock, SIX_LOCK_write);
- *   six_unlock_type(&foo->lock, SIX_LOCK_intent);
- *
- * Lock sequence numbers - unlock(), relock():
- *
- * Locks embed sequence numbers, which are incremented on write lock/unlock.
- * This allows locks to be dropped and then retaken iff the state they protect
- * hasn't changed; this makes it much easier to avoid holding locks while e.g.
- * doing IO or allocating memory.
- *
- * Example usage:
- *   six_lock_read(&foo->lock);
- *   u32 seq = six_lock_seq(&foo->lock);
- *   six_unlock_read(&foo->lock);
- *
- *   some_operation_that_may_block();
- *
- *   if (six_relock_read(&foo->lock, seq)) { ... }
- *
- * If the relock operation succeeds, it is as if the lock was never unlocked.
- *
- * Reentrancy:
- *
- * Six locks are not by themselves reentrant, but have counters for both the
- * read and intent states that can be used to provide reentrancy by an upper
- * layer that tracks held locks. If a lock is known to already be held in the
- * read or intent state, six_lock_increment() can be used to bump the "lock
- * held in this state" counter, increasing the number of unlock calls that
- * will be required to fully unlock it.
- *
- * Example usage:
- *   six_lock_read(&foo->lock);
- *   six_lock_increment(&foo->lock, SIX_LOCK_read);
- *   six_unlock_read(&foo->lock);
- *   six_unlock_read(&foo->lock);
- * foo->lock is now fully unlocked.
- *
- * Since the intent state supersedes read, it's legal to increment the read
- * counter when holding an intent lock, but not the reverse.
- *
- * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
- * is not legal.
- *
- * should_sleep_fn:
- *
- * There is a six_lock() variant that takes a function pointer that is called
- * immediately prior to schedule() when blocking, and may return an error to
- * abort.
- *
- * One possible use for this feature is when objects being locked are part of
- * a cache and may be reused, and lock ordering is based on a property of the
- * object that will change when the object is reused - i.e. logical key order.
- *
- * If looking up an object in the cache may race with object reuse, and lock
- * ordering is required to prevent deadlock, object reuse may change the
- * correct lock order for that object and cause a deadlock. should_sleep_fn
- * can be used to check if the object is still the object we want and avoid
- * this deadlock.
- *
- * Wait list entry interface:
- *
- * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
- * wait list entry. By embedding six_lock_waiter into another object, and by
- * traversing lock waitlists, it is then possible for an upper layer to
- * implement full cycle detection for deadlock avoidance.
- *
- * should_sleep_fn should be used for invoking the cycle detector, walking the
- * graph of held locks to check for a deadlock. The upper layer must track
- * held locks for each thread, and each thread's held locks must be reachable
- * from its six_lock_waiter object.
- *
- * six_lock_waiter() will add the wait object to the waitlist before retrying
- * the lock and before calling should_sleep_fn, and the wait object will not
- * be removed from the waitlist until either the lock has been successfully
- * acquired, or we aborted because should_sleep_fn returned an error.
- *
- * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
- * have timestamps in strictly ascending order - this is so the timestamp can
- * be used as a cursor for lock graph traversal.
- */
-
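To make the unlock()/relock() idiom above concrete, a minimal sketch; struct foo and do_blocking_work() are illustrative:

	struct foo {
		struct six_lock	lock;
		u64		value;
	};

	/* Read foo->value without holding the lock across a blocking call. */
	static int read_foo_value(struct foo *foo, u64 *out)
	{
		u32 seq;

		six_lock_read(&foo->lock, NULL, NULL);
		seq = six_lock_seq(&foo->lock);
		*out = foo->value;
		six_unlock_read(&foo->lock);

		do_blocking_work();

		if (!six_relock_read(&foo->lock, seq))
			return -EAGAIN;	/* a write lock intervened; *out may be stale */

		/* as if the lock had never been dropped */
		six_unlock_read(&foo->lock);
		return 0;
	}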
-#include <linux/lockdep.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-
-enum six_lock_type {
-	SIX_LOCK_read,
-	SIX_LOCK_intent,
-	SIX_LOCK_write,
-};
-
-struct six_lock {
-	atomic_t		state;
-	u32			seq;
-	unsigned		intent_lock_recurse;
-	unsigned		write_lock_recurse;
-	struct task_struct	*owner;
-	unsigned __percpu	*readers;
-	raw_spinlock_t		wait_lock;
-	struct list_head	wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	dep_map;
-#endif
-};
-
-struct six_lock_waiter {
-	struct list_head	list;
-	struct task_struct	*task;
-	enum six_lock_type	lock_want;
-	bool			lock_acquired;
-	u64			start_time;
-};
-
-typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
-
-void six_lock_exit(struct six_lock *lock);
-
-enum six_lock_init_flags {
-	SIX_LOCK_INIT_PCPU	= 1U << 0,
-};
-
-void __six_lock_init(struct six_lock *lock, const char *name,
-		     struct lock_class_key *key, enum six_lock_init_flags flags,
-		     gfp_t gfp);
-
-/**
- * six_lock_init - initialize a six lock
- * @lock: lock to initialize
- * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
- * @gfp: allocation flags for the percpu reader counts, i.e. GFP_KERNEL
- */
-#define six_lock_init(lock, flags, gfp)					\
-do {									\
-	static struct lock_class_key __key;				\
-									\
-	__six_lock_init((lock), #lock, &__key, flags, gfp);		\
-} while (0)
-
-/**
- * six_lock_seq - obtain current lock sequence number
- * @lock: six_lock to obtain sequence number for
- *
- * @lock should be held for read or intent, and not write
- *
- * By saving the lock sequence number, we can unlock @lock and then (typically
- * after some blocking operation) attempt to relock it: the relock will succeed
- * if the sequence number hasn't changed, meaning no write locks have been taken
- * and state corresponding to what @lock protects is still valid.
- */
-static inline u32 six_lock_seq(const struct six_lock *lock)
-{
-	return lock->seq;
-}
-
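A brief usage sketch for six_lock_init()/six_lock_exit(); struct my_node is illustrative, and as noted in __six_lock_init() a failed percpu allocation simply leaves the lock in non-percpu mode:

	struct my_node {
		struct six_lock	lock;
		/* ... state protected by the lock ... */
	};

	static void my_node_init(struct my_node *n)
	{
		/* percpu reader counts are an optimization; plain mode behaves the same */
		six_lock_init(&n->lock, SIX_LOCK_INIT_PCPU, GFP_KERNEL);
	}

	static void my_node_exit(struct my_node *n)
	{
		six_lock_exit(&n->lock);	/* frees the percpu read counts */
	}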
-bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_trylock_type - attempt to take a six lock without blocking
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
-{
-	return six_trylock_ip(lock, type, _THIS_IP_);
-}
-
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
-		       struct six_lock_waiter *wait,
-		       six_lock_should_sleep_fn should_sleep_fn, void *p,
-		       unsigned long ip);
-
-/**
- * six_lock_waiter - take a lock, with full waitlist interface
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait: pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- *		     to scheduling
- * @p: passed through to @should_sleep_fn
- *
- * This is a convenience wrapper around six_lock_ip_waiter(), see that function
- * for full documentation.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
-				  struct six_lock_waiter *wait,
-				  six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
-	return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-/**
- * six_lock_ip - take a six lock
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- *		     to scheduling
- * @p: passed through to @should_sleep_fn
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
-			      six_lock_should_sleep_fn should_sleep_fn, void *p,
-			      unsigned long ip)
-{
-	struct six_lock_waiter wait;
-
-	return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
-}
-
-/**
- * six_lock_type - take a six lock
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- *		     to scheduling
- * @p: passed through to @should_sleep_fn
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
-				six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
-	struct six_lock_waiter wait;
-
-	return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
-		   unsigned seq, unsigned long ip);
-
-/**
- * six_relock_type - attempt to re-take a lock that was held previously
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq: lock sequence number obtained from six_lock_seq() while lock was
- *	 held previously
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
-				   unsigned seq)
-{
-	return six_relock_ip(lock, type, seq, _THIS_IP_);
-}
-
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_unlock_type - drop a six lock
- * @lock: lock to unlock
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * When a lock is held multiple times (because six_lock_increment() was used),
- * this decrements the 'lock held' counter by one.
- *
- * For example:
- * six_lock_read(&foo->lock);				read count 1
- * six_lock_increment(&foo->lock, SIX_LOCK_read);	read count 2
- * six_unlock_type(&foo->lock, SIX_LOCK_read);		read count 1
- * six_unlock_type(&foo->lock, SIX_LOCK_read);		read count 0
- */
-static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
-	six_unlock_ip(lock, type, _THIS_IP_);
-}
-
-#define __SIX_LOCK(type)						\
-static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
-{									\
-	return six_trylock_ip(lock, SIX_LOCK_##type, ip);		\
-}									\
-									\
-static inline bool six_trylock_##type(struct six_lock *lock)		\
-{									\
-	return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_);	\
-}									\
-									\
-static inline int six_lock_ip_waiter_##type(struct six_lock *lock,	\
-			   struct six_lock_waiter *wait,		\
-			   six_lock_should_sleep_fn should_sleep_fn, void *p,\
-			   unsigned long ip)				\
-{									\
-	return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
-}									\
-									\
-static inline int six_lock_ip_##type(struct six_lock *lock,		\
-		    six_lock_should_sleep_fn should_sleep_fn, void *p,	\
-		    unsigned long ip)					\
-{									\
-	return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
-}									\
-									\
-static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
-{									\
-	return six_relock_ip(lock, SIX_LOCK_##type, seq, ip);		\
-}									\
-									\
-static inline bool six_relock_##type(struct six_lock *lock, u32 seq)	\
-{									\
-	return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_);	\
-}									\
-									\
-static inline int six_lock_##type(struct six_lock *lock,		\
-				  six_lock_should_sleep_fn fn, void *p)\
-{									\
-	return six_lock_ip_##type(lock, fn, p, _THIS_IP_);		\
-}									\
-									\
-static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip)	\
-{									\
-	six_unlock_ip(lock, SIX_LOCK_##type, ip);			\
-}									\
-									\
-static inline void six_unlock_##type(struct six_lock *lock)		\
-{									\
-	six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_);		\
-}
-
-__SIX_LOCK(read)
-__SIX_LOCK(intent)
-__SIX_LOCK(write)
-#undef __SIX_LOCK
-
-void six_lock_downgrade(struct six_lock *);
-bool six_lock_tryupgrade(struct six_lock *);
-bool six_trylock_convert(struct six_lock *, enum six_lock_type,
-			 enum six_lock_type);
-
-void six_lock_increment(struct six_lock *, enum six_lock_type);
-
-void six_lock_wakeup_all(struct six_lock *);
-
-struct six_lock_count {
-	unsigned n[3];
-};
-
-struct six_lock_count six_lock_counts(struct six_lock *);
-void six_lock_readers_add(struct six_lock *, int);
-
-#endif /* _LINUX_SIX_H */
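Taken together, the header defines the canonical update pattern: hold intent for the whole operation, take write only around each actual modification. A minimal sketch (struct bar is illustrative; NULL for should_sleep_fn means never abort):

	struct bar {
		struct six_lock	lock;
		u64		payload;
	};

	static void bar_update(struct bar *b, u64 v)
	{
		/* excludes other intent holders, but not readers */
		six_lock_intent(&b->lock, NULL, NULL);

		/* excludes readers only for the duration of the store */
		six_lock_write(&b->lock, NULL, NULL);
		b->payload = v;
		six_unlock_write(&b->lock);

		six_unlock_intent(&b->lock);
	}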
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
deleted file mode 100644
index 4c43d2a2c1f5..000000000000
--- a/fs/bcachefs/snapshot.c
+++ /dev/null
@@ -1,2043 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "enumerated_ref.h"
-#include "errcode.h"
-#include "error.h"
-#include "fs.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-
-#include <linux/random.h>
-
-/*
- * Snapshot trees:
- *
- * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
- * exist to provide a stable identifier for the whole lifetime of a snapshot
- * tree.
- */
-
-void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
-				struct bkey_s_c k)
-{
-	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
-
-	prt_printf(out, "subvol %u root snapshot %u",
-		   le32_to_cpu(t.v->master_subvol),
-		   le32_to_cpu(t.v->root_snapshot));
-}
-
-int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
-				struct bkey_validate_context from)
-{
-	int ret = 0;
-
-	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
-			 bkey_lt(k.k->p, POS(0, 1)),
-			 c, snapshot_tree_pos_bad,
-			 "bad pos");
-fsck_err:
-	return ret;
-}
-
-int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
-			      struct bch_snapshot_tree *s)
-{
-	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
-					  BTREE_ITER_with_updates, snapshot_tree, s);
-
-	if (bch2_err_matches(ret, ENOENT))
-		ret = bch_err_throw(trans->c, ENOENT_snapshot_tree);
-	return ret;
-}
-
-struct bkey_i_snapshot_tree *
-__bch2_snapshot_tree_create(struct btree_trans *trans)
-{
-	struct btree_iter iter;
-	int ret = bch2_bkey_get_empty_slot(trans, &iter,
-					   BTREE_ID_snapshot_trees, POS(0, U32_MAX));
-	struct bkey_i_snapshot_tree *s_t;
-
-	if (ret == -BCH_ERR_ENOSPC_btree_slot)
-		ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree);
-	if (ret)
-		return ERR_PTR(ret);
-
-	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
-	ret = PTR_ERR_OR_ZERO(s_t);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret ? ERR_PTR(ret) : s_t;
-}
-
-static int bch2_snapshot_tree_create(struct btree_trans *trans,
-				     u32 root_id, u32 subvol_id, u32 *tree_id)
-{
-	struct bkey_i_snapshot_tree *n_tree =
-		__bch2_snapshot_tree_create(trans);
-
-	if (IS_ERR(n_tree))
-		return PTR_ERR(n_tree);
-
-	n_tree->v.master_subvol = cpu_to_le32(subvol_id);
-	n_tree->v.root_snapshot = cpu_to_le32(root_id);
-	*tree_id = n_tree->k.p.offset;
-	return 0;
-}
-
-/* Snapshot nodes: */
-
-static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
-{
-	while (id && id < ancestor) {
-		const struct snapshot_t *s = __snapshot_t(t, id);
-		id = s ? s->parent : 0;
-	}
-	return id == ancestor;
-}
-
-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
-{
-	guard(rcu)();
-	return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
-}
-
-static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
-{
-	const struct snapshot_t *s = __snapshot_t(t, id);
-	if (!s)
-		return 0;
-
-	if (s->skip[2] <= ancestor)
-		return s->skip[2];
-	if (s->skip[1] <= ancestor)
-		return s->skip[1];
-	if (s->skip[0] <= ancestor)
-		return s->skip[0];
-	return s->parent;
-}
-
-static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
-{
-	const struct snapshot_t *s = __snapshot_t(t, id);
-	if (!s)
-		return false;
-
-	return test_bit(ancestor - id - 1, s->is_ancestor);
-}
-
-bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-	u32 orig_id = id;
-#endif
-
-	guard(rcu)();
-	struct snapshot_table *t = rcu_dereference(c->snapshots);
-
-	if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots))
-		return __bch2_snapshot_is_ancestor_early(t, id, ancestor);
-
-	if (likely(ancestor >= IS_ANCESTOR_BITMAP))
-		while (id && id < ancestor - IS_ANCESTOR_BITMAP)
-			id = get_ancestor_below(t, id, ancestor);
-
-	bool ret = id && id < ancestor
-		?
test_ancestor_bitmap(t, id, ancestor) - : id == ancestor; - - EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor)); - return ret; -} - -static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) -{ - size_t idx = U32_MAX - id; - struct snapshot_table *new, *old; - - size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); - size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); - - if (unlikely(new_bytes > INT_MAX)) - return NULL; - - new = kvzalloc(new_bytes, GFP_KERNEL); - if (!new) - return NULL; - - new->nr = new_size; - - old = rcu_dereference_protected(c->snapshots, true); - if (old) - memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr); - - rcu_assign_pointer(c->snapshots, new); - kvfree_rcu(old, rcu); - - return &rcu_dereference_protected(c->snapshots, - lockdep_is_held(&c->snapshot_table_lock))->s[idx]; -} - -static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) -{ - size_t idx = U32_MAX - id; - struct snapshot_table *table = - rcu_dereference_protected(c->snapshots, - lockdep_is_held(&c->snapshot_table_lock)); - - lockdep_assert_held(&c->snapshot_table_lock); - - if (likely(table && idx < table->nr)) - return &table->s[idx]; - - return __snapshot_t_mut(c, id); -} - -void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - - if (BCH_SNAPSHOT_SUBVOL(s.v)) - prt_str(out, "subvol "); - if (BCH_SNAPSHOT_WILL_DELETE(s.v)) - prt_str(out, "will_delete "); - if (BCH_SNAPSHOT_DELETED(s.v)) - prt_str(out, "deleted "); - - prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u", - le32_to_cpu(s.v->parent), - le32_to_cpu(s.v->children[0]), - le32_to_cpu(s.v->children[1]), - le32_to_cpu(s.v->subvol), - le32_to_cpu(s.v->tree)); - - if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) - prt_printf(out, " depth %u skiplist %u %u %u", - le32_to_cpu(s.v->depth), - le32_to_cpu(s.v->skip[0]), - le32_to_cpu(s.v->skip[1]), - le32_to_cpu(s.v->skip[2])); -} - -int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_snapshot s; - u32 i, id; - int ret = 0; - - bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), - c, snapshot_pos_bad, - "bad pos"); - - s = bkey_s_c_to_snapshot(k); - - id = le32_to_cpu(s.v->parent); - bkey_fsck_err_on(id && id <= k.k->p.offset, - c, snapshot_parent_bad, - "bad parent node (%u <= %llu)", - id, k.k->p.offset); - - bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), - c, snapshot_children_not_normalized, - "children not normalized"); - - bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], - c, snapshot_child_duplicate, - "duplicate child nodes"); - - for (i = 0; i < 2; i++) { - id = le32_to_cpu(s.v->children[i]); - - bkey_fsck_err_on(id >= k.k->p.offset, - c, snapshot_child_bad, - "bad child node (%u >= %llu)", - id, k.k->p.offset); - } - - if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { - bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || - le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), - c, snapshot_skiplist_not_normalized, - "skiplist not normalized"); - - for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { - id = le32_to_cpu(s.v->skip[i]); - - bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), - c, snapshot_skiplist_bad, - "bad skiplist node %u", id); - } - } -fsck_err: - return ret; -} - 
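The fast path in __bch2_snapshot_is_ancestor() above combines two accelerations: the skip[] pointers walk @id up to within IS_ANCESTOR_BITMAP of @ancestor in a few hops, and the per-node is_ancestor bitmap answers the final stretch with one bit test. A toy illustration of the bitmap invariant (snapshot IDs are allocated downward, so ancestors always have higher IDs; the numbers are made up):

	/*
	 * Toy tree, IDs allocated downward:    100
	 *                                     /   \
	 *                                   90     80
	 *                                  /
	 *                                70
	 *
	 * Node 70 has parent 90, and bits (90 - 70 - 1) = 19 and
	 * (100 - 70 - 1) = 29 set in its is_ancestor bitmap, so any ancestor
	 * query within IS_ANCESTOR_BITMAP of 70 is a single test_bit(),
	 * as in test_ancestor_bitmap() above.
	 */
	static bool toy_is_ancestor(u32 id, u32 ancestor, const unsigned long *bitmap)
	{
		return ancestor > id && test_bit(ancestor - id - 1, bitmap);
	}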
-static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id) -{ - mutex_lock(&c->snapshot_table_lock); - int ret = snapshot_t_mut(c, id) - ? 0 - : bch_err_throw(c, ENOMEM_mark_snapshot); - mutex_unlock(&c->snapshot_table_lock); - return ret; -} - -static int __bch2_mark_snapshot(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct snapshot_t *t; - u32 id = new.k->p.offset; - int ret = 0; - - mutex_lock(&c->snapshot_table_lock); - - t = snapshot_t_mut(c, id); - if (!t) { - ret = bch_err_throw(c, ENOMEM_mark_snapshot); - goto err; - } - - if (new.k->type == KEY_TYPE_snapshot) { - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - - t->state = !BCH_SNAPSHOT_DELETED(s.v) - ? SNAPSHOT_ID_live - : SNAPSHOT_ID_deleted; - t->parent = le32_to_cpu(s.v->parent); - t->children[0] = le32_to_cpu(s.v->children[0]); - t->children[1] = le32_to_cpu(s.v->children[1]); - t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; - t->tree = le32_to_cpu(s.v->tree); - - if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { - t->depth = le32_to_cpu(s.v->depth); - t->skip[0] = le32_to_cpu(s.v->skip[0]); - t->skip[1] = le32_to_cpu(s.v->skip[1]); - t->skip[2] = le32_to_cpu(s.v->skip[2]); - } else { - t->depth = 0; - t->skip[0] = 0; - t->skip[1] = 0; - t->skip[2] = 0; - } - - u32 parent = id; - - while ((parent = bch2_snapshot_parent_early(c, parent)) && - parent - id - 1 < IS_ANCESTOR_BITMAP) - __set_bit(parent - id - 1, t->is_ancestor); - - if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { - set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots) - bch2_delete_dead_snapshots_async(c); - } - } else { - memset(t, 0, sizeof(*t)); - } -err: - mutex_unlock(&c->snapshot_table_lock); - return ret; -} - -int bch2_mark_snapshot(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags); -} - -int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, - struct bch_snapshot *s) -{ - return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_with_updates, snapshot, s); -} - -/* fsck: */ - -static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) -{ - return snapshot_t(c, id)->children[child]; -} - -static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) -{ - return bch2_snapshot_child(c, id, 0); -} - -static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) -{ - return bch2_snapshot_child(c, id, 1); -} - -static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) -{ - u32 n, parent; - - n = bch2_snapshot_left_child(c, id); - if (n) - return n; - - while ((parent = bch2_snapshot_parent(c, id))) { - n = bch2_snapshot_right_child(c, parent); - if (n && n != id) - return n; - id = parent; - } - - return 0; -} - -u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root, - snapshot_id_list *skip) -{ - guard(rcu)(); - u32 id, subvol = 0, s; -retry: - id = snapshot_root; - while (id && bch2_snapshot_exists(c, id)) { - if (!(skip && snapshot_list_has_id(skip, id))) { - s = snapshot_t(c, id)->subvol; - - if (s && (!subvol || s < subvol)) - subvol = s; - } - id = bch2_snapshot_tree_next(c, id); - if (id == snapshot_root) - break; - } - - if (!subvol && skip) { - skip = 
NULL; - goto retry; - } - - return subvol; -} - -static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, - u32 snapshot_root, u32 *subvol_id) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - bool found = false; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, - 0, k, ret) { - if (k.k->type != KEY_TYPE_subvolume) - continue; - - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) - continue; - if (!BCH_SUBVOLUME_SNAP(s.v)) { - *subvol_id = s.k->p.offset; - found = true; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - if (!ret && !found) { - struct bkey_i_subvolume *u; - - *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL); - - u = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, *subvol_id), - 0, subvolume); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - SET_BCH_SUBVOLUME_SNAP(&u->v, false); - } - - return ret; -} - -static int check_snapshot_tree(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot_tree st; - struct bch_snapshot s; - struct bch_subvolume subvol; - struct printbuf buf = PRINTBUF; - struct btree_iter snapshot_iter = {}; - u32 root_id; - int ret; - - if (k.k->type != KEY_TYPE_snapshot_tree) - return 0; - - st = bkey_s_c_to_snapshot_tree(k); - root_id = le32_to_cpu(st.v->root_snapshot); - - struct bkey_s_c_snapshot snapshot_k = - bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, - POS(0, root_id), 0, snapshot); - ret = bkey_err(snapshot_k); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (!ret) - bkey_val_copy(&s, snapshot_k); - - if (fsck_err_on(ret || - root_id != bch2_snapshot_root(c, root_id) || - st.k->p.offset != le32_to_cpu(s.tree), - trans, snapshot_tree_to_missing_snapshot, - "snapshot tree points to missing/incorrect snapshot:\n%s", - (bch2_bkey_val_to_text(&buf, c, st.s_c), - prt_newline(&buf), - ret - ? 
prt_printf(&buf, "(%s)", bch2_err_str(ret)) - : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c), - buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto err; - } - - if (!st.v->master_subvol) - goto out; - - ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, - trans, snapshot_tree_to_missing_subvol, - "snapshot tree points to missing subvolume:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(!bch2_snapshot_is_ancestor(c, - le32_to_cpu(subvol.snapshot), - root_id), - trans, snapshot_tree_to_wrong_subvol, - "snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), - trans, snapshot_tree_to_snapshot_subvol, - "snapshot tree points to snapshot subvolume:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { - struct bkey_i_snapshot_tree *u; - u32 subvol_id; - - ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); - bch_err_fn(c, ret); - - if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */ - ret = 0; - goto err; - } - - if (ret) - goto err; - - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.master_subvol = cpu_to_le32(subvol_id); - st = snapshot_tree_i_to_s_c(u); - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &snapshot_iter); - printbuf_exit(&buf); - return ret; -} - -/* - * For each snapshot_tree, make sure it points to the root of a snapshot tree - * and that snapshot entry points back to it, or delete it. - * - * And, make sure it points to a subvolume within that snapshot tree, or correct - * it to point to the oldest subvolume within that snapshot tree. - */ -int bch2_check_snapshot_trees(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_snapshot_trees, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_snapshot_tree(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -/* - * Look up snapshot tree for @tree_id and find root, - * make sure @snap_id is a descendent: - */ -static int snapshot_tree_ptr_good(struct btree_trans *trans, - u32 snap_id, u32 tree_id) -{ - struct bch_snapshot_tree s_t; - int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); - - if (bch2_err_matches(ret, ENOENT)) - return 0; - if (ret) - return ret; - - return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); -} - -u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) -{ - if (!id) - return 0; - - guard(rcu)(); - const struct snapshot_t *s = snapshot_t(c, id); - return s->parent - ? 
bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)) - : id; -} - -static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) -{ - unsigned i; - - for (i = 0; i < 3; i++) - if (!s.parent) { - if (s.skip[i]) - return false; - } else { - if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i]))) - return false; - } - - return true; -} - -/* - * snapshot_tree pointer was incorrect: look up root snapshot node, make sure - * its snapshot_tree pointer is correct (allocate new one if necessary), then - * update this node's pointer to root node's pointer: - */ -static int snapshot_tree_ptr_repair(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_snapshot *s) -{ - struct bch_fs *c = trans->c; - struct btree_iter root_iter; - struct bch_snapshot_tree s_t; - struct bkey_s_c_snapshot root; - struct bkey_i_snapshot *u; - u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; - int ret; - - root = bch2_bkey_get_iter_typed(trans, &root_iter, - BTREE_ID_snapshots, POS(0, root_id), - BTREE_ITER_with_updates, snapshot); - ret = bkey_err(root); - if (ret) - goto err; - - tree_id = le32_to_cpu(root.v->tree); - - ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { - u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u) ?: - bch2_snapshot_tree_create(trans, root_id, - bch2_snapshot_oldest_subvol(c, root_id, NULL), - &tree_id); - if (ret) - goto err; - - u->v.tree = cpu_to_le32(tree_id); - if (k.k->p.offset == root_id) - *s = u->v; - } - - if (k.k->p.offset != root_id) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.tree = cpu_to_le32(tree_id); - *s = u->v; - } -err: - bch2_trans_iter_exit(trans, &root_iter); - return ret; -} - -static int check_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bch_snapshot s; - struct bch_subvolume subvol; - struct bch_snapshot v; - struct bkey_i_snapshot *u; - u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); - u32 real_depth; - struct printbuf buf = PRINTBUF; - u32 i, id; - int ret = 0; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - memset(&s, 0, sizeof(s)); - memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); - - if (BCH_SNAPSHOT_DELETED(&s)) - return 0; - - id = le32_to_cpu(s.parent); - if (id) { - ret = bch2_snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot with nonexistent parent:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (ret) - goto err; - - if (le32_to_cpu(v.children[0]) != k.k->p.offset && - le32_to_cpu(v.children[1]) != k.k->p.offset) { - bch_err(c, "snapshot parent %u missing pointer to child %llu", - id, k.k->p.offset); - ret = -EINVAL; - goto err; - } - } - - for (i = 0; i < 2 && s.children[i]; i++) { - id = le32_to_cpu(s.children[i]); - - ret = bch2_snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot node %llu has nonexistent child %u", - k.k->p.offset, id); - if (ret) - goto err; - - if (le32_to_cpu(v.parent) != k.k->p.offset) { - bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", - id, le32_to_cpu(v.parent), k.k->p.offset); - ret = -EINVAL; - goto err; - } - } - - bool should_have_subvol = 
BCH_SNAPSHOT_SUBVOL(&s) && - !BCH_SNAPSHOT_WILL_DELETE(&s); - - if (should_have_subvol) { - id = le32_to_cpu(s.subvol); - ret = bch2_subvolume_get(trans, id, false, &subvol); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot points to nonexistent subvolume:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (ret) - goto err; - - if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { - bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", - k.k->p.offset); - ret = -EINVAL; - goto err; - } - } else { - if (fsck_err_on(s.subvol, - trans, snapshot_should_not_have_subvol, - "snapshot should not point to subvol:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.subvol = 0; - s = u->v; - } - } - - ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); - if (ret < 0) - goto err; - - if (fsck_err_on(!ret, - trans, snapshot_to_bad_snapshot_tree, - "snapshot points to missing/incorrect tree:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = snapshot_tree_ptr_repair(trans, iter, k, &s); - if (ret) - goto err; - } - ret = 0; - - real_depth = bch2_snapshot_depth(c, parent_id); - - if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, - trans, snapshot_bad_depth, - "snapshot with incorrect depth field, should be %u:\n%s", - real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.depth = cpu_to_le32(real_depth); - s = u->v; - } - - ret = snapshot_skiplist_good(trans, k.k->p.offset, s); - if (ret < 0) - goto err; - - if (fsck_err_on(!ret, - trans, snapshot_bad_skiplist, - "snapshot with bad skiplist field:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) - u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id)); - - bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); - s = u->v; - } - ret = 0; -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_check_snapshots(struct bch_fs *c) -{ - /* - * We iterate backwards as checking/fixing the depth field requires that - * the parent's depth already be correct: - */ - int ret = bch2_trans_run(c, - for_each_btree_key_reverse_commit(trans, iter, - BTREE_ID_snapshots, POS_MAX, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_snapshot(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static int check_snapshot_exists(struct btree_trans *trans, u32 id) -{ - struct bch_fs *c = trans->c; - - /* Do we need to reconstruct the snapshot_tree entry as well? 
*/ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - u32 tree_id = 0; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, - 0, k, ret) { - if (k.k->type == KEY_TYPE_snapshot_tree && - le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { - tree_id = k.k->p.offset; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - return ret; - - if (!tree_id) { - ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); - if (ret) - return ret; - } - - struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); - ret = PTR_ERR_OR_ZERO(snapshot); - if (ret) - return ret; - - bkey_snapshot_init(&snapshot->k_i); - snapshot->k.p = POS(0, id); - snapshot->v.tree = cpu_to_le32(tree_id); - snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); - - for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, - 0, k, ret) { - if (k.k->type == KEY_TYPE_subvolume && - le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { - snapshot->v.subvol = cpu_to_le32(k.k->p.offset); - SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return bch2_snapshot_table_make_room(c, id) ?: - bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0); -} - -/* Figure out which snapshot nodes belong in the same tree: */ -struct snapshot_tree_reconstruct { - enum btree_id btree; - struct bpos cur_pos; - snapshot_id_list cur_ids; - DARRAY(snapshot_id_list) trees; -}; - -static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r) -{ - darray_for_each(r->trees, i) - darray_exit(i); - darray_exit(&r->trees); - darray_exit(&r->cur_ids); -} - -static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos) -{ - return r->btree == BTREE_ID_inodes - ? 
r->cur_pos.offset == pos.offset - : r->cur_pos.inode == pos.inode; -} - -static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) -{ - return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL; -} - -static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) -{ - bool first = true; - darray_for_each(*s, i) { - if (!first) - prt_char(out, ' '); - first = false; - prt_printf(out, "%u", *i); - } -} - -static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r) -{ - if (r->cur_ids.nr) { - darray_for_each(r->trees, i) - if (snapshot_id_lists_have_common(i, &r->cur_ids)) { - int ret = snapshot_list_merge(c, i, &r->cur_ids); - if (ret) - return ret; - goto out; - } - darray_push(&r->trees, r->cur_ids); - darray_init(&r->cur_ids); - } -out: - r->cur_ids.nr = 0; - return 0; -} - -static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos) -{ - if (!same_snapshot(r, pos)) - snapshot_tree_reconstruct_next(c, r); - r->cur_pos = pos; - return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot); -} - -int bch2_reconstruct_snapshots(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; - struct snapshot_tree_reconstruct r = {}; - int ret = 0; - - for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { - if (btree_type_has_snapshots(btree)) { - r.btree = btree; - - ret = for_each_btree_key(trans, iter, btree, POS_MIN, - BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({ - get_snapshot_trees(c, &r, k.k->p); - })); - if (ret) - goto err; - - snapshot_tree_reconstruct_next(c, &r); - } - } - - darray_for_each(r.trees, t) { - printbuf_reset(&buf); - snapshot_id_list_to_text(&buf, t); - - darray_for_each(*t, id) { - if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty, - trans, snapshot_node_missing, - "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { - if (t->nr > 1) { - bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto err; - } - - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_snapshot_exists(trans, *id)); - if (ret) - goto err; - } - } - } -fsck_err: -err: - bch2_trans_put(trans); - snapshot_tree_reconstruct_exit(&r); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -int __bch2_check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot); - - /* Snapshot was definitively deleted, this error is marked autofix */ - if (fsck_err_on(state == SNAPSHOT_ID_deleted, - trans, bkey_in_deleted_snapshot, - "key in deleted snapshot %s, delete?", - (bch2_btree_id_to_text(&buf, iter->btree_id), - prt_char(&buf, ' '), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; - - if (state == SNAPSHOT_ID_empty) { - /* - * Snapshot missing: we should have caught this with btree_lost_data and - * kicked off reconstruct_snapshots, so if we end up here we have no - * idea what happened. 
- * - * Do not delete unless we know that subvolumes and snapshots - * are consistent: - * - * XXX: - * - * We could be smarter here, and instead of using the generic - * recovery pass ratelimiting, track if there have been any - * changes to the snapshots or inodes btrees since those passes - * last ran. - */ - ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret; - ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret; - - if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)) - ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; - - unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0); - - if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot, - "key in missing snapshot %s, delete?", - (bch2_btree_id_to_text(&buf, iter->btree_id), - prt_char(&buf, ' '), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; - } - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int __bch2_get_snapshot_overwrites(struct btree_trans *trans, - enum btree_id btree, struct bpos pos, - snapshot_id_list *s) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos), - BTREE_ITER_all_snapshots, k, ret) { - if (!bkey_eq(k.k->p, pos)) - break; - - if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) || - snapshot_list_has_ancestor(c, s, k.k->p.snapshot)) - continue; - - ret = snapshot_list_add(c, s, k.k->p.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - if (ret) - darray_exit(s); - - return ret; -} - -/* - * Mark a snapshot as deleted, for future cleanup: - */ -int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) -{ - struct btree_iter iter; - struct bkey_i_snapshot *s = - bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, id), - 0, snapshot); - int ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), - trans->c, "missing snapshot %u", id); - return ret; - } - - /* already deleted? 
*/ - if (BCH_SNAPSHOT_WILL_DELETE(&s->v)) - goto err; - - SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true); - SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); - s->v.subvol = 0; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) -{ - if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1])) - swap(s->children[0], s->children[1]); -} - -static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter, p_iter = {}; - struct btree_iter c_iter = {}; - struct btree_iter tree_iter = {}; - u32 parent_id, child_id; - unsigned i; - int ret = 0; - - struct bkey_i_snapshot *s = - bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_intent, snapshot); - ret = PTR_ERR_OR_ZERO(s); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", id); - - if (ret) - goto err; - - BUG_ON(BCH_SNAPSHOT_DELETED(&s->v)); - BUG_ON(s->v.children[1]); - - parent_id = le32_to_cpu(s->v.parent); - child_id = le32_to_cpu(s->v.children[0]); - - if (parent_id) { - struct bkey_i_snapshot *parent; - - parent = bch2_bkey_get_mut_typed(trans, &p_iter, - BTREE_ID_snapshots, POS(0, parent_id), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(parent); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", parent_id); - if (unlikely(ret)) - goto err; - - /* find entry in parent->children for node being deleted */ - for (i = 0; i < 2; i++) - if (le32_to_cpu(parent->v.children[i]) == id) - break; - - if (bch2_fs_inconsistent_on(i == 2, c, - "snapshot %u missing child pointer to %u", - parent_id, id)) - goto err; - - parent->v.children[i] = cpu_to_le32(child_id); - - normalize_snapshot_child_pointers(&parent->v); - } - - if (child_id) { - struct bkey_i_snapshot *child; - - child = bch2_bkey_get_mut_typed(trans, &c_iter, - BTREE_ID_snapshots, POS(0, child_id), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(child); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", child_id); - if (unlikely(ret)) - goto err; - - child->v.parent = cpu_to_le32(parent_id); - - if (!child->v.parent) { - child->v.skip[0] = 0; - child->v.skip[1] = 0; - child->v.skip[2] = 0; - } - } - - if (!parent_id) { - /* - * We're deleting the root of a snapshot tree: update the - * snapshot_tree entry to point to the new root, or delete it if - * this is the last snapshot ID in this tree: - */ - struct bkey_i_snapshot_tree *s_t; - - BUG_ON(s->v.children[1]); - - s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, - BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)), - 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(s_t); - if (ret) - goto err; - - if (s->v.children[0]) { - s_t->v.root_snapshot = s->v.children[0]; - } else { - s_t->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&s_t->k, 0); - } - } - - if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) { - SET_BCH_SNAPSHOT_DELETED(&s->v, true); - s->v.parent = 0; - s->v.children[0] = 0; - s->v.children[1] = 0; - s->v.subvol = 0; - s->v.tree = 0; - s->v.depth = 0; - s->v.skip[0] = 0; - s->v.skip[1] = 0; - s->v.skip[2] = 0; - } else { - s->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&s->k, 0); - } -err: - bch2_trans_iter_exit(trans, &tree_iter); - bch2_trans_iter_exit(trans, &p_iter); - bch2_trans_iter_exit(trans, &c_iter); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int create_snapids(struct btree_trans *trans, u32 parent, 
u32 tree, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i_snapshot *n; - struct bkey_s_c k; - unsigned i, j; - u32 depth = bch2_snapshot_depth(c, parent); - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS_MIN, BTREE_ITER_intent); - k = bch2_btree_iter_peek(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - for (i = 0; i < nr_snapids; i++) { - k = bch2_btree_iter_prev_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k || !k.k->p.offset) { - ret = bch_err_throw(c, ENOSPC_snapshot_create); - goto err; - } - - n = bch2_bkey_alloc(trans, &iter, 0, snapshot); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - n->v.flags = 0; - n->v.parent = cpu_to_le32(parent); - n->v.subvol = cpu_to_le32(snapshot_subvols[i]); - n->v.tree = cpu_to_le32(tree); - n->v.depth = cpu_to_le32(depth); - n->v.btime.lo = cpu_to_le64(bch2_current_time(c)); - n->v.btime.hi = 0; - - for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) - n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); - - bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); - SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - - ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); - if (ret) - goto err; - - new_snapids[i] = iter.pos.offset; - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * Create new snapshot IDs as children of an existing snapshot ID: - */ -static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct btree_iter iter; - struct bkey_i_snapshot *n_parent; - int ret = 0; - - n_parent = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, parent), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(n_parent); - if (unlikely(ret)) { - if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot %u not found", parent); - return ret; - } - - if (n_parent->v.children[0] || n_parent->v.children[1]) { - bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); - ret = -EINVAL; - goto err; - } - - ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), - new_snapids, snapshot_subvols, nr_snapids); - if (ret) - goto err; - - n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); - n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); - n_parent->v.subvol = 0; - SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * Create a snapshot node that is the root of a new tree: - */ -static int bch2_snapshot_node_create_tree(struct btree_trans *trans, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct bkey_i_snapshot_tree *n_tree; - int ret; - - n_tree = __bch2_snapshot_tree_create(trans); - ret = PTR_ERR_OR_ZERO(n_tree) ?: - create_snapids(trans, 0, n_tree->k.p.offset, - new_snapids, snapshot_subvols, nr_snapids); - if (ret) - return ret; - - n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); - n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); - return 0; -} - -int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - BUG_ON((parent == 0) != (nr_snapids == 1)); - BUG_ON((parent != 0) != (nr_snapids == 2)); - - return parent - ? 
bch2_snapshot_node_create_children(trans, parent, - new_snapids, snapshot_subvols, nr_snapids) - : bch2_snapshot_node_create_tree(trans, - new_snapids, snapshot_subvols, nr_snapids); - -} - -/* - * If we have an unlinked inode in an internal snapshot node, and the inode - * really has been deleted in all child snapshots, how does this get cleaned up? - * - * first there is the problem of how keys that have been overwritten in all - * child snapshots get deleted (unimplemented?), but inodes may perhaps be - * special? - * - * also: unlinked inode in internal snapshot appears to not be getting deleted - * correctly if inode doesn't exist in leaf snapshots - * - * solution: - * - * for a key in an interior snapshot node that needs work to be done that - * requires it to be mutated: iterate over all descendent leaf nodes and copy - * that key to snapshot leaf nodes, where we can mutate it - */ - -static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) -{ - struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id); - return i ? i->live_child : 0; -} - -static unsigned __live_child(struct snapshot_table *t, u32 id, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) -{ - struct snapshot_t *s = __snapshot_t(t, id); - if (!s) - return 0; - - for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) - if (s->children[i] && - !snapshot_list_has_id(delete_leaves, s->children[i]) && - !interior_delete_has_id(delete_interior, s->children[i])) - return s->children[i]; - - for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) { - u32 live_child = s->children[i] - ? __live_child(t, s->children[i], delete_leaves, delete_interior) - : 0; - if (live_child) - return live_child; - } - - return 0; -} - -static unsigned live_child(struct bch_fs *c, u32 id) -{ - struct snapshot_delete *d = &c->snapshot_delete; - - guard(rcu)(); - return __live_child(rcu_dereference(c->snapshots), id, - &d->delete_leaves, &d->delete_interior); -} - -static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id) -{ - return snapshot_list_has_id(&d->delete_leaves, id) || - interior_delete_has_id(&d->delete_interior, id) != 0; -} - -static int delete_dead_snapshots_process_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct snapshot_delete *d = &trans->c->snapshot_delete; - - if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot)) - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - - u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot); - if (live_child) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - new->k.p.snapshot = live_child; - - struct btree_iter dst_iter; - struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, - iter->btree_id, new->k.p, - BTREE_ITER_all_snapshots| - BTREE_ITER_intent); - ret = bkey_err(dst_k); - if (ret) - return ret; - - ret = (bkey_deleted(dst_k.k) - ? bch2_trans_update(trans, &dst_iter, new, - BTREE_UPDATE_internal_snapshot_node) - : 0) ?: - bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &dst_iter); - return ret; - } - - return 0; -} - -static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum) -{ - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - - u64 inum = iter->btree_id != BTREE_ID_inodes - ? 
iter->pos.inode - : iter->pos.offset; - - if (*prev_inum == inum) - return false; - - *prev_inum = inum; - - bool ret = !snapshot_list_has_id(&d->deleting_from_trees, - bch2_snapshot_tree(c, iter->pos.snapshot)); - if (unlikely(ret)) { - struct bpos pos = iter->pos; - pos.snapshot = 0; - if (iter->btree_id != BTREE_ID_inodes) - pos.offset = U64_MAX; - bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos)); - } - - return ret; -} - -static int delete_dead_snapshot_keys_v1(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - - for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { - struct disk_reservation res = { 0 }; - u64 prev_inum = 0; - - d->pos.pos = POS_MIN; - - if (!btree_type_has_snapshots(d->pos.btree)) - continue; - - int ret = for_each_btree_key_commit(trans, iter, - d->pos.btree, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - d->pos.pos = iter.pos; - - if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) - continue; - - delete_dead_snapshots_process_key(trans, &iter, k); - })); - - bch2_disk_reservation_put(c, &res); - - if (ret) - return ret; - } - - return 0; -} - -static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree, - struct bpos start, struct bpos end) -{ - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - struct disk_reservation res = { 0 }; - - d->pos.btree = btree; - d->pos.pos = POS_MIN; - - int ret = for_each_btree_key_max_commit(trans, iter, - btree, start, end, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - d->pos.pos = iter.pos; - delete_dead_snapshots_process_key(trans, &iter, k); - })); - - bch2_disk_reservation_put(c, &res); - return ret; -} - -static int delete_dead_snapshot_keys_v2(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - struct disk_reservation res = { 0 }; - u64 prev_inum = 0; - int ret = 0; - - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); - - while (1) { - struct bkey_s_c k; - ret = lockrestart_do(trans, - bkey_err(k = bch2_btree_iter_peek(trans, &iter))); - if (ret) - break; - - if (!k.k) - break; - - d->pos.btree = iter.btree_id; - d->pos.pos = iter.pos; - - if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) - continue; - - if (snapshot_id_dying(d, k.k->p.snapshot)) { - struct bpos start = POS(k.k->p.offset, 0); - struct bpos end = POS(k.k->p.offset, U64_MAX); - - ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?: - delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?: - delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end); - if (ret) - break; - - bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1)); - } else { - bch2_btree_iter_advance(trans, &iter); - } - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - goto err; - - prev_inum = 0; - ret = for_each_btree_key_commit(trans, iter, - BTREE_ID_inodes, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - d->pos.btree = iter.btree_id; - d->pos.pos = iter.pos; - - if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) - continue; - - delete_dead_snapshots_process_key(trans, &iter, k); - })); -err: - bch2_disk_reservation_put(c, &res); - 
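/*
 * Summary of the two walkers above: _v1 scans every snapshot-aware btree end
 * to end, skipping unrelated snapshot trees as it goes; _v2 walks
 * BTREE_ID_inodes once and, for each inode number in a dying snapshot,
 * deletes just that inode's key ranges:
 *
 *	delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?:
 *	delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?:
 *	delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end);
 *
 * so the work done scales with the number of affected inodes rather than with
 * total btree size.
 */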
return ret; -} - -/* - * For a given snapshot, if it doesn't have a subvolume that points to it, and - * it doesn't have child snapshot nodes - it's now redundant and we can mark it - * as deleted. - */ -static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k) -{ - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - unsigned live_children = 0; - int ret = 0; - - if (BCH_SNAPSHOT_SUBVOL(s.v)) - return 0; - - if (BCH_SNAPSHOT_DELETED(s.v)) - return 0; - - mutex_lock(&d->progress_lock); - for (unsigned i = 0; i < 2; i++) { - u32 child = le32_to_cpu(s.v->children[i]); - - live_children += child && - !snapshot_list_has_id(&d->delete_leaves, child); - } - - u32 tree = bch2_snapshot_tree(c, s.k->p.offset); - - if (live_children == 0) { - ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: - snapshot_list_add(c, &d->delete_leaves, s.k->p.offset); - } else if (live_children == 1) { - struct snapshot_interior_delete n = { - .id = s.k->p.offset, - .live_child = live_child(c, s.k->p.offset), - }; - - if (!n.live_child) { - bch_err(c, "error finding live child of snapshot %u", n.id); - ret = -EINVAL; - } else { - ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: - darray_push(&d->delete_interior, n); - } - } - mutex_unlock(&d->progress_lock); - - return ret; -} - -static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, - interior_delete_list *skip) -{ - guard(rcu)(); - while (interior_delete_has_id(skip, id)) - id = __bch2_snapshot_parent(c, id); - - while (n--) { - do { - id = __bch2_snapshot_parent(c, id); - } while (interior_delete_has_id(skip, id)); - } - - return id; -} - -static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, - struct btree_iter *iter, struct bkey_s_c k, - interior_delete_list *deleted) -{ - struct bch_fs *c = trans->c; - u32 nr_deleted_ancestors = 0; - struct bkey_i_snapshot *s; - int ret; - - if (!bch2_snapshot_exists(c, k.k->p.offset)) - return 0; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - if (interior_delete_has_id(deleted, k.k->p.offset)) - return 0; - - s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - return ret; - - darray_for_each(*deleted, i) - nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id); - - if (!nr_deleted_ancestors) - return 0; - - le32_add_cpu(&s->v.depth, -nr_deleted_ancestors); - - if (!s->v.depth) { - s->v.skip[0] = 0; - s->v.skip[1] = 0; - s->v.skip[2] = 0; - } else { - u32 depth = le32_to_cpu(s->v.depth); - u32 parent = bch2_snapshot_parent(c, s->k.p.offset); - - for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { - u32 id = le32_to_cpu(s->v.skip[j]); - - if (interior_delete_has_id(deleted, id)) { - id = bch2_snapshot_nth_parent_skip(c, - parent, - depth > 1 - ? 
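/*
 * Skiplist note: every snapshot node carries three ancestor IDs in skip[]
 * (kept sorted) so ancestor queries can jump up the tree instead of walking
 * parent by parent. When a skip[] entry refers to a node that is itself being
 * deleted, it is replaced by a surviving ancestor at a random depth - chosen
 * just below - mirroring how bch2_snapshot_skiplist_get() picked the original
 * entries at creation time.
 */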
get_random_u32_below(depth - 1) - : 0, - deleted); - s->v.skip[j] = cpu_to_le32(id); - } - } - - bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32); - } - - return bch2_trans_update(trans, iter, &s->k_i, 0); -} - -static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d) -{ - prt_printf(out, "deleting from trees"); - darray_for_each(d->deleting_from_trees, i) - prt_printf(out, " %u", *i); - - prt_printf(out, "deleting leaves"); - darray_for_each(d->delete_leaves, i) - prt_printf(out, " %u", *i); - prt_newline(out); - - prt_printf(out, "interior"); - darray_for_each(d->delete_interior, i) - prt_printf(out, " %u->%u", i->id, i->live_child); - prt_newline(out); -} - -int __bch2_delete_dead_snapshots(struct bch_fs *c) -{ - struct snapshot_delete *d = &c->snapshot_delete; - int ret = 0; - - if (!mutex_trylock(&d->lock)) - return 0; - - if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) - goto out_unlock; - - struct btree_trans *trans = bch2_trans_get(c); - - /* - * For every snapshot node: If we have no live children and it's not - * pointed to by a subvolume, delete it: - */ - d->running = true; - d->pos = BBPOS_MIN; - - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, - check_should_delete_snapshot(trans, k)); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "walking snapshots"); - if (ret) - goto err; - - if (!d->delete_leaves.nr && !d->delete_interior.nr) - goto err; - - { - struct printbuf buf = PRINTBUF; - bch2_snapshot_delete_nodes_to_text(&buf, d); - - ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); - printbuf_exit(&buf); - if (ret) - goto err; - } - - ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2) - ? 
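/*
 * Version dispatch below: the per-inode v2 walker requires the
 * snapshot_deletion_v2 incompat metadata version; if
 * bch2_request_incompat_feature() cannot enable it (nonzero return), we fall
 * back to the full-scan v1 walker.
 */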
delete_dead_snapshot_keys_v2(trans) - : delete_dead_snapshot_keys_v1(trans); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "deleting keys from dying snapshots"); - if (ret) - goto err; - - darray_for_each(d->delete_leaves, i) { - ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "deleting snapshot %u", *i); - if (ret) - goto err; - } - - /* - * Fixing children of deleted snapshots can't be done completely - * atomically, if we crash between here and when we delete the interior - * nodes some depth fields will be off: - */ - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, - BTREE_ITER_intent, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior)); - if (ret) - goto err; - - darray_for_each(d->delete_interior, i) { - ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, i->id)); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "deleting snapshot %u", i->id); - if (ret) - goto err; - } -err: - mutex_lock(&d->progress_lock); - darray_exit(&d->deleting_from_trees); - darray_exit(&d->delete_interior); - darray_exit(&d->delete_leaves); - d->running = false; - mutex_unlock(&d->progress_lock); - bch2_trans_put(trans); - - bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots); -out_unlock: - mutex_unlock(&d->lock); - if (!bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); - return ret; -} - -int bch2_delete_dead_snapshots(struct bch_fs *c) -{ - if (!c->opts.auto_snapshot_deletion) - return 0; - - return __bch2_delete_dead_snapshots(c); -} - -void bch2_delete_dead_snapshots_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work); - - set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); - - bch2_delete_dead_snapshots(c); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); -} - -void bch2_delete_dead_snapshots_async(struct bch_fs *c) -{ - if (!c->opts.auto_snapshot_deletion) - return; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots)) - return; - - BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - - if (!queue_work(system_long_wq, &c->snapshot_delete.work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); -} - -void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct snapshot_delete *d = &c->snapshot_delete; - - if (!d->running) { - prt_str(out, "(not running)"); - return; - } - - mutex_lock(&d->progress_lock); - bch2_snapshot_delete_nodes_to_text(out, d); - - bch2_bbpos_to_text(out, d->pos); - mutex_unlock(&d->progress_lock); -} - -int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos), - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots, - k, ret) { - if (!bkey_eq(pos, k.k->p)) - break; - - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { - ret = 1; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap) -{ - /* If there's one child, it's redundant and keys will be moved to the child */ - return !!snap.v->children[0] + !!snap.v->children[1] == 1; -} - -static int 
bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) -{ - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_WILL_DELETE(snap.v) || - interior_snapshot_needs_delete(snap)) - set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); - - return 0; -} - -int bch2_snapshots_read(struct bch_fs *c) -{ - /* - * Initializing the is_ancestor bitmaps requires ancestors to already be - * initialized - so mark in reverse: - */ - int ret = bch2_trans_run(c, - for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots, - POS_MAX, 0, k, - __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_check_snapshot_needs_deletion(trans, k))); - bch_err_fn(c, ret); - - /* - * It's important that we check if we need to reconstruct snapshots - * before going RW, so we mark that pass as required in the superblock - - * otherwise, we could end up deleting keys with missing snapshot nodes - * instead - */ - BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && - test_bit(BCH_FS_may_go_rw, &c->flags)); - - return ret; -} - -void bch2_fs_snapshots_exit(struct bch_fs *c) -{ - kvfree(rcu_dereference_protected(c->snapshots, true)); -} - -void bch2_fs_snapshots_init_early(struct bch_fs *c) -{ - INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work); - mutex_init(&c->snapshot_delete.lock); - mutex_init(&c->snapshot_delete.progress_lock); - mutex_init(&c->snapshots_unlinked_lock); -} diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h deleted file mode 100644 index 6766bf673ed9..000000000000 --- a/fs/bcachefs/snapshot.h +++ /dev/null @@ -1,275 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SNAPSHOT_H -#define _BCACHEFS_SNAPSHOT_H - -void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); - -#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ - .key_validate = bch2_snapshot_tree_validate, \ - .val_to_text = bch2_snapshot_tree_to_text, \ - .min_val_size = 8, \ -}) - -struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); - -int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); - -void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ - .key_validate = bch2_snapshot_validate, \ - .val_to_text = bch2_snapshot_to_text, \ - .trigger = bch2_mark_snapshot, \ - .min_val_size = 24, \ -}) - -static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) -{ - u32 idx = U32_MAX - id; - - return likely(t && idx < t->nr) - ? &t->s[idx] - : NULL; -} - -static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) -{ - return __snapshot_t(rcu_dereference(c->snapshots), id); -} - -static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->tree : 0; -} - -static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - return s ? 
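/*
 * Layout note for __snapshot_t() above: snapshot IDs are handed out from the
 * top of the 32-bit keyspace downwards (a new tree's root starts near
 * U32_MAX, children get smaller IDs than their parent), so the in-memory
 * table is indexed from the top:
 *
 *	u32 idx = U32_MAX - id;		// id == U32_MAX maps to slot 0
 *
 * which keeps live IDs dense at the front of snapshot_table->s[].
 */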
s->parent : 0; -} - -static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - return __bch2_snapshot_parent_early(c, id); -} - -static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - if (!s) - return 0; - - u32 parent = s->parent; - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - parent && - s->depth != snapshot_t(c, parent)->depth + 1) - panic("id %u depth=%u parent %u depth=%u\n", - id, snapshot_t(c, id)->depth, - parent, snapshot_t(c, parent)->depth); - - return parent; -} - -static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - return __bch2_snapshot_parent(c, id); -} - -static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) -{ - guard(rcu)(); - while (n--) - id = __bch2_snapshot_parent(c, id); - return id; -} - -u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *); -u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); - -static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - - u32 parent; - while ((parent = __bch2_snapshot_parent(c, id))) - id = parent; - return id; -} - -static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->state : SNAPSHOT_ID_empty; -} - -static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - return __bch2_snapshot_id_state(c, id); -} - -static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) -{ - return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live; -} - -static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node; -} - -static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) -{ - int ret = bch2_snapshot_is_internal_node(c, id); - if (ret < 0) - return ret; - return !ret; -} - -static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) -{ - guard(rcu)(); - return parent ? snapshot_t(c, parent)->depth + 1 : 0; -} - -bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); - -static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -{ - return id == ancestor - ? true - : __bch2_snapshot_is_ancestor(c, id, ancestor); -} - -static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - const struct snapshot_t *t = snapshot_t(c, id); - return t && (t->children[0]|t->children[1]) != 0; -} - -static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) -{ - return darray_find(*s, id) != NULL; -} - -static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) -{ - darray_for_each(*s, i) - if (bch2_snapshot_is_ancestor(c, id, *i)) - return true; - return false; -} - -static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) -{ - BUG_ON(snapshot_list_has_id(s, id)); - int ret = darray_push(s, id); - if (ret) - bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); - return ret; -} - -static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id) -{ - int ret = snapshot_list_has_id(s, id) - ? 
0
- : darray_push(s, id);
- if (ret)
- bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
- return ret;
-}
-
-static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
-{
- darray_for_each(*src, i) {
- int ret = snapshot_list_add_nodup(c, dst, *i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
- struct bch_snapshot *s);
-int bch2_snapshot_get_subvol(struct btree_trans *, u32,
- struct bch_subvolume *);
-
-/* only exported for tests: */
-int bch2_snapshot_node_create(struct btree_trans *, u32,
- u32 *, u32 *, unsigned);
-
-int bch2_check_snapshot_trees(struct bch_fs *);
-int bch2_check_snapshots(struct bch_fs *);
-int bch2_reconstruct_snapshots(struct bch_fs *);
-
-int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
-
-static inline int bch2_check_key_has_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot))
- ? 0
- : __bch2_check_key_has_snapshot(trans, iter, k);
-}
-
-int __bch2_get_snapshot_overwrites(struct btree_trans *,
- enum btree_id, struct bpos,
- snapshot_id_list *);
-
-/*
- * Get a list of snapshot IDs that have overwritten a given key:
- */
-static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos,
- snapshot_id_list *s)
-{
- darray_init(s);
-
- return bch2_snapshot_has_children(trans->c, pos.snapshot)
- ? __bch2_get_snapshot_overwrites(trans, btree, pos, s)
- : 0;
-
-}
-
-int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
-
-int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
-
-static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
- enum btree_id id,
- struct bpos pos)
-{
- if (!btree_type_has_snapshots(id) ||
- bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0)
- return 0;
-
- return __bch2_key_has_snapshot_overwrites(trans, id, pos);
-}
-
-int __bch2_delete_dead_snapshots(struct bch_fs *);
-int bch2_delete_dead_snapshots(struct bch_fs *);
-void bch2_delete_dead_snapshots_work(struct work_struct *);
-void bch2_delete_dead_snapshots_async(struct bch_fs *);
-void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *);
-
-int bch2_snapshots_read(struct bch_fs *);
-void bch2_fs_snapshots_exit(struct bch_fs *);
-void bch2_fs_snapshots_init_early(struct bch_fs *);
-
-#endif /* _BCACHEFS_SNAPSHOT_H */
diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h
deleted file mode 100644
index 9bccae1f3590..000000000000
--- a/fs/bcachefs/snapshot_format.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H
-#define _BCACHEFS_SNAPSHOT_FORMAT_H
-
-struct bch_snapshot {
- struct bch_val v;
- __le32 flags;
- __le32 parent;
- __le32 children[2];
- __le32 subvol;
- /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
- __le32 tree;
- __le32 depth;
- __le32 skip[3];
- bch_le128 btime;
-};
-
-LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1)
-/* True if a subvolume points to this snapshot node: */
-LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
-LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3)
-
-/*
- * Snapshot trees:
- *
- * The snapshot_trees btree gives us a persistent identifier for each tree of
- * bch_snapshot nodes, and allows us to
record and easily find the root/master - * subvolume that other snapshots were created from: - */ -struct bch_snapshot_tree { - struct bch_val v; - __le32 master_subvol; - __le32 root_snapshot; -}; - -#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */ diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h deleted file mode 100644 index 0ab698f13e5c..000000000000 --- a/fs/bcachefs/snapshot_types.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SNAPSHOT_TYPES_H -#define _BCACHEFS_SNAPSHOT_TYPES_H - -#include "bbpos_types.h" -#include "darray.h" -#include "subvolume_types.h" - -typedef DARRAY(u32) snapshot_id_list; - -#define IS_ANCESTOR_BITMAP 128 - -struct snapshot_t { - enum snapshot_id_state { - SNAPSHOT_ID_empty, - SNAPSHOT_ID_live, - SNAPSHOT_ID_deleted, - } state; - u32 parent; - u32 skip[3]; - u32 depth; - u32 children[2]; - u32 subvol; /* Nonzero only if a subvolume points to this node: */ - u32 tree; - unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; -}; - -struct snapshot_table { - struct rcu_head rcu; - size_t nr; -#ifndef RUST_BINDGEN - DECLARE_FLEX_ARRAY(struct snapshot_t, s); -#else - struct snapshot_t s[0]; -#endif -}; - -struct snapshot_interior_delete { - u32 id; - u32 live_child; -}; -typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; - -struct snapshot_delete { - struct mutex lock; - struct work_struct work; - - struct mutex progress_lock; - snapshot_id_list deleting_from_trees; - snapshot_id_list delete_leaves; - interior_delete_list delete_interior; - - bool running; - struct bbpos pos; -}; - -#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */ diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c deleted file mode 100644 index 3e9f59226bdf..000000000000 --- a/fs/bcachefs/str_hash.c +++ /dev/null @@ -1,400 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "btree_update.h" -#include "dirent.h" -#include "fsck.h" -#include "str_hash.h" -#include "subvolume.h" - -static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) -{ - if (d.v->d_type == DT_SUBVOL) { - struct bch_subvolume subvol; - int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol), - false, &subvol); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - return !ret; - } else { - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); - int ret = bkey_err(k); - if (ret) - return ret; - - ret = bkey_is_inode(k.k); - bch2_trans_iter_exit(trans, &iter); - return ret; - } -} - -static int bch2_fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old, - bool *updated_before_k_pos) -{ - struct bch_fs *c = trans->c; - struct qstr old_name = bch2_dirent_get_name(old); - struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bkey_dirent_init(&new->k_i); - dirent_copy_target(new, old); - new->k.p = old.k->p; - - char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20); - ret = PTR_ERR_OR_ZERO(renamed_buf); - if (ret) - return ret; - - for (unsigned i = 0; i < 1000; i++) { - new->k.u64s = BKEY_U64s_MAX; - - struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf, - sprintf(renamed_buf, "%.*s.fsck_renamed-%u", - old_name.len, 
old_name.name, i)); - - ret = bch2_dirent_init_name(c, new, hash_info, &renamed_name, NULL); - if (ret) - return ret; - - ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - (subvol_inum) { 0, old.k->p.inode }, - old.k->p.snapshot, &new->k_i, - BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create); - if (ret && !bch2_err_matches(ret, EEXIST)) - break; - if (!ret) { - if (bpos_lt(new->k.p, old.k->p)) - *updated_before_k_pos = true; - break; - } - } - - ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); - bch_err_fn(c, ret); - return ret; -} - -static noinline int hash_pick_winner(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c k1, - struct bkey_s_c k2) -{ - if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && - !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) - return 0; - - switch (desc.btree_id) { - case BTREE_ID_dirents: { - int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1)); - if (ret < 0) - return ret; - if (!ret) - return 0; - - ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2)); - if (ret < 0) - return ret; - if (!ret) - return 1; - return 2; - } - default: - return 0; - } -} - -/* - * str_hash lookups across snapshots break in wild ways if hash_info in - * different snapshot versions doesn't match - so if we find one mismatch, check - * them all - */ -int bch2_repair_inode_hash_info(struct btree_trans *trans, - struct bch_inode_unpacked *snapshot_root) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - bool need_commit = false; - int ret = 0; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, - POS(0, snapshot_root->bi_inum), - BTREE_ITER_all_snapshots, k, ret) { - if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot))) - break; - if (!bkey_is_inode(k.k)) - continue; - - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(k, &inode); - if (ret) - break; - - if (inode.bi_hash_seed == snapshot_root->bi_hash_seed && - INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) { -#ifdef CONFIG_BCACHEFS_DEBUG - struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root); - struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); - - BUG_ON(hash1.type != hash2.type || - memcmp(&hash1.siphash_key, - &hash2.siphash_key, - sizeof(hash1.siphash_key))); -#endif - continue; - } - - printbuf_reset(&buf); - prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n", - snapshot_root->bi_inum, - inode.bi_snapshot, - snapshot_root->bi_snapshot); - - bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode)); - prt_printf(&buf, " %llx\n", inode.bi_hash_seed); - - bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); - prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed); - - if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) { - inode.bi_hash_seed = snapshot_root->bi_hash_seed; - SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); - - ret = __bch2_fsck_write_inode(trans, &inode); - if (ret) - break; - need_commit = true; - } - } - - if (ret) - goto err; - - if (!need_commit) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", - snapshot_root->bi_inum); - - prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot); - bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); - prt_printf(&buf, " 
%llx\n", snapshot_root->bi_hash_seed); -#if 0 - prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); - bch2_prt_str_hash_type(&buf, hash_info->type); - prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); -#endif - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto err; - } - - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; -err: -fsck_err: - printbuf_exit(&buf); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * All versions of the same inode in different snapshots must have the same hash - * seed/type: verify that the hash info we're using matches the root - */ -static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, - struct bch_hash_info *hash_info) -{ - struct bch_inode_unpacked snapshot_root; - int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root); - if (ret) - return ret; - - struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root); - if (hash_info->type != hash_root.type || - memcmp(&hash_info->siphash_key, - &hash_root.siphash_key, - sizeof(hash_root.siphash_key))) - ret = bch2_repair_inode_hash_info(trans, &snapshot_root); - - return ret; -} - -/* Put a str_hash key in its proper location, checking for duplicates */ -int bch2_str_hash_repair_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc *desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c k, - struct btree_iter *dup_iter, struct bkey_s_c dup_k, - bool *updated_before_k_pos) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - bool free_snapshots_seen = false; - int ret = 0; - - if (!s) { - s = bch2_trans_kmalloc(trans, sizeof(*s)); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - goto out; - - s->pos = k_iter->pos; - darray_init(&s->ids); - - ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids); - if (ret) - goto out; - - free_snapshots_seen = true; - } - - if (!dup_k.k) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info, - (subvol_inum) { 0, new->k.p.inode }, - new->k.p.snapshot, new, - STR_HASH_must_create| - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node); - ret = bkey_err(dup_k); - if (ret) - goto out; - if (dup_k.k) - goto duplicate_entries; - - if (bpos_lt(new->k.p, k.k->p)) - *updated_before_k_pos = true; - - ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id, - k_iter->pos, new->k.p) ?: - bch2_hash_delete_at(trans, *desc, hash_info, k_iter, - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_commit; - } else { -duplicate_entries: - ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k); - if (ret < 0) - goto out; - - if (!fsck_err(trans, hash_table_key_duplicate, - "duplicate hash table keys%s:\n%s", - ret != 2 ? 
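/*
 * hash_pick_winner() above returns: 0 - delete the first key (the two are
 * identical, or its target is missing); 1 - delete the duplicate; 2 - both
 * dirents point at valid inodes, so neither may be dropped and one gets
 * renamed via bch2_fsck_rename_dirent(), hence the extra note in the fsck
 * message here.
 */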
"" : ", both point to valid inodes", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, dup_k), - buf.buf))) - goto out; - - switch (ret) { - case 0: - ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); - break; - case 1: - ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0); - break; - case 2: - ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info, - bkey_s_c_to_dirent(k), - updated_before_k_pos) ?: - bch2_hash_delete_at(trans, *desc, hash_info, k_iter, - BTREE_ITER_with_updates); - goto out; - } - - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_commit; - } -out: -fsck_err: - bch2_trans_iter_exit(trans, dup_iter); - printbuf_exit(&buf); - if (free_snapshots_seen) - darray_exit(&s->ids); - return ret; -} - -int __bch2_str_hash_check_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc *desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k, - bool *updated_before_k_pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter = {}; - struct printbuf buf = PRINTBUF; - struct bkey_s_c k; - int ret = 0; - - u64 hash = desc->hash_bkey(hash_info, hash_k); - if (hash_k.k->p.offset < hash) - goto bad_hash; - - for_each_btree_key_norestart(trans, iter, desc->btree_id, - SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_slots| - BTREE_ITER_with_updates, k, ret) { - if (bkey_eq(k.k->p, hash_k.k->p)) - break; - - if (k.k->type == desc->key_type && - !desc->cmp_bkey(k, hash_k)) { - ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, - hash_info) ?: - bch2_str_hash_repair_key(trans, s, desc, hash_info, - k_iter, hash_k, - &iter, k, updated_before_k_pos); - break; - } - - if (bkey_deleted(k.k)) - goto bad_hash; - } - bch2_trans_iter_exit(trans, &iter); -out: -fsck_err: - printbuf_exit(&buf); - return ret; -bad_hash: - bch2_trans_iter_exit(trans, &iter); - /* - * Before doing any repair, check hash_info itself: - */ - ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info); - if (ret) - goto out; - - if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: should be at %llu\n%s", - hash, - (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) - ret = bch2_str_hash_repair_key(trans, s, desc, hash_info, - k_iter, hash_k, - &iter, bkey_s_c_null, - updated_before_k_pos); - goto out; -} diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h deleted file mode 100644 index 8979ac2d7a3b..000000000000 --- a/fs/bcachefs/str_hash.h +++ /dev/null @@ -1,431 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_STR_HASH_H -#define _BCACHEFS_STR_HASH_H - -#include "btree_iter.h" -#include "btree_update.h" -#include "checksum.h" -#include "error.h" -#include "inode.h" -#include "siphash.h" -#include "subvolume.h" -#include "super.h" - -#include <linux/crc32c.h> -#include <crypto/sha2.h> - -static inline enum bch_str_hash_type -bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) -{ - switch (opt) { - case BCH_STR_HASH_OPT_crc32c: - return BCH_STR_HASH_crc32c; - case BCH_STR_HASH_OPT_crc64: - return BCH_STR_HASH_crc64; - case BCH_STR_HASH_OPT_siphash: - return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) - ? 
BCH_STR_HASH_siphash - : BCH_STR_HASH_siphash_old; - default: - BUG(); - } -} - -struct bch_hash_info { - u32 inum_snapshot; - u8 type; - struct unicode_map *cf_encoding; - /* - * For crc32 or crc64 string hashes the first key value of - * the siphash_key (k0) is used as the key. - */ - SIPHASH_KEY siphash_key; -}; - -static inline struct bch_hash_info -bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) -{ - struct bch_hash_info info = { - .inum_snapshot = bi->bi_snapshot, - .type = INODE_STR_HASH(bi), - .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, - .siphash_key = { .k0 = bi->bi_hash_seed } - }; - - if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { - u8 digest[SHA256_DIGEST_SIZE]; - - sha256((const u8 *)&bi->bi_hash_seed, - sizeof(bi->bi_hash_seed), digest); - memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); - } - - return info; -} - -struct bch_str_hash_ctx { - union { - u32 crc32c; - u64 crc64; - SIPHASH_CTX siphash; - }; -}; - -static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, - const struct bch_hash_info *info) -{ - switch (info->type) { - case BCH_STR_HASH_crc32c: - ctx->crc32c = crc32c(~0, &info->siphash_key.k0, - sizeof(info->siphash_key.k0)); - break; - case BCH_STR_HASH_crc64: - ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, - sizeof(info->siphash_key.k0)); - break; - case BCH_STR_HASH_siphash_old: - case BCH_STR_HASH_siphash: - SipHash24_Init(&ctx->siphash, &info->siphash_key); - break; - default: - BUG(); - } -} - -static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, - const struct bch_hash_info *info, - const void *data, size_t len) -{ - switch (info->type) { - case BCH_STR_HASH_crc32c: - ctx->crc32c = crc32c(ctx->crc32c, data, len); - break; - case BCH_STR_HASH_crc64: - ctx->crc64 = crc64_be(ctx->crc64, data, len); - break; - case BCH_STR_HASH_siphash_old: - case BCH_STR_HASH_siphash: - SipHash24_Update(&ctx->siphash, data, len); - break; - default: - BUG(); - } -} - -static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, - const struct bch_hash_info *info) -{ - switch (info->type) { - case BCH_STR_HASH_crc32c: - return ctx->crc32c; - case BCH_STR_HASH_crc64: - return ctx->crc64 >> 1; - case BCH_STR_HASH_siphash_old: - case BCH_STR_HASH_siphash: - return SipHash24_End(&ctx->siphash) >> 1; - default: - BUG(); - } -} - -struct bch_hash_desc { - enum btree_id btree_id; - u8 key_type; - - u64 (*hash_key)(const struct bch_hash_info *, const void *); - u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); - bool (*cmp_key)(struct bkey_s_c, const void *); - bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); - bool (*is_visible)(subvol_inum inum, struct bkey_s_c); -}; - -static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) -{ - return k.k->type == desc.key_type && - (!desc.is_visible || - !inum.inum || - desc.is_visible(inum, k)); -} - -static __always_inline struct bkey_s_c -bch2_hash_lookup_in_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, const void *key, - enum btree_iter_update_trigger_flags flags, - u32 snapshot) -{ - struct bkey_s_c k; - int ret; - - for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(inum.inum, desc.hash_key(info, key), snapshot), - POS(inum.inum, U64_MAX), - BTREE_ITER_slots|flags, k, ret) { - if (is_visible_key(desc, inum, k)) { - if (!desc.cmp_key(k, key)) - return k; - 
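/*
 * Example use of the bch2_str_hash_{init,update,end} API above (a sketch;
 * the real hash_key/hash_bkey callbacks live with each btree's
 * bch_hash_desc, e.g. bch2_dirent_hash_desc in dirent.c):
 *
 *	static u64 name_hash(const struct bch_hash_info *info,
 *			     const struct qstr *name)
 *	{
 *		struct bch_str_hash_ctx ctx;
 *
 *		bch2_str_hash_init(&ctx, info);
 *		bch2_str_hash_update(&ctx, info, name->name, name->len);
 *		return bch2_str_hash_end(&ctx, info);
 *	}
 */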
} else if (k.k->type == KEY_TYPE_hash_whiteout) { - ; - } else { - /* hole, not found */ - break; - } - } - bch2_trans_iter_exit(trans, iter); - - return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup); -} - -static __always_inline struct bkey_s_c -bch2_hash_lookup(struct btree_trans *trans, - struct btree_iter *iter, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, const void *key, - enum btree_iter_update_trigger_flags flags) -{ - u32 snapshot; - int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return bkey_s_c_err(ret); - - return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); -} - -static __always_inline int -bch2_hash_hole(struct btree_trans *trans, - struct btree_iter *iter, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, const void *key) -{ - struct bkey_s_c k; - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - - for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(inum.inum, desc.hash_key(info, key), snapshot), - POS(inum.inum, U64_MAX), - BTREE_ITER_slots|BTREE_ITER_intent, k, ret) - if (!is_visible_key(desc, inum, k)) - return 0; - bch2_trans_iter_exit(trans, iter); - - return ret ?: -BCH_ERR_ENOSPC_str_hash_create; -} - -static __always_inline -int bch2_hash_needs_whiteout(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *start) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_copy_iter(trans, &iter, start); - - bch2_btree_iter_advance(trans, &iter); - - for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) { - if (k.k->type != desc.key_type && - k.k->type != KEY_TYPE_hash_whiteout) - break; - - if (k.k->type == desc.key_type && - desc.hash_bkey(info, k) <= start->pos.offset) { - ret = 1; - break; - } - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static __always_inline -struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, u32 snapshot, - struct bkey_i *insert, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter slot = {}; - struct bkey_s_c k; - bool found = false; - int ret; - - for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(insert->k.p.inode, - desc.hash_bkey(info, bkey_i_to_s_c(insert)), - snapshot), - POS(insert->k.p.inode, U64_MAX), - BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) { - if (is_visible_key(desc, inum, k)) { - if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) - goto found; - - /* hash collision: */ - continue; - } - - if (!slot.path && !(flags & STR_HASH_must_replace)) - bch2_trans_copy_iter(trans, &slot, iter); - - if (k.k->type != KEY_TYPE_hash_whiteout) - goto not_found; - } - - if (!ret) - ret = bch_err_throw(c, ENOSPC_str_hash_create); -out: - bch2_trans_iter_exit(trans, &slot); - bch2_trans_iter_exit(trans, iter); - return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; -found: - found = true; -not_found: - if (found && (flags & STR_HASH_must_create)) { - bch2_trans_iter_exit(trans, &slot); - return k; - } else if (!found && (flags & STR_HASH_must_replace)) { - ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace); - } else { - if (!found && slot.path) - swap(*iter, slot); - - insert->k.p = iter->pos; - ret = bch2_trans_update(trans, iter, insert, flags); - } - - goto out; -} - -static __always_inline -int bch2_hash_set_in_snapshot(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, u32 snapshot, - struct bkey_i *insert, - enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum, - snapshot, insert, flags); - int ret = bkey_err(k); - if (ret) - return ret; - if (k.k) { - bch2_trans_iter_exit(trans, &iter); - return bch_err_throw(trans->c, EEXIST_str_hash_set); - } - - return 0; -} - -static __always_inline -int bch2_hash_set(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, - struct bkey_i *insert, - enum btree_iter_update_trigger_flags flags) -{ - insert->k.p.inode = inum.inum; - - u32 snapshot; - return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: - bch2_hash_set_in_snapshot(trans, desc, info, inum, - snapshot, insert, flags); -} - -static __always_inline -int bch2_hash_delete_at(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_i *delete; - int ret; - - delete = bch2_trans_kmalloc(trans, sizeof(*delete)); - ret = PTR_ERR_OR_ZERO(delete); - if (ret) - return ret; - - ret = bch2_hash_needs_whiteout(trans, desc, info, iter); - if (ret < 0) - return ret; - - bkey_init(&delete->k); - delete->k.p = iter->pos; - delete->k.type = ret ? 
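/*
 * Why a whiteout rather than a plain deletion: colliding entries live in the
 * slots following their ideal position, and lookup stops at the first hole,
 * so deleting an entry mid-chain would hide every later entry in that chain.
 * bch2_hash_needs_whiteout() above returned nonzero when some later entry
 * hashes to this slot or earlier; in that case the slot becomes a
 * KEY_TYPE_hash_whiteout tombstone, which lookups skip over.
 */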
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; - - return bch2_trans_update(trans, iter, delete, flags); -} - -static __always_inline -int bch2_hash_delete(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, const void *key) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key, - BTREE_ITER_intent); - int ret = bkey_err(k); - if (ret) - return ret; - - ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *); - -struct snapshots_seen; -int bch2_str_hash_repair_key(struct btree_trans *, - struct snapshots_seen *, - const struct bch_hash_desc *, - struct bch_hash_info *, - struct btree_iter *, struct bkey_s_c, - struct btree_iter *, struct bkey_s_c, - bool *); - -int __bch2_str_hash_check_key(struct btree_trans *, - struct snapshots_seen *, - const struct bch_hash_desc *, - struct bch_hash_info *, - struct btree_iter *, struct bkey_s_c, - bool *); - -static inline int bch2_str_hash_check_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc *desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k, - bool *updated_before_k_pos) -{ - if (hash_k.k->type != desc->key_type) - return 0; - - if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) - return 0; - - return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k, - updated_before_k_pos); -} - -#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c deleted file mode 100644 index 020587449123..000000000000 --- a/fs/bcachefs/subvolume.c +++ /dev/null @@ -1,752 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "enumerated_ref.h" -#include "errcode.h" -#include "error.h" -#include "fs.h" -#include "recovery_passes.h" -#include "snapshot.h" -#include "subvolume.h" - -#include <linux/random.h> - -static int bch2_subvolume_delete(struct btree_trans *, u32); - -static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid) -{ - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "missing subvolume %u", subvolid); - bool print = bch2_count_fsck_err(c, subvol_missing, &buf); - - int ret = bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_inodes, 0); - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return ret; -} - -static struct bpos subvolume_children_pos(struct bkey_s_c k) -{ - if (k.k->type != KEY_TYPE_subvolume) - return POS_MIN; - - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - if (!s.v->fs_path_parent) - return POS_MIN; - return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset); -} - -static int check_subvol(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_subvolume subvol; - struct btree_iter subvol_children_iter = {}; - struct bch_snapshot snapshot; - struct printbuf buf = PRINTBUF; - unsigned snapid; - int ret = 0; - - if (k.k->type != KEY_TYPE_subvolume) - return 0; - - subvol = bkey_s_c_to_subvolume(k); - snapid = le32_to_cpu(subvol.v->snapshot); - ret = bch2_snapshot_lookup(trans, snapid, &snapshot); - - if (bch2_err_matches(ret, ENOENT)) - return bch2_run_print_explicit_recovery_pass(c, - 
BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; - if (ret) - return ret; - - if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { - ret = bch2_subvolume_delete(trans, iter->pos.offset); - bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); - return ret ?: -BCH_ERR_transaction_restart_nested; - } - - if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && - subvol.v->fs_path_parent, - trans, subvol_root_fs_path_parent_nonzero, - "root subvolume has nonzero fs_path_parent\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct bkey_i_subvolume *n = - bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - n->v.fs_path_parent = 0; - } - - if (subvol.v->fs_path_parent) { - struct bpos pos = subvolume_children_pos(k); - - struct bkey_s_c subvol_children_k = - bch2_bkey_get_iter(trans, &subvol_children_iter, - BTREE_ID_subvolume_children, pos, 0); - ret = bkey_err(subvol_children_k); - if (ret) - goto err; - - if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set, - trans, subvol_children_not_set, - "subvolume not set in subvolume_children btree at %llu:%llu\n%s", - pos.inode, pos.offset, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true); - if (ret) - goto err; - } - } - - struct bch_inode_unpacked inode; - ret = bch2_inode_find_by_inum_nowarn_trans(trans, - (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, - &inode); - if (!ret) { - if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, - trans, subvol_root_wrong_bi_subvol, - "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", - inode.bi_inum, inode.bi_snapshot, - inode.bi_subvol, subvol.k->p.offset)) { - inode.bi_subvol = subvol.k->p.offset; - inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot); - ret = __bch2_fsck_write_inode(trans, &inode); - if (ret) - goto err; - } - } else if (bch2_err_matches(ret, ENOENT)) { - if (fsck_err(trans, subvol_to_missing_root, - "subvolume %llu points to missing subvolume root %llu:%u", - k.k->p.offset, le64_to_cpu(subvol.v->inode), - le32_to_cpu(subvol.v->snapshot))) { - /* - * Recreate - any contents that are still disconnected - * will then get reattached under lost+found - */ - bch2_inode_init_early(c, &inode); - bch2_inode_init_late(c, &inode, bch2_current_time(c), - 0, 0, S_IFDIR|0700, 0, NULL); - inode.bi_inum = le64_to_cpu(subvol.v->inode); - inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot); - inode.bi_subvol = k.k->p.offset; - inode.bi_parent_subvol = le32_to_cpu(subvol.v->fs_path_parent); - ret = __bch2_fsck_write_inode(trans, &inode); - if (ret) - goto err; - } - } else { - goto err; - } - - if (!BCH_SUBVOLUME_SNAP(subvol.v)) { - u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); - u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root); - - struct bch_snapshot_tree st; - ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); - - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "%s: snapshot tree %u not found", __func__, snapshot_tree); - - if (ret) - goto err; - - if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, - trans, subvol_not_master_and_not_snapshot, - "subvolume %llu is not set as snapshot but is not master subvolume", - k.k->p.offset)) { - struct bkey_i_subvolume *s = - bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - goto err; - - 
SET_BCH_SUBVOLUME_SNAP(&s->v, true); - } - } -err: -fsck_err: - bch2_trans_iter_exit(trans, &subvol_children_iter); - printbuf_exit(&buf); - return ret; -} - -int bch2_check_subvols(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_subvol(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static int check_subvol_child(struct btree_trans *trans, - struct btree_iter *child_iter, - struct bkey_s_c child_k) -{ - struct bch_subvolume s; - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset), - 0, subvolume, &s); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (fsck_err_on(ret || - le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode, - trans, subvol_children_bad, - "incorrect entry in subvolume_children btree %llu:%llu", - child_k.k->p.inode, child_k.k->p.offset)) { - ret = bch2_btree_delete_at(trans, child_iter, 0); - if (ret) - goto err; - } -err: -fsck_err: - return ret; -} - -int bch2_check_subvol_children(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_subvol_child(trans, &iter, k))); - bch_err_fn(c, ret); - return 0; -} - -/* Subvolumes: */ - -int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); - int ret = 0; - - bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) || - bkey_gt(k.k->p, SUBVOL_POS_MAX), - c, subvol_pos_bad, - "invalid pos"); - - bkey_fsck_err_on(!subvol.v->snapshot, - c, subvol_snapshot_bad, - "invalid snapshot"); - - bkey_fsck_err_on(!subvol.v->inode, - c, subvol_inode_bad, - "invalid inode"); -fsck_err: - return ret; -} - -void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - - prt_printf(out, "root %llu snapshot id %u", - le64_to_cpu(s.v->inode), - le32_to_cpu(s.v->snapshot)); - - if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) { - prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); - prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); - } - - if (BCH_SUBVOLUME_RO(s.v)) - prt_printf(out, " ro"); - if (BCH_SUBVOLUME_SNAP(s.v)) - prt_printf(out, " snapshot"); - if (BCH_SUBVOLUME_UNLINKED(s.v)) - prt_printf(out, " unlinked"); -} - -static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) -{ - return !bpos_eq(pos, POS_MIN) - ? 
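/*
 * The subvolume_children btree is a bitset: child subvolume C of fs-path
 * parent P is recorded as a KEY_TYPE_set key at POS(P, C), kept in sync by
 * bch2_subvolume_trigger() below. E.g. (hypothetical IDs), re-parenting
 * subvolume 12 from parent 3 to parent 5 amounts to:
 *
 *	subvolume_children_mod(trans, POS(3, 12), false) ?:
 *	subvolume_children_mod(trans, POS(5, 12), true);
 */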
bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set) - : 0; -} - -int bch2_subvolume_trigger(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - if (flags & BTREE_TRIGGER_transactional) { - struct bpos children_pos_old = subvolume_children_pos(old); - struct bpos children_pos_new = subvolume_children_pos(new.s_c); - - if (!bpos_eq(children_pos_old, children_pos_new)) { - int ret = subvolume_children_mod(trans, children_pos_old, false) ?: - subvolume_children_mod(trans, children_pos_new, true); - if (ret) - return ret; - } - } - - return 0; -} - -int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) -{ - struct btree_iter iter; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); - struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter); - bch2_trans_iter_exit(trans, &iter); - - return bkey_err(k) ?: k.k && k.k->p.inode == subvol - ? -BCH_ERR_ENOTEMPTY_subvol_not_empty - : 0; -} - -static __always_inline int -bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, - bool inconsistent_if_not_found, - struct bch_subvolume *s) -{ - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), - BTREE_ITER_cached| - BTREE_ITER_with_updates, subvolume, s); - if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found) - ret = bch2_subvolume_missing(trans->c, subvol) ?: ret; - return ret; -} - -int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, - bool inconsistent_if_not_found, - struct bch_subvolume *s) -{ - return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s); -} - -int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) -{ - struct bch_subvolume s; - int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s); - if (ret) - return ret; - - if (BCH_SUBVOLUME_RO(&s)) - return -EROFS; - return 0; -} - -int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol) -{ - return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol)); -} - -int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, - struct bch_subvolume *subvol) -{ - struct bch_snapshot snap; - - return bch2_snapshot_lookup(trans, snapshot, &snap) ?: - bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol); -} - -int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, - u32 *snapid, bool warn) -{ - struct btree_iter iter; - struct bkey_s_c_subvolume subvol; - int ret; - - subvol = bch2_bkey_get_iter_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_cached|BTREE_ITER_with_updates, - subvolume); - ret = bkey_err(subvol); - - if (bch2_err_matches(ret, ENOENT)) - ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; - - if (likely(!ret)) - *snapid = le32_to_cpu(subvol.v->snapshot); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, - u32 *snapid) -{ - return __bch2_subvolume_get_snapshot(trans, subvolid, snapid, true); -} - -static int bch2_subvolume_reparent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - u32 old_parent, u32 new_parent) -{ - struct bkey_i_subvolume *s; - int ret; - - if (k.k->type != KEY_TYPE_subvolume) - return 0; - - if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) && - le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent) - return 0; - - s = 
bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
-	ret = PTR_ERR_OR_ZERO(s);
-	if (ret)
-		return ret;
-
-	s->v.creation_parent = cpu_to_le32(new_parent);
-	return 0;
-}
-
-/*
- * Separate from the snapshot tree in the snapshots btree, we record the tree
- * structure of how snapshot subvolumes were created - the parent subvolume of
- * each snapshot subvolume.
- *
- * When a subvolume is deleted, we scan for child subvolumes and reparent them,
- * to avoid dangling references:
- */
-static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
-{
-	struct bch_subvolume s;
-
-	return lockrestart_do(trans,
-			bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?:
-		for_each_btree_key_commit(trans, iter,
-				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
-				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			bch2_subvolume_reparent(trans, &iter, k,
-					subvolid_to_delete, le32_to_cpu(s.creation_parent)));
-}
-
-/*
- * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
- * deletion/cleanup:
- */
-static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
-{
-	struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {};
-
-	struct bkey_s_c_subvolume subvol =
-		bch2_bkey_get_iter_typed(trans, &subvol_iter,
-				BTREE_ID_subvolumes, POS(0, subvolid),
-				BTREE_ITER_cached|BTREE_ITER_intent,
-				subvolume);
-	int ret = bkey_err(subvol);
-	if (bch2_err_matches(ret, ENOENT))
-		ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
-	if (ret)
-		goto err;
-
-	u32 snapid = le32_to_cpu(subvol.v->snapshot);
-
-	struct bkey_s_c_snapshot snapshot =
-		bch2_bkey_get_iter_typed(trans, &snapshot_iter,
-				BTREE_ID_snapshots, POS(0, snapid),
-				0, snapshot);
-	ret = bkey_err(snapshot);
-	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
-				"missing snapshot %u", snapid);
-	if (ret)
-		goto err;
-
-	u32 treeid = le32_to_cpu(snapshot.v->tree);
-
-	struct bkey_s_c_snapshot_tree snapshot_tree =
-		bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter,
-				BTREE_ID_snapshot_trees, POS(0, treeid),
-				0, snapshot_tree);
-	ret = bkey_err(snapshot_tree);
-	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
-				"missing snapshot tree %u", treeid);
-	if (ret)
-		goto err;
-
-	if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) {
-		struct bkey_i_snapshot_tree *snapshot_tree_mut =
-			bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter,
-					&snapshot_tree.s_c,
-					0, snapshot_tree);
-		ret = PTR_ERR_OR_ZERO(snapshot_tree_mut);
-		if (ret)
-			goto err;
-
-		snapshot_tree_mut->v.master_subvol = 0;
-	}
-
-	ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?:
-		bch2_snapshot_node_set_deleted(trans, snapid);
-err:
-	bch2_trans_iter_exit(trans, &snapshot_tree_iter);
-	bch2_trans_iter_exit(trans, &snapshot_iter);
-	bch2_trans_iter_exit(trans, &subvol_iter);
-	return ret;
-}
-
-static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
-{
-	int ret = bch2_subvolumes_reparent(trans, subvolid) ?:
-		commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			  __bch2_subvolume_delete(trans, subvolid));
-
-	bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols);
-	return ret;
-}
-
-static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
-{
-	struct bch_fs *c = container_of(work, struct bch_fs,
-				snapshot_wait_for_pagecache_and_delete_work);
-	int ret = 0;
-
-	while (!ret) {
-		mutex_lock(&c->snapshots_unlinked_lock);
-		snapshot_id_list s = c->snapshots_unlinked;
-
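/*
 * [Editorial sketch, not part of the original diff.] The worker above
 * uses a "steal the list under the lock" hand-off: the shared darray is
 * copied by value and reinitialized while the mutex is held, then the
 * slow work (evicting pagecache, deleting subvolumes) runs on the
 * private copy with the lock dropped. The pattern in isolation, with
 * hypothetical names:
 */
static void steal_and_process(struct mutex *lock, snapshot_id_list *shared)
{
	mutex_lock(lock);
	snapshot_id_list mine = *shared;	/* take over the buffer */
	darray_init(shared);			/* leave an empty list behind */
	mutex_unlock(lock);

	/* ... process `mine` without holding the lock ... */

	darray_exit(&mine);
}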
darray_init(&c->snapshots_unlinked); - mutex_unlock(&c->snapshots_unlinked_lock); - - if (!s.nr) - break; - - bch2_evict_subvolume_inodes(c, &s); - - darray_for_each(s, id) { - ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); - bch_err_msg(c, ret, "deleting subvolume %u", *id); - if (ret) - break; - } - - darray_exit(&s); - } - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); -} - -struct subvolume_unlink_hook { - struct btree_trans_commit_hook h; - u32 subvol; -}; - -static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *_h) -{ - struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); - struct bch_fs *c = trans->c; - int ret = 0; - - mutex_lock(&c->snapshots_unlinked_lock); - if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) - ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); - mutex_unlock(&c->snapshots_unlinked_lock); - - if (ret) - return ret; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache)) - return -EROFS; - - if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); - return 0; -} - -int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) -{ - struct btree_iter iter; - struct bkey_i_subvolume *n; - struct subvolume_unlink_hook *h; - int ret = 0; - - h = bch2_trans_kmalloc(trans, sizeof(*h)); - ret = PTR_ERR_OR_ZERO(h); - if (ret) - return ret; - - h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; - h->subvol = subvolid; - bch2_trans_commit_hook(trans, &h->h); - - n = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_cached, subvolume); - ret = PTR_ERR_OR_ZERO(n); - if (bch2_err_matches(ret, ENOENT)) - ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; - if (unlikely(ret)) - return ret; - - SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); - n->v.fs_path_parent = 0; - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_subvolume_create(struct btree_trans *trans, u64 inode, - u32 parent_subvolid, - u32 src_subvolid, - u32 *new_subvolid, - u32 *new_snapshotid, - bool ro) -{ - struct bch_fs *c = trans->c; - struct btree_iter dst_iter, src_iter = {}; - struct bkey_i_subvolume *new_subvol = NULL; - struct bkey_i_subvolume *src_subvol = NULL; - u32 parent = 0, new_nodes[2], snapshot_subvols[2]; - int ret = 0; - - ret = bch2_bkey_get_empty_slot(trans, &dst_iter, - BTREE_ID_subvolumes, POS(0, U32_MAX)); - if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = bch_err_throw(c, ENOSPC_subvolume_create); - if (ret) - return ret; - - snapshot_subvols[0] = dst_iter.pos.offset; - snapshot_subvols[1] = src_subvolid; - - if (src_subvolid) { - /* Creating a snapshot: */ - - src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, - BTREE_ID_subvolumes, POS(0, src_subvolid), - BTREE_ITER_cached, subvolume); - ret = PTR_ERR_OR_ZERO(src_subvol); - if (bch2_err_matches(ret, ENOENT)) - ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret; - if (unlikely(ret)) - goto err; - - parent = le32_to_cpu(src_subvol->v.snapshot); - } - - ret = bch2_snapshot_node_create(trans, parent, new_nodes, - snapshot_subvols, - src_subvolid ? 
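/*
 * [Editorial sketch, not part of the original diff.] When creating a
 * snapshot (src_subvolid != 0) two snapshot nodes are allocated rather
 * than one: snapshots are taken of leaf nodes, so the source subvolume
 * is moved onto a fresh leaf (new_nodes[1], installed just below via
 * src_subvol->v.snapshot) while the new snapshot subvolume gets
 * new_nodes[0]. Restated:
 */
static unsigned nr_new_snapshot_nodes(u32 src_subvolid)
{
	/* plain create: one node; snapshot: one for each of src and dst */
	return src_subvolid ? 2 : 1;
}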
2 : 1); - if (ret) - goto err; - - if (src_subvolid) { - src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); - ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); - if (ret) - goto err; - } - - new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume); - ret = PTR_ERR_OR_ZERO(new_subvol); - if (ret) - goto err; - - new_subvol->v.flags = 0; - new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); - new_subvol->v.inode = cpu_to_le64(inode); - new_subvol->v.creation_parent = cpu_to_le32(src_subvolid); - new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid); - new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); - new_subvol->v.otime.hi = 0; - - SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); - SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); - - *new_subvolid = new_subvol->k.p.offset; - *new_snapshotid = new_nodes[0]; -err: - bch2_trans_iter_exit(trans, &src_iter); - bch2_trans_iter_exit(trans, &dst_iter); - return ret; -} - -int bch2_initialize_subvolumes(struct bch_fs *c) -{ - struct bkey_i_snapshot_tree root_tree; - struct bkey_i_snapshot root_snapshot; - struct bkey_i_subvolume root_volume; - int ret; - - bkey_snapshot_tree_init(&root_tree.k_i); - root_tree.k.p.offset = 1; - root_tree.v.master_subvol = cpu_to_le32(1); - root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); - - bkey_snapshot_init(&root_snapshot.k_i); - root_snapshot.k.p.offset = U32_MAX; - root_snapshot.v.flags = 0; - root_snapshot.v.parent = 0; - root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); - root_snapshot.v.tree = cpu_to_le32(1); - SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); - - bkey_subvolume_init(&root_volume.k_i); - root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_volume.v.flags = 0; - root_volume.v.snapshot = cpu_to_le32(U32_MAX); - root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - - ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?: - bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?: - bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0); - bch_err_fn(c, ret); - return ret; -} - -static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bch_inode_unpacked inode; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); - ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_is_inode(k.k)) { - struct bch_fs *c = trans->c; - bch_err(c, "root inode not found"); - ret = bch_err_throw(c, ENOENT_inode); - goto err; - } - - ret = bch2_inode_unpack(k, &inode); - BUG_ON(ret); - - inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - - ret = bch2_inode_write(trans, &iter, &inode); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* set bi_subvol on root inode */ -int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) -{ - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_fs_upgrade_for_subvolumes(trans)); - bch_err_fn(c, ret); - return ret; -} - -void bch2_fs_subvolumes_init_early(struct bch_fs *c) -{ - INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, - bch2_subvolume_wait_for_pagecache_and_delete); -} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h deleted file mode 100644 index 075f55e25c70..000000000000 --- a/fs/bcachefs/subvolume.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUBVOLUME_H -#define _BCACHEFS_SUBVOLUME_H - -#include "darray.h" -#include 
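/*
 * [Editorial sketch, not part of the original diff.] The bootstrap in
 * bch2_initialize_subvolumes() above creates three keys that point at
 * one another: snapshot tree 1, root snapshot node U32_MAX, and
 * subvolume 1 (BCACHEFS_ROOT_SUBVOL). A hypothetical assertion of that
 * triangle:
 */
static void assert_root_triangle(const struct bch_snapshot_tree *tree,
				 const struct bch_snapshot *snap,
				 const struct bch_subvolume *vol)
{
	BUG_ON(le32_to_cpu(tree->root_snapshot) != U32_MAX);
	BUG_ON(le32_to_cpu(tree->master_subvol) != BCACHEFS_ROOT_SUBVOL);
	BUG_ON(le32_to_cpu(snap->subvol) != BCACHEFS_ROOT_SUBVOL);
	BUG_ON(le32_to_cpu(snap->tree) != 1);
	BUG_ON(le32_to_cpu(vol->snapshot) != U32_MAX);
	BUG_ON(le64_to_cpu(vol->inode) != BCACHEFS_ROOT_INO);
}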
"subvolume_types.h" - -int bch2_check_subvols(struct bch_fs *); -int bch2_check_subvol_children(struct bch_fs *); - -int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ - .key_validate = bch2_subvolume_validate, \ - .val_to_text = bch2_subvolume_to_text, \ - .trigger = bch2_subvolume_trigger, \ - .min_val_size = 16, \ -}) - -int bch2_subvol_has_children(struct btree_trans *, u32); -int bch2_subvolume_get(struct btree_trans *, unsigned, - bool, struct bch_subvolume *); -int __bch2_subvolume_get_snapshot(struct btree_trans *, u32, - u32 *, bool); -int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); - -int bch2_subvol_is_ro_trans(struct btree_trans *, u32); -int bch2_subvol_is_ro(struct bch_fs *, u32); - -static inline struct bkey_s_c -bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, u32 subvolid, unsigned flags) -{ - u32 snapshot; - int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot); - if (ret) - return bkey_s_c_err(ret); - - bch2_btree_iter_set_snapshot(trans, iter, snapshot); - return bch2_btree_iter_peek_max_type(trans, iter, end, flags); -} - -#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ - _end, _subvolid, _flags, _k, _do) \ -({ \ - struct bkey_s_c _k; \ - int _ret3 = 0; \ - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\ - _end, _subvolid, (_flags)); \ - if (!(_k).k) \ - break; \ - \ - bkey_err(_k) ?: (_do); \ - })); \ - } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) - -#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \ - _start, _end, _subvolid, _flags, _k, _do) \ -({ \ - struct btree_iter _iter; \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ - _end, _subvolid, _flags, _k, _do); \ -}) - -int bch2_subvolume_unlink(struct btree_trans *, u32); -int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); - -int bch2_initialize_subvolumes(struct bch_fs *); -int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); - -void bch2_fs_subvolumes_init_early(struct bch_fs *); - -#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h deleted file mode 100644 index e029df7ba89f..000000000000 --- a/fs/bcachefs/subvolume_format.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H -#define _BCACHEFS_SUBVOLUME_FORMAT_H - -#define SUBVOL_POS_MIN POS(0, 1) -#define SUBVOL_POS_MAX POS(0, S32_MAX) -#define BCACHEFS_ROOT_SUBVOL 1 - -struct bch_subvolume { - struct bch_val v; - __le32 flags; - __le32 snapshot; - __le64 inode; - /* - * Snapshot subvolumes form a tree, separate from the snapshot nodes - * tree - if this subvolume is a snapshot, this is the ID of the - * subvolume it was created from: - * - * This is _not_ necessarily the subvolume of the directory containing - * this subvolume: - */ - __le32 creation_parent; - __le32 fs_path_parent; - 
bch_le128 otime; -}; - -LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) -/* - * We need to know whether a subvolume is a snapshot so we can know whether we - * can delete it (or whether it should just be rm -rf'd) - */ -LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) -LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) - -#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h deleted file mode 100644 index 9d634b906dcd..000000000000 --- a/fs/bcachefs/subvolume_types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUBVOLUME_TYPES_H -#define _BCACHEFS_SUBVOLUME_TYPES_H - -typedef struct { - /* we can't have padding in this struct: */ - u64 subvol; - u64 inum; -} subvol_inum; - -#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c deleted file mode 100644 index 6c2e1d647403..000000000000 --- a/fs/bcachefs/super-io.c +++ /dev/null @@ -1,1562 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "checksum.h" -#include "disk_groups.h" -#include "ec.h" -#include "error.h" -#include "journal.h" -#include "journal_sb.h" -#include "journal_seq_blacklist.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "quota.h" -#include "sb-clean.h" -#include "sb-counters.h" -#include "sb-downgrade.h" -#include "sb-errors.h" -#include "sb-members.h" -#include "super-io.h" -#include "super.h" -#include "trace.h" -#include "vstructs.h" - -#include <linux/backing-dev.h> -#include <linux/sort.h> -#include <linux/string_choices.h> - -struct bch2_metadata_version { - u16 version; - const char *name; -}; - -static const struct bch2_metadata_version bch2_metadata_versions[] = { -#define x(n, v) { \ - .version = v, \ - .name = #n, \ -}, - BCH_METADATA_VERSIONS() -#undef x -}; - -void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v) -{ - const char *str = "(unknown version)"; - - for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) - if (bch2_metadata_versions[i].version == v) { - str = bch2_metadata_versions[i].name; - break; - } - - prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); -} - -enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v) -{ - if (!BCH_VERSION_MAJOR(v)) - return v; - - for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) - if (bch2_metadata_versions[i].version > v && - BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) == - BCH_VERSION_MAJOR(v)) - v = bch2_metadata_versions[i].version; - - return v; -} - -int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) -{ - int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && - version <= c->sb.version_incompat_allowed) - ? 
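/*
 * [Editorial sketch, not part of the original diff.]
 * bch2_latest_compatible_version() above only walks forward within a
 * single major version: newer minors are compatible, a newer major is
 * not. The rule as a standalone predicate:
 */
static bool version_is_compatible_upgrade(enum bcachefs_metadata_version have,
					  enum bcachefs_metadata_version want)
{
	return want > have &&
	       BCH_VERSION_MAJOR(want) == BCH_VERSION_MAJOR(have);
}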
0
-		: -BCH_ERR_may_not_use_incompat_feature;
-
-	mutex_lock(&c->sb_lock);
-	if (!ret) {
-		SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
-			max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
-		bch2_write_super(c);
-	} else {
-		darray_for_each(c->incompat_versions_requested, i)
-			if (version == *i)
-				goto out;
-
-		darray_push(&c->incompat_versions_requested, version);
-		struct printbuf buf = PRINTBUF;
-		prt_str(&buf, "requested incompat feature ");
-		bch2_version_to_text(&buf, version);
-		prt_str(&buf, " currently not enabled, allowed up to ");
-		bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
-		prt_printf(&buf, "\n set version_upgrade=incompat to enable");
-
-		bch_notice(c, "%s", buf.buf);
-		printbuf_exit(&buf);
-	}
-
-out:
-	mutex_unlock(&c->sb_lock);
-
-	return ret;
-}
-
-const char * const bch2_sb_fields[] = {
-#define x(name, nr)	#name,
-	BCH_SB_FIELDS()
-#undef x
-	NULL
-};
-
-static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
-				enum bch_validate_flags, struct printbuf *);
-
-struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
-				enum bch_sb_field_type type)
-{
-	/* XXX: need locking around superblock to access optional fields */
-
-	vstruct_for_each(sb, f)
-		if (le32_to_cpu(f->type) == type)
-			return f;
-	return NULL;
-}
-
-static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
-					struct bch_sb_field *f,
-					unsigned u64s)
-{
-	unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
-	unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
-
-	BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size);
-
-	if (!f && !u64s) {
-		/* nothing to do: */
-	} else if (!f) {
-		f = vstruct_last(sb->sb);
-		memset(f, 0, sizeof(u64) * u64s);
-		f->u64s = cpu_to_le32(u64s);
-		f->type = 0;
-	} else {
-		void *src, *dst;
-
-		src = vstruct_end(f);
-
-		if (u64s) {
-			f->u64s = cpu_to_le32(u64s);
-			dst = vstruct_end(f);
-		} else {
-			dst = f;
-		}
-
-		memmove(dst, src, vstruct_end(sb->sb) - src);
-
-		if (dst > src)
-			memset(src, 0, dst - src);
-	}
-
-	sb->sb->u64s = cpu_to_le32(sb_u64s);
-
-	return u64s ?
f : NULL; -} - -void bch2_sb_field_delete(struct bch_sb_handle *sb, - enum bch_sb_field_type type) -{ - struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); - - if (f) - __bch2_sb_field_resize(sb, f, 0); -} - -/* Superblock realloc/free: */ - -void bch2_free_super(struct bch_sb_handle *sb) -{ - kfree(sb->bio); - if (!IS_ERR_OR_NULL(sb->s_bdev_file)) - bdev_fput(sb->s_bdev_file); - kfree(sb->holder); - kfree(sb->sb_name); - - kfree(sb->sb); - memset(sb, 0, sizeof(*sb)); -} - -int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) -{ - size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); - size_t new_buffer_size; - struct bch_sb *new_sb; - struct bio *bio; - - if (sb->bdev) - new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev)); - - new_buffer_size = roundup_pow_of_two(new_bytes); - - if (sb->sb && sb->buffer_size >= new_buffer_size) - return 0; - - if (sb->sb && sb->have_layout) { - u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; - - if (new_bytes > max_bytes) { - struct printbuf buf = PRINTBUF; - - prt_bdevname(&buf, sb->bdev); - prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes); - pr_err("%s", buf.buf); - printbuf_exit(&buf); - return -BCH_ERR_ENOSPC_sb; - } - } - - if (sb->buffer_size >= new_buffer_size && sb->sb) - return 0; - - if (dynamic_fault("bcachefs:add:super_realloc")) - return -BCH_ERR_ENOMEM_sb_realloc_injected; - - new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); - if (!new_sb) - return -BCH_ERR_ENOMEM_sb_buf_realloc; - - sb->sb = new_sb; - - if (sb->have_bio) { - unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size); - - bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); - if (!bio) - return -BCH_ERR_ENOMEM_sb_bio_realloc; - - bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); - - kfree(sb->bio); - sb->bio = bio; - } - - sb->buffer_size = new_buffer_size; - - return 0; -} - -struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, - enum bch_sb_field_type type, - unsigned u64s) -{ - struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); - ssize_t old_u64s = f ? 
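/*
 * [Editorial sketch, not part of the original diff.] The growth limit
 * enforced in bch2_sb_realloc() above comes straight from the layout:
 * a superblock slot may occupy at most 512 << sb_max_size_bits bytes,
 * e.g. sb_max_size_bits = 11 allows 512 << 11 = 1 MiB. As a helper:
 */
static u64 sb_max_bytes(const struct bch_sb_layout *layout)
{
	/* the shift is in units of one 512-byte sector */
	return 512ULL << layout->sb_max_size_bits;
}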
le32_to_cpu(f->u64s) : 0; - ssize_t d = -old_u64s + u64s; - - if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) - return NULL; - - if (sb->fs_sb) { - struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); - - lockdep_assert_held(&c->sb_lock); - - /* XXX: we're not checking that offline device have enough space */ - - for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) { - struct bch_sb_handle *dev_sb = &ca->disk_sb; - - if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize); - return NULL; - } - } - } - - f = bch2_sb_field_get_id(sb->sb, type); - f = __bch2_sb_field_resize(sb, f, u64s); - if (f) - f->type = cpu_to_le32(type); - return f; -} - -struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *sb, - enum bch_sb_field_type type, - unsigned u64s) -{ - struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); - - if (!f || le32_to_cpu(f->u64s) < u64s) - f = bch2_sb_field_resize_id(sb, type, u64s); - return f; -} - -/* Superblock validate: */ - -static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) -{ - u64 offset, prev_offset, max_sectors; - unsigned i; - - BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); - - if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && - !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { - prt_printf(out, "Not a bcachefs superblock layout"); - return -BCH_ERR_invalid_sb_layout; - } - - if (layout->layout_type != 0) { - prt_printf(out, "Invalid superblock layout type %u", - layout->layout_type); - return -BCH_ERR_invalid_sb_layout_type; - } - - if (!layout->nr_superblocks) { - prt_printf(out, "Invalid superblock layout: no superblocks"); - return -BCH_ERR_invalid_sb_layout_nr_superblocks; - } - - if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { - prt_printf(out, "Invalid superblock layout: too many superblocks"); - return -BCH_ERR_invalid_sb_layout_nr_superblocks; - } - - if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) { - prt_printf(out, "Invalid superblock layout: max_size_bits too high"); - return -BCH_ERR_invalid_sb_layout_sb_max_size_bits; - } - - max_sectors = 1 << layout->sb_max_size_bits; - - prev_offset = le64_to_cpu(layout->sb_offset[0]); - - for (i = 1; i < layout->nr_superblocks; i++) { - offset = le64_to_cpu(layout->sb_offset[i]); - - if (offset < prev_offset + max_sectors) { - prt_printf(out, "Invalid superblock layout: superblocks overlap\n" - " (sb %u ends at %llu next starts at %llu", - i - 1, prev_offset + max_sectors, offset); - return -BCH_ERR_invalid_sb_layout_superblocks_overlap; - } - prev_offset = offset; - } - - return 0; -} - -static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) -{ - u16 version = le16_to_cpu(sb->version); - u16 version_min = le16_to_cpu(sb->version_min); - - if (!bch2_version_compatible(version)) { - prt_str(out, "Unsupported superblock version "); - bch2_version_to_text(out, version); - prt_str(out, " (min "); - bch2_version_to_text(out, bcachefs_metadata_version_min); - prt_str(out, ", max "); - bch2_version_to_text(out, bcachefs_metadata_version_current); - prt_str(out, ")"); - return -BCH_ERR_invalid_sb_version; - } - - if (!bch2_version_compatible(version_min)) { - prt_str(out, "Unsupported superblock version_min "); - bch2_version_to_text(out, version_min); - prt_str(out, " (min "); - bch2_version_to_text(out, bcachefs_metadata_version_min); - prt_str(out, ", max "); - bch2_version_to_text(out, 
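/*
 * [Editorial sketch, not part of the original diff.] The overlap check
 * in validate_sb_layout() above reserves the full maximum size for
 * every slot. With sb_max_size_bits = 6 (64 sectors), offsets 8 and 72
 * are the tightest legal packing, since 8 + 64 <= 72. Restated:
 */
static bool sb_offsets_ok(u64 prev_offset, u64 offset, unsigned sb_max_size_bits)
{
	u64 max_sectors = 1ULL << sb_max_size_bits;

	/* the next superblock must start past the previous one's window */
	return offset >= prev_offset + max_sectors;
}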
bcachefs_metadata_version_current);
-		prt_str(out, ")");
-		return -BCH_ERR_invalid_sb_version;
-	}
-
-	if (version_min > version) {
-		prt_str(out, "Bad minimum version ");
-		bch2_version_to_text(out, version_min);
-		prt_str(out, ", greater than version field ");
-		bch2_version_to_text(out, version);
-		return -BCH_ERR_invalid_sb_version;
-	}
-
-	return 0;
-}
-
-int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
-		enum bch_validate_flags flags, struct printbuf *out)
-{
-	enum bch_opt_id opt_id;
-	int ret;
-
-	ret = bch2_sb_compatible(sb, out);
-	if (ret)
-		return ret;
-
-	u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
-	unsigned incompat_bit = 0;
-	if (incompat)
-		incompat_bit = __ffs64(incompat);
-	else if (sb->features[1])
-		incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
-
-	if (incompat_bit) {
-		prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
-			   incompat_bit,
-			   bch2_sb_features[BCH_FEATURE_NR - 1],
-			   BCH_FEATURE_NR - 1);
-		return -BCH_ERR_invalid_sb_features;
-	}
-
-	if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
-	    BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
-		prt_str(out, "Filesystem has incompatible version ");
-		bch2_version_to_text(out, le16_to_cpu(sb->version));
-		prt_str(out, ", current version ");
-		bch2_version_to_text(out, bcachefs_metadata_version_current);
-		return -BCH_ERR_invalid_sb_features;
-	}
-
-	if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
-		prt_printf(out, "Bad user UUID (got zeroes)");
-		return -BCH_ERR_invalid_sb_uuid;
-	}
-
-	if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
-		prt_printf(out, "Bad internal UUID (got zeroes)");
-		return -BCH_ERR_invalid_sb_uuid;
-	}
-
-	if (!(flags & BCH_VALIDATE_write) &&
-	    le64_to_cpu(sb->offset) != read_offset) {
-		prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
-			   le64_to_cpu(sb->offset), read_offset);
-		return -BCH_ERR_invalid_sb_offset;
-	}
-
-	if (!sb->nr_devices ||
-	    sb->nr_devices > BCH_SB_MEMBERS_MAX) {
-		prt_printf(out, "Bad number of member devices %u (max %u)",
-			   sb->nr_devices, BCH_SB_MEMBERS_MAX);
-		return -BCH_ERR_invalid_sb_too_many_members;
-	}
-
-	if (sb->dev_idx >= sb->nr_devices) {
-		prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)",
-			   sb->dev_idx, sb->nr_devices);
-		return -BCH_ERR_invalid_sb_dev_idx;
-	}
-
-	if (!sb->time_precision ||
-	    le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
-		prt_printf(out, "Invalid time precision: %u (min 1, max %lu)",
-			   le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
-		return -BCH_ERR_invalid_sb_time_precision;
-	}
-
-	/* old versions didn't know to downgrade this field */
-	if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version))
-		SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version));
-
-	if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) {
-		prt_printf(out, "Invalid version_incompat ");
-		bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
-		prt_str(out, " > incompat_allowed ");
-		bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
-		if (flags & BCH_VALIDATE_write)
-			return -BCH_ERR_invalid_sb_version;
-		else
-			SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb));
-	}
-
-	if (sb->nr_devices > 1)
-		SET_BCH_SB_MULTI_DEVICE(sb, true);
-
-	if (!flags) {
-		/*
-		 * Been seeing a bug where these are getting inexplicably
-		 * zeroed, so we're now validating them, but we have to be
-		 * careful not to prevent people's
filesystems from mounting: - */ - if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) - SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); - if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) - SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); - - if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); - - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 && - !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb)) - SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30); - - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) - SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); - - if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) - SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); - - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags && - !BCH_SB_CSUM_ERR_RETRY_NR(sb)) - SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3); - } - -#ifdef __KERNEL__ - if (!BCH_SB_SHARD_INUMS_NBITS(sb)) - SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus()))); -#endif - - for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { - const struct bch_option *opt = bch2_opt_table + opt_id; - - if (opt->get_sb) { - u64 v = bch2_opt_from_sb(sb, opt_id, -1); - - prt_printf(out, "Invalid option "); - ret = bch2_opt_validate(opt, v, out); - if (ret) - return ret; - - printbuf_reset(out); - } - } - - /* validate layout */ - ret = validate_sb_layout(&sb->layout, out); - if (ret) - return ret; - - vstruct_for_each(sb, f) { - if (!f->u64s) { - prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)", - le32_to_cpu(f->type)); - return -BCH_ERR_invalid_sb_field_size; - } - - if (vstruct_next(f) > vstruct_last(sb)) { - prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", - le32_to_cpu(f->type)); - return -BCH_ERR_invalid_sb_field_size; - } - } - - struct bch_sb_field *mi = - bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?: - bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1); - - /* members must be validated first: */ - if (!mi) { - prt_printf(out, "Invalid superblock: member info area missing"); - return -BCH_ERR_invalid_sb_members_missing; - } - - ret = bch2_sb_field_validate(sb, mi, flags, out); - if (ret) - return ret; - - vstruct_for_each(sb, f) { - if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1) - continue; - - ret = bch2_sb_field_validate(sb, f, flags, out); - if (ret) - return ret; - } - - if ((flags & BCH_VALIDATE_write) && - bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) { - prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu", - le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq), - le64_to_cpu(sb->seq)); - return -BCH_ERR_invalid_sb_members_missing; - } - - return 0; -} - -/* device open: */ - -static unsigned long le_ulong_to_cpu(unsigned long v) -{ - return sizeof(unsigned long) == 8 - ? 
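/*
 * [Editorial sketch, not part of the original diff.] On-disk bitmaps
 * such as ext->errors_silent are arrays of little-endian words, while
 * the native bitmap ops work on host-endian unsigned longs, so they are
 * converted word by word before test_bit() and friends can be used. A
 * hypothetical consumer:
 */
static bool sb_error_is_silent(struct bch_sb_field_ext *ext, unsigned err_id)
{
	unsigned long bits[BITS_TO_LONGS(sizeof(ext->errors_silent) * 8)];

	le_bitvector_to_cpu(bits, (void *) ext->errors_silent,
			    sizeof(ext->errors_silent) * 8);
	return test_bit(err_id, bits);
}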
le64_to_cpu(v) - : le32_to_cpu(v); -} - -static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned nr) -{ - BUG_ON(nr & (BITS_PER_TYPE(long) - 1)); - - for (unsigned i = 0; i < BITS_TO_LONGS(nr); i++) - dst[i] = le_ulong_to_cpu(src[i]); -} - -static void bch2_sb_update(struct bch_fs *c) -{ - struct bch_sb *src = c->disk_sb.sb; - - lockdep_assert_held(&c->sb_lock); - - c->sb.uuid = src->uuid; - c->sb.user_uuid = src->user_uuid; - c->sb.version = le16_to_cpu(src->version); - c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src); - c->sb.version_incompat_allowed - = BCH_SB_VERSION_INCOMPAT_ALLOWED(src); - c->sb.version_min = le16_to_cpu(src->version_min); - c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); - c->sb.nr_devices = src->nr_devices; - c->sb.clean = BCH_SB_CLEAN(src); - c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); - - c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); - c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; - - /* XXX this is wrong, we need a 96 or 128 bit integer type */ - c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), - c->sb.nsec_per_time_unit); - c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); - - c->sb.features = le64_to_cpu(src->features[0]); - c->sb.compat = le64_to_cpu(src->compat[0]); - c->sb.multi_device = BCH_SB_MULTI_DEVICE(src); - - memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); - - struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); - if (ext) { - c->sb.recovery_passes_required = - bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - - le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, - sizeof(c->sb.errors_silent) * 8); - c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); - } - - for_each_member_device(c, ca) { - struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); - ca->mi = bch2_mi_to_cpu(&m); - } -} - -static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) -{ - struct bch_sb_field *src_f, *dst_f; - struct bch_sb *dst = dst_handle->sb; - unsigned i; - - dst->version = src->version; - dst->version_min = src->version_min; - dst->seq = src->seq; - dst->uuid = src->uuid; - dst->user_uuid = src->user_uuid; - memcpy(dst->label, src->label, sizeof(dst->label)); - - dst->block_size = src->block_size; - dst->nr_devices = src->nr_devices; - - dst->time_base_lo = src->time_base_lo; - dst->time_base_hi = src->time_base_hi; - dst->time_precision = src->time_precision; - dst->write_time = src->write_time; - - memcpy(dst->flags, src->flags, sizeof(dst->flags)); - memcpy(dst->features, src->features, sizeof(dst->features)); - memcpy(dst->compat, src->compat, sizeof(dst->compat)); - - for (i = 0; i < BCH_SB_FIELD_NR; i++) { - int d; - - if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) - continue; - - src_f = bch2_sb_field_get_id(src, i); - dst_f = bch2_sb_field_get_id(dst, i); - - d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - - (dst_f ? le32_to_cpu(dst_f->u64s) : 0); - if (d > 0) { - int ret = bch2_sb_realloc(dst_handle, - le32_to_cpu(dst_handle->sb->u64s) + d); - - if (ret) - return ret; - - dst = dst_handle->sb; - dst_f = bch2_sb_field_get_id(dst, i); - } - - dst_f = __bch2_sb_field_resize(dst_handle, dst_f, - src_f ? 
le32_to_cpu(src_f->u64s) : 0); - - if (src_f) - memcpy(dst_f, src_f, vstruct_bytes(src_f)); - } - - return 0; -} - -int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) -{ - int ret; - - lockdep_assert_held(&c->sb_lock); - - ret = bch2_sb_realloc(&c->disk_sb, 0) ?: - __copy_super(&c->disk_sb, src) ?: - bch2_sb_replicas_to_cpu_replicas(c) ?: - bch2_sb_disk_groups_to_cpu(c); - if (ret) - return ret; - - bch2_sb_update(c); - return 0; -} - -int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) -{ - return __copy_super(&ca->disk_sb, c->disk_sb.sb); -} - -/* read superblock: */ - -static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) -{ - size_t bytes; - int ret; -reread: - bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); - sb->bio->bi_iter.bi_sector = offset; - bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); - - ret = submit_bio_wait(sb->bio); - if (ret) { - prt_printf(err, "IO error: %i", ret); - return ret; - } - - if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && - !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { - prt_str(err, "Not a bcachefs superblock (got magic "); - pr_uuid(err, sb->sb->magic.b); - prt_str(err, ")"); - return -BCH_ERR_invalid_sb_magic; - } - - ret = bch2_sb_compatible(sb->sb, err); - if (ret) - return ret; - - bytes = vstruct_bytes(sb->sb); - - u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits); - if (bytes > sb_size) { - prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)", - bytes, sb_size); - return -BCH_ERR_invalid_sb_too_big; - } - - if (bytes > sb->buffer_size) { - ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)); - if (ret) - return ret; - goto reread; - } - - enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb); - if (csum_type >= BCH_CSUM_NR || - bch2_csum_type_is_encryption(csum_type)) { - prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); - return -BCH_ERR_invalid_sb_csum_type; - } - - /* XXX: verify MACs */ - struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb); - if (bch2_crc_cmp(csum, sb->sb->csum)) { - bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum); - return -BCH_ERR_invalid_sb_csum; - } - - sb->seq = le64_to_cpu(sb->sb->seq); - - return 0; -} - -static int __bch2_read_super(const char *path, struct bch_opts *opts, - struct bch_sb_handle *sb, bool ignore_notbchfs_msg) -{ - u64 offset = opt_get(*opts, sb); - struct bch_sb_layout layout; - struct printbuf err = PRINTBUF; - struct printbuf err2 = PRINTBUF; - __le64 *i; - int ret; -#ifndef __KERNEL__ -retry: -#endif - memset(sb, 0, sizeof(*sb)); - sb->mode = BLK_OPEN_READ; - sb->have_bio = true; - sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL); - if (!sb->holder) - return -ENOMEM; - - sb->sb_name = kstrdup(path, GFP_KERNEL); - if (!sb->sb_name) { - ret = -ENOMEM; - prt_printf(&err, "error allocating memory for sb_name"); - goto err; - } - -#ifndef __KERNEL__ - if (opt_get(*opts, direct_io) == false) - sb->mode |= BLK_OPEN_BUFFERED; -#endif - - if (!opt_get(*opts, noexcl)) - sb->mode |= BLK_OPEN_EXCL; - - if (!opt_get(*opts, nochanges)) - sb->mode |= BLK_OPEN_WRITE; - - sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (IS_ERR(sb->s_bdev_file) && - PTR_ERR(sb->s_bdev_file) == -EACCES && - opt_get(*opts, read_only)) { - sb->mode &= ~BLK_OPEN_WRITE; - - sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (!IS_ERR(sb->s_bdev_file)) - 
opt_set(*opts, nochanges, true); - } - - if (IS_ERR(sb->s_bdev_file)) { - ret = PTR_ERR(sb->s_bdev_file); - prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret)); - goto err; - } - sb->bdev = file_bdev(sb->s_bdev_file); - - ret = bch2_sb_realloc(sb, 0); - if (ret) { - prt_printf(&err, "error allocating memory for superblock"); - goto err; - } - - if (bch2_fs_init_fault("read_super")) { - prt_printf(&err, "dynamic fault"); - ret = -EFAULT; - goto err; - } - - ret = read_one_super(sb, offset, &err); - if (!ret) - goto got_super; - - if (opt_defined(*opts, sb)) - goto err; - - prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n", - path, err.buf); - if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg) - bch2_print_opts(opts, KERN_INFO "%s", err2.buf); - else - bch2_print_opts(opts, KERN_ERR "%s", err2.buf); - - printbuf_exit(&err2); - printbuf_reset(&err); - - /* - * Error reading primary superblock - read location of backup - * superblocks: - */ - bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); - sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; - /* - * use sb buffer to read layout, since sb buffer is page aligned but - * layout won't be: - */ - bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); - - ret = submit_bio_wait(sb->bio); - if (ret) { - prt_printf(&err, "IO error: %i", ret); - goto err; - } - - memcpy(&layout, sb->sb, sizeof(layout)); - ret = validate_sb_layout(&layout, &err); - if (ret) - goto err; - - for (i = layout.sb_offset; - i < layout.sb_offset + layout.nr_superblocks; i++) { - offset = le64_to_cpu(*i); - - if (offset == opt_get(*opts, sb)) { - ret = -BCH_ERR_invalid; - continue; - } - - ret = read_one_super(sb, offset, &err); - if (!ret) - goto got_super; - } - - goto err; - -got_super: - if (le16_to_cpu(sb->sb->block_size) << 9 < - bdev_logical_block_size(sb->bdev) && - opt_get(*opts, direct_io)) { -#ifndef __KERNEL__ - opt_set(*opts, direct_io, false); - bch2_free_super(sb); - goto retry; -#endif - prt_printf(&err, "block size (%u) smaller than device block size (%u)", - le16_to_cpu(sb->sb->block_size) << 9, - bdev_logical_block_size(sb->bdev)); - ret = -BCH_ERR_block_size_too_small; - goto err; - } - - sb->have_layout = true; - - ret = bch2_sb_validate(sb->sb, offset, 0, &err); - if (ret) { - bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", - path, err.buf); - goto err_no_print; - } -out: - printbuf_exit(&err); - return ret; -err: - bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n", - path, err.buf); -err_no_print: - bch2_free_super(sb); - goto out; -} - -int bch2_read_super(const char *path, struct bch_opts *opts, - struct bch_sb_handle *sb) -{ - return __bch2_read_super(path, opts, sb, false); -} - -/* provide a silenced version for mount.bcachefs */ - -int bch2_read_super_silent(const char *path, struct bch_opts *opts, - struct bch_sb_handle *sb) -{ - return __bch2_read_super(path, opts, sb, true); -} - -/* write superblock: */ - -static void write_super_endio(struct bio *bio) -{ - struct bch_dev *ca = bio->bi_private; - - bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status); - - /* XXX: return errors directly */ - - if (bio->bi_status) { - bch_err_dev_ratelimited(ca, "superblock %s error: %s", - str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status)); - ca->sb_write_error = 1; - } - - closure_put(&ca->fs->sb_write); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); -} - -static void 
read_back_super(struct bch_fs *c, struct bch_dev *ca) -{ - struct bch_sb *sb = ca->disk_sb.sb; - struct bio *bio = ca->disk_sb.bio; - - memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE); - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); - bio->bi_end_io = write_super_endio; - bio->bi_private = ca; - bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE); - - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); - - enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - closure_bio_submit(bio, &c->sb_write); -} - -static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) -{ - struct bch_sb *sb = ca->disk_sb.sb; - struct bio *bio = ca->disk_sb.bio; - - sb->offset = sb->layout.sb_offset[idx]; - - SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); - sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), - null_nonce(), sb); - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); - bio->bi_end_io = write_super_endio; - bio->bi_private = ca; - bch2_bio_map(bio, sb, - roundup((size_t) vstruct_bytes(sb), - bdev_logical_block_size(ca->disk_sb.bdev))); - - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], - bio_sectors(bio)); - - enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - closure_bio_submit(bio, &c->sb_write); -} - -int bch2_write_super(struct bch_fs *c) -{ - struct closure *cl = &c->sb_write; - struct printbuf err = PRINTBUF; - unsigned sb = 0, nr_wrote; - struct bch_devs_mask sb_written; - bool wrote, can_mount_without_written, can_mount_with_written; - unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; - DARRAY(struct bch_dev *) online_devices = {}; - int ret = 0; - - trace_and_count(c, write_super, c, _RET_IP_); - - if (c->opts.degraded == BCH_DEGRADED_very) - degraded_flags |= BCH_FORCE_IF_LOST; - - lockdep_assert_held(&c->sb_lock); - - closure_init_stack(cl); - memset(&sb_written, 0, sizeof(sb_written)); - - /* - * Note: we do writes to RO devices here, and we might want to change - * that in the future. 
- * - * For now, we expect to be able to call write_super() when we're not - * yet RW: - */ - for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) { - ret = darray_push(&online_devices, ca); - if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) { - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - goto out; - } - enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - } - - /* Make sure we're using the new magic numbers: */ - c->disk_sb.sb->magic = BCHFS_MAGIC; - c->disk_sb.sb->layout.magic = BCHFS_MAGIC; - - le64_add_cpu(&c->disk_sb.sb->seq, 1); - - struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - darray_for_each(online_devices, ca) - __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq; - c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds()); - - if (test_bit(BCH_FS_error, &c->flags)) - SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); - if (test_bit(BCH_FS_topology_error, &c->flags)) - SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); - - SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); - - bch2_sb_counters_from_cpu(c); - bch2_sb_members_from_cpu(c); - bch2_sb_members_cpy_v2_v1(&c->disk_sb); - bch2_sb_errors_from_cpu(c); - bch2_sb_downgrade_update(c); - - darray_for_each(online_devices, ca) - bch2_sb_from_fs(c, (*ca)); - - darray_for_each(online_devices, ca) { - printbuf_reset(&err); - - ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err); - if (ret) { - bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); - goto out; - } - } - - if (c->opts.nochanges) - goto out; - - /* - * Defer writing the superblock until filesystem initialization is - * complete - don't write out a partly initialized superblock: - */ - if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) - goto out; - - if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) { - struct printbuf buf = PRINTBUF; - prt_printf(&buf, "attempting to write superblock that wasn't version downgraded ("); - bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version)); - prt_str(&buf, " > "); - bch2_version_to_text(&buf, bcachefs_metadata_version_current); - prt_str(&buf, ")"); - bch2_fs_fatal_error(c, ": %s", buf.buf); - printbuf_exit(&buf); - ret = bch_err_throw(c, sb_not_downgraded); - goto out; - } - - darray_for_each(online_devices, ca) { - __set_bit((*ca)->dev_idx, sb_written.d); - (*ca)->sb_write_error = 0; - } - - darray_for_each(online_devices, ca) - read_back_super(c, *ca); - closure_sync(cl); - - darray_for_each(online_devices, cap) { - struct bch_dev *ca = *cap; - - if (ca->sb_write_error) - continue; - - if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { - struct printbuf buf = PRINTBUF; - prt_char(&buf, ' '); - prt_bdevname(&buf, ca->disk_sb.bdev); - prt_printf(&buf, - ": Superblock write was silently dropped! 
(seq %llu expected %llu)", - le64_to_cpu(ca->sb_read_scratch->seq), - ca->disk_sb.seq); - - if (c->opts.errors != BCH_ON_ERROR_continue && - c->opts.errors != BCH_ON_ERROR_fix_safe) { - ret = bch_err_throw(c, erofs_sb_err); - bch2_fs_fatal_error(c, "%s", buf.buf); - } else { - bch_err(c, "%s", buf.buf); - } - - printbuf_exit(&buf); - } - - if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { - struct printbuf buf = PRINTBUF; - prt_char(&buf, ' '); - prt_bdevname(&buf, ca->disk_sb.bdev); - prt_printf(&buf, - ": Superblock modified by another process (seq %llu expected %llu)", - le64_to_cpu(ca->sb_read_scratch->seq), - ca->disk_sb.seq); - bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = bch_err_throw(c, erofs_sb_err); - } - } - - if (ret) - goto out; - - do { - wrote = false; - darray_for_each(online_devices, cap) { - struct bch_dev *ca = *cap; - if (!ca->sb_write_error && - sb < ca->disk_sb.sb->layout.nr_superblocks) { - write_one_super(c, ca, sb); - wrote = true; - } - } - closure_sync(cl); - sb++; - } while (wrote); - - darray_for_each(online_devices, cap) { - struct bch_dev *ca = *cap; - if (ca->sb_write_error) - __clear_bit(ca->dev_idx, sb_written.d); - else - ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); - } - - nr_wrote = dev_mask_nr(&sb_written); - - can_mount_with_written = - bch2_have_enough_devs(c, sb_written, degraded_flags, false); - - for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++) - sb_written.d[i] = ~sb_written.d[i]; - - can_mount_without_written = - bch2_have_enough_devs(c, sb_written, degraded_flags, false); - - /* - * If we would be able to mount _without_ the devices we successfully - * wrote superblocks to, we weren't able to write to enough devices: - * - * Exception: if we can mount without the successes because we haven't - * written anything (new filesystem), we continue if we'd be able to - * mount with the devices we did successfully write to: - */ - if (bch2_fs_fatal_err_on(!nr_wrote || - !can_mount_with_written || - (can_mount_without_written && - !can_mount_with_written), c, - ": Unable to write superblock to sufficient devices (from %ps)", - (void *) _RET_IP_)) - ret = bch_err_throw(c, erofs_sb_err); -out: - /* Make new options visible after they're persistent: */ - bch2_sb_update(c); - darray_for_each(online_devices, ca) - enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super); - darray_exit(&online_devices); - printbuf_exit(&err); - return ret; -} - -void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) -{ - mutex_lock(&c->sb_lock); - if (!(c->sb.features & (1ULL << feat))) { - c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); - - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); -} - -/* Downgrade if superblock is at a higher version than currently supported: */ -bool bch2_check_version_downgrade(struct bch_fs *c) -{ - bool ret = bcachefs_metadata_version_current < c->sb.version; - - lockdep_assert_held(&c->sb_lock); - - /* - * Downgrade, if superblock is at a higher version than currently - * supported: - * - * c->sb will be checked before we write the superblock, so update it as - * well: - */ - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current) - SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current); - if (c->sb.version > 
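/*
 * [Editorial sketch, not part of the original diff.] The success test
 * applied after the write loop in bch2_write_super() above, pulled out
 * into a pure predicate: a write round failed if nothing was written,
 * if the written set isn't mountable on its own, or if the filesystem
 * would still mount from only the devices we failed to write (i.e. we
 * didn't reach enough of them):
 */
static bool sb_write_round_failed(unsigned nr_wrote,
				  bool can_mount_with_written,
				  bool can_mount_without_written)
{
	return !nr_wrote ||
	       !can_mount_with_written ||
	       (can_mount_without_written && !can_mount_with_written);
}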
bcachefs_metadata_version_current) - c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - if (c->sb.version_min > bcachefs_metadata_version_current) - c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); - c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); - return ret; -} - -void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) -{ - lockdep_assert_held(&c->sb_lock); - - if (BCH_VERSION_MAJOR(new_version) > - BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) - bch2_sb_field_resize(&c->disk_sb, downgrade, 0); - - c->disk_sb.sb->version = cpu_to_le16(new_version); - - if (incompat) { - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); - SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, - max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); - } -} - -void bch2_sb_upgrade_incompat(struct bch_fs *c) -{ - mutex_lock(&c->sb_lock); - if (c->sb.version == c->sb.version_incompat_allowed) - goto unlock; - - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Now allowing incompatible features up to "); - bch2_version_to_text(&buf, c->sb.version); - prt_str(&buf, ", previously allowed up to "); - bch2_version_to_text(&buf, c->sb.version_incompat_allowed); - prt_newline(&buf); - - bch_notice(c, "%s", buf.buf); - printbuf_exit(&buf); - - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); - SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, - max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version)); - bch2_write_super(c); -unlock: - mutex_unlock(&c->sb_lock); -} - -static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - if (vstruct_bytes(f) < 88) { - prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88); - return -BCH_ERR_invalid_sb_ext; - } - - return 0; -} - -static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_ext *e = field_to_type(f, ext); - - prt_printf(out, "Recovery passes required:\t"); - prt_bitflags(out, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0]))); - prt_newline(out); - - unsigned long *errors_silent = kmalloc(sizeof(e->errors_silent), GFP_KERNEL); - if (errors_silent) { - le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8); - - prt_printf(out, "Errors to silently fix:\t"); - prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, - min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8)); - prt_newline(out); - - kfree(errors_silent); - } - - prt_printf(out, "Btrees with missing data:\t"); - prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data)); - prt_newline(out); -} - -static const struct bch_sb_field_ops bch_sb_field_ops_ext = { - .validate = bch2_sb_ext_validate, - .to_text = bch2_sb_ext_to_text, -}; - -static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { -#define x(f, nr) \ - [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, - BCH_SB_FIELDS() -#undef x -}; - -static const struct bch_sb_field_ops bch2_sb_field_null_ops; - -static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) -{ - return likely(type < ARRAY_SIZE(bch2_sb_field_ops)) - ? 
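/*
 * [Editorial sketch, not part of the original diff.] The ops table
 * above is generated with an x-macro, and unknown field types resolve
 * to empty null ops so that older code can skip over fields introduced
 * by newer versions. The same pattern in miniature:
 */
#define EXAMPLE_FIELDS()	\
	x(alpha, 0)		\
	x(beta, 1)

enum example_field_type {
#define x(name, nr)	EXAMPLE_FIELD_##name = nr,
	EXAMPLE_FIELDS()
#undef x
	EXAMPLE_FIELD_NR
};

static const char * const example_field_names[] = {
#define x(name, nr)	[EXAMPLE_FIELD_##name] = #name,
	EXAMPLE_FIELDS()
#undef x
};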
bch2_sb_field_ops[type] - : &bch2_sb_field_null_ops; -} - -static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - unsigned type = le32_to_cpu(f->type); - struct printbuf field_err = PRINTBUF; - const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); - int ret; - - ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0; - if (ret) { - prt_printf(err, "Invalid superblock section %s: %s", - bch2_sb_fields[type], field_err.buf); - prt_newline(err); - bch2_sb_field_to_text(err, sb, f); - } - - printbuf_exit(&field_err); - return ret; -} - -void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - unsigned type = le32_to_cpu(f->type); - const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); - - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - if (ops->to_text) - ops->to_text(out, sb, f); -} - -void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - unsigned type = le32_to_cpu(f->type); - - if (type < BCH_SB_FIELD_NR) - prt_printf(out, "%s", bch2_sb_fields[type]); - else - prt_printf(out, "(unknown field %u)", type); - - prt_printf(out, " (size %zu):", vstruct_bytes(f)); - prt_newline(out); - - __bch2_sb_field_to_text(out, sb, f); -} - -void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) -{ - unsigned i; - - prt_printf(out, "Type: %u", l->layout_type); - prt_newline(out); - - prt_str(out, "Superblock max size: "); - prt_units_u64(out, 512 << l->sb_max_size_bits); - prt_newline(out); - - prt_printf(out, "Nr superblocks: %u", l->nr_superblocks); - prt_newline(out); - - prt_str(out, "Offsets: "); - for (i = 0; i < l->nr_superblocks; i++) { - if (i) - prt_str(out, ", "); - prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i])); - } - prt_newline(out); -} - -void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, - bool print_layout, unsigned fields) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 44); - - prt_printf(out, "External UUID:\t"); - pr_uuid(out, sb->user_uuid.b); - prt_newline(out); - - prt_printf(out, "Internal UUID:\t"); - pr_uuid(out, sb->uuid.b); - prt_newline(out); - - prt_printf(out, "Magic number:\t"); - pr_uuid(out, sb->magic.b); - prt_newline(out); - - prt_printf(out, "Device index:\t%u\n", sb->dev_idx); - - prt_printf(out, "Label:\t"); - if (!strlen(sb->label)) - prt_printf(out, "(none)"); - else - prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); - prt_newline(out); - - prt_printf(out, "Version:\t"); - bch2_version_to_text(out, le16_to_cpu(sb->version)); - prt_newline(out); - - prt_printf(out, "Incompatible features allowed:\t"); - bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); - prt_newline(out); - - prt_printf(out, "Incompatible features in use:\t"); - bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); - prt_newline(out); - - prt_printf(out, "Version upgrade complete:\t"); - bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); - prt_newline(out); - - prt_printf(out, "Oldest version on disk:\t"); - bch2_version_to_text(out, le16_to_cpu(sb->version_min)); - prt_newline(out); - - prt_printf(out, "Created:\t"); - if (sb->time_base_lo) - bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); - else - prt_printf(out, "(not set)"); - prt_newline(out); - - prt_printf(out, "Sequence number:\t"); - prt_printf(out, "%llu", le64_to_cpu(sb->seq)); - prt_newline(out); - - 
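/*
 * [Editorial sketch, not part of the original diff.] The aligned
 * two-column output in bch2_sb_to_text() is driven by printbuf
 * tabstops: one tabstop is pushed up front and each "key:\t" jumps to
 * the value column. The pattern reduced to a helper:
 */
static void example_kv_line(struct printbuf *out, const char *key, u64 v)
{
	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 44);

	prt_printf(out, "%s:\t%llu", key, v);
	prt_newline(out);
}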
prt_printf(out, "Time of last write:\t"); - bch2_prt_datetime(out, le64_to_cpu(sb->write_time)); - prt_newline(out); - - prt_printf(out, "Superblock size:\t"); - prt_units_u64(out, vstruct_bytes(sb)); - prt_str(out, "/"); - prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits); - prt_newline(out); - - prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb)); - prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb)); - - prt_printf(out, "Sections:\t"); - u64 fields_have = 0; - vstruct_for_each(sb, f) - fields_have |= 1 << le32_to_cpu(f->type); - prt_bitflags(out, bch2_sb_fields, fields_have); - prt_newline(out); - - prt_printf(out, "Features:\t"); - prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); - prt_newline(out); - - prt_printf(out, "Compat features:\t"); - prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); - prt_newline(out); - - prt_newline(out); - prt_printf(out, "Options:"); - prt_newline(out); - printbuf_indent_add(out, 2); - { - enum bch_opt_id id; - - for (id = 0; id < bch2_opts_nr; id++) { - const struct bch_option *opt = bch2_opt_table + id; - - if (opt->get_sb) { - u64 v = bch2_opt_from_sb(sb, id, -1); - - prt_printf(out, "%s:\t", opt->attr.name); - bch2_opt_to_text(out, NULL, sb, opt, v, - OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); - prt_newline(out); - } - } - } - - printbuf_indent_sub(out, 2); - - if (print_layout) { - prt_newline(out); - prt_printf(out, "layout:"); - prt_newline(out); - printbuf_indent_add(out, 2); - bch2_sb_layout_to_text(out, &sb->layout); - printbuf_indent_sub(out, 2); - } - - vstruct_for_each(sb, f) - if (fields & (1 << le32_to_cpu(f->type))) { - prt_newline(out); - bch2_sb_field_to_text(out, sb, f); - } -} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h deleted file mode 100644 index a3b7a90f2533..000000000000 --- a/fs/bcachefs/super-io.h +++ /dev/null @@ -1,119 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUPER_IO_H -#define _BCACHEFS_SUPER_IO_H - -#include "extents.h" -#include "eytzinger.h" -#include "super_types.h" -#include "super.h" -#include "sb-members.h" - -#include <asm/byteorder.h> - -#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096 - -static inline bool bch2_version_compatible(u16 version) -{ - return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && - version >= bcachefs_metadata_version_min; -} - -void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); -enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); - -int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); - -static inline int bch2_request_incompat_feature(struct bch_fs *c, - enum bcachefs_metadata_version version) -{ - return likely(version <= c->sb.version_incompat) - ? 
0 - : bch2_set_version_incompat(c, version); -} - -static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) -{ - return le32_to_cpu(f->u64s) * sizeof(u64); -} - -#define field_to_type(_f, _name) \ - container_of_or_null(_f, struct bch_sb_field_##_name, field) - -struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type); -#define bch2_sb_field_get(_sb, _name) \ - field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name) - -struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *, - enum bch_sb_field_type, unsigned); -#define bch2_sb_field_resize(_sb, _name, _u64s) \ - field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name) - -struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *, - enum bch_sb_field_type, unsigned); -#define bch2_sb_field_get_minsize(_sb, _name, _u64s) \ - field_to_type(bch2_sb_field_get_minsize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name) - -#define bch2_sb_field_nr_entries(_f) \ - (_f ? ((bch2_sb_field_bytes(&_f->field) - sizeof(*_f)) / \ - sizeof(_f->entries[0])) \ - : 0) - -void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); - -extern const char * const bch2_sb_fields[]; - -struct bch_sb_field_ops { - int (*validate)(struct bch_sb *, struct bch_sb_field *, - enum bch_validate_flags, struct printbuf *); - void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); -}; - -static inline __le64 bch2_sb_magic(struct bch_fs *c) -{ - __le64 ret; - - memcpy(&ret, &c->sb.uuid, sizeof(ret)); - return ret; -} - -static inline __u64 jset_magic(struct bch_fs *c) -{ - return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); -} - -static inline __u64 bset_magic(struct bch_fs *c) -{ - return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); -} - -int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); -int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); - -void bch2_free_super(struct bch_sb_handle *); -int bch2_sb_realloc(struct bch_sb_handle *, unsigned); - -int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *); - -int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); -int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); -int bch2_write_super(struct bch_fs *); -void __bch2_check_set_feature(struct bch_fs *, unsigned); - -static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) -{ - if (!(c->sb.features & (1ULL << feat))) - __bch2_check_set_feature(c, feat); -} - -bool bch2_check_version_downgrade(struct bch_fs *); -void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); -void bch2_sb_upgrade_incompat(struct bch_fs *); - -void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, - struct bch_sb_field *); -void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, - struct bch_sb_field *); -void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); -void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); - -#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c deleted file mode 100644 index c46b1053a02c..000000000000 --- a/fs/bcachefs/super.c +++ /dev/null @@ -1,2547 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bcachefs setup/teardown code, and some metadata io - read a superblock and - * figure out what to do with it. - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. 
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "async_objs.h" -#include "backpointers.h" -#include "bkey_sort.h" -#include "btree_cache.h" -#include "btree_gc.h" -#include "btree_journal_iter.h" -#include "btree_key_cache.h" -#include "btree_node_scan.h" -#include "btree_update_interior.h" -#include "btree_io.h" -#include "btree_write_buffer.h" -#include "buckets_waiting_for_journal.h" -#include "chardev.h" -#include "checksum.h" -#include "clock.h" -#include "compress.h" -#include "debug.h" -#include "disk_accounting.h" -#include "disk_groups.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "errcode.h" -#include "error.h" -#include "fs.h" -#include "fs-io.h" -#include "fs-io-buffered.h" -#include "fs-io-direct.h" -#include "fsck.h" -#include "inode.h" -#include "io_read.h" -#include "io_write.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "journal_seq_blacklist.h" -#include "move.h" -#include "migrate.h" -#include "movinggc.h" -#include "nocow_locking.h" -#include "quota.h" -#include "rebalance.h" -#include "recovery.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-clean.h" -#include "sb-counters.h" -#include "sb-errors.h" -#include "sb-members.h" -#include "snapshot.h" -#include "subvolume.h" -#include "super.h" -#include "super-io.h" -#include "sysfs.h" -#include "thread_with_file.h" -#include "trace.h" - -#include <linux/backing-dev.h> -#include <linux/blkdev.h> -#include <linux/debugfs.h> -#include <linux/device.h> -#include <linux/idr.h> -#include <linux/module.h> -#include <linux/percpu.h> -#include <linux/random.h> -#include <linux/sysfs.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); -MODULE_DESCRIPTION("bcachefs filesystem"); - -typedef DARRAY(struct bch_sb_handle) bch_sb_handles; - -#define x(n) #n, -const char * const bch2_fs_flag_strs[] = { - BCH_FS_FLAGS() - NULL -}; - -const char * const bch2_write_refs[] = { - BCH_WRITE_REFS() - NULL -}; - -const char * const bch2_dev_read_refs[] = { - BCH_DEV_READ_REFS() - NULL -}; - -const char * const bch2_dev_write_refs[] = { - BCH_DEV_WRITE_REFS() - NULL -}; -#undef x - -static void __bch2_print_str(struct bch_fs *c, const char *prefix, - const char *str) -{ -#ifdef __KERNEL__ - struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); - - if (unlikely(stdio)) { - bch2_stdio_redirect_printf(stdio, true, "%s", str); - return; - } -#endif - bch2_print_string_as_lines(KERN_ERR, str); -} - -void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str) -{ - __bch2_print_str(c, prefix, str); -} - -__printf(2, 0) -static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args) -{ -#ifdef __KERNEL__ - if (unlikely(stdio)) { - if (fmt[0] == KERN_SOH[0]) - fmt += 2; - - bch2_stdio_redirect_vprintf(stdio, true, fmt, args); - return; - } -#endif - vprintk(fmt, args); -} - -void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) -{ - struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio; - - va_list args; - va_start(args, fmt); - bch2_print_maybe_redirect(stdio, fmt, args); - va_end(args); -} - -void __bch2_print(struct bch_fs *c, const char *fmt, ...) 
-{ - struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); - - va_list args; - va_start(args, fmt); - bch2_print_maybe_redirect(stdio, fmt, args); - va_end(args); -} - -#define KTYPE(type) \ -static const struct attribute_group type ## _group = { \ - .attrs = type ## _files \ -}; \ - \ -static const struct attribute_group *type ## _groups[] = { \ - &type ## _group, \ - NULL \ -}; \ - \ -static const struct kobj_type type ## _ktype = { \ - .release = type ## _release, \ - .sysfs_ops = &type ## _sysfs_ops, \ - .default_groups = type ## _groups \ -} - -static void bch2_fs_release(struct kobject *); -static void bch2_dev_release(struct kobject *); -static void bch2_fs_counters_release(struct kobject *k) -{ -} - -static void bch2_fs_internal_release(struct kobject *k) -{ -} - -static void bch2_fs_opts_dir_release(struct kobject *k) -{ -} - -static void bch2_fs_time_stats_release(struct kobject *k) -{ -} - -KTYPE(bch2_fs); -KTYPE(bch2_fs_counters); -KTYPE(bch2_fs_internal); -KTYPE(bch2_fs_opts_dir); -KTYPE(bch2_fs_time_stats); -KTYPE(bch2_dev); - -static struct kset *bcachefs_kset; -static LIST_HEAD(bch_fs_list); -static DEFINE_MUTEX(bch_fs_list_lock); - -DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); - -static void bch2_dev_unlink(struct bch_dev *); -static void bch2_dev_free(struct bch_dev *); -static int bch2_dev_alloc(struct bch_fs *, unsigned); -static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); -static void bch2_dev_io_ref_stop(struct bch_dev *, int); -static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); - -struct bch_fs *bch2_dev_to_fs(dev_t dev) -{ - guard(mutex)(&bch_fs_list_lock); - guard(rcu)(); - - struct bch_fs *c; - list_for_each_entry(c, &bch_fs_list, list) - for_each_member_device_rcu(c, ca, NULL) - if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { - closure_get(&c->cl); - return c; - } - return NULL; -} - -static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) -{ - struct bch_fs *c; - - lockdep_assert_held(&bch_fs_list_lock); - - list_for_each_entry(c, &bch_fs_list, list) - if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid))) - return c; - - return NULL; -} - -struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) -{ - struct bch_fs *c; - - mutex_lock(&bch_fs_list_lock); - c = __bch2_uuid_to_fs(uuid); - if (c) - closure_get(&c->cl); - mutex_unlock(&bch_fs_list_lock); - - return c; -} - -/* Filesystem RO/RW: */ - -/* - * For startup/shutdown of RW stuff, the dependencies are: - * - * - foreground writes depend on copygc and rebalance (to free up space) - * - * - copygc and rebalance depend on mark and sweep gc (they actually probably - * don't because they either reserve ahead of time or don't block if - * allocations fail, but allocations can require mark and sweep gc to run - * because of generation number wraparound) - * - * - all of the above depends on the allocator threads - * - * - allocator depends on the journal (when it rewrites prios and gens) - */ - -static void __bch2_fs_read_only(struct bch_fs *c) -{ - unsigned clean_passes = 0; - u64 seq = 0; - - bch2_fs_ec_stop(c); - bch2_open_buckets_stop(c, NULL, true); - bch2_rebalance_stop(c); - bch2_copygc_stop(c); - bch2_fs_ec_flush(c); - - bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", - journal_cur_seq(&c->journal)); - - do { - clean_passes++; - - if (bch2_btree_interior_updates_flush(c) || - bch2_btree_write_buffer_flush_going_ro(c) || - bch2_journal_flush_all_pins(&c->journal) || - bch2_btree_flush_all_writes(c) || - seq != 
atomic64_read(&c->journal.seq)) { - seq = atomic64_read(&c->journal.seq); - clean_passes = 0; - } - } while (clean_passes < 2); - - bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", - journal_cur_seq(&c->journal)); - - if (test_bit(JOURNAL_replay_done, &c->journal.flags) && - !test_bit(BCH_FS_emergency_ro, &c->flags)) - set_bit(BCH_FS_clean_shutdown, &c->flags); - - bch2_fs_journal_stop(&c->journal); - - bch_info(c, "%sclean shutdown complete, journal seq %llu", - test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", - c->journal.seq_ondisk); - - /* - * After stopping journal: - */ - for_each_member_device(c, ca) { - bch2_dev_io_ref_stop(ca, WRITE); - bch2_dev_allocator_remove(c, ca); - } -} - -static void bch2_writes_disabled(struct enumerated_ref *writes) -{ - struct bch_fs *c = container_of(writes, struct bch_fs, writes); - - set_bit(BCH_FS_write_disable_complete, &c->flags); - wake_up(&bch2_read_only_wait); -} - -void bch2_fs_read_only(struct bch_fs *c) -{ - if (!test_bit(BCH_FS_rw, &c->flags)) { - bch2_journal_reclaim_stop(&c->journal); - return; - } - - BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags)); - - bch_verbose(c, "going read-only"); - - /* - * Block new foreground-end write operations from starting - any new - * writes will return -EROFS: - */ - set_bit(BCH_FS_going_ro, &c->flags); - enumerated_ref_stop_async(&c->writes); - - /* - * If we're not doing an emergency shutdown, we want to wait on - * outstanding writes to complete so they don't see spurious errors due - * to shutting down the allocator: - * - * If we are doing an emergency shutdown outstanding writes may - * hang until we shutdown the allocator so we don't want to wait - * on outstanding writes before shutting everything down - but - * we do need to wait on them before returning and signalling - * that going RO is complete: - */ - wait_event(bch2_read_only_wait, - test_bit(BCH_FS_write_disable_complete, &c->flags) || - test_bit(BCH_FS_emergency_ro, &c->flags)); - - bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags); - if (writes_disabled) - bch_verbose(c, "finished waiting for writes to stop"); - - __bch2_fs_read_only(c); - - wait_event(bch2_read_only_wait, - test_bit(BCH_FS_write_disable_complete, &c->flags)); - - if (!writes_disabled) - bch_verbose(c, "finished waiting for writes to stop"); - - clear_bit(BCH_FS_write_disable_complete, &c->flags); - clear_bit(BCH_FS_going_ro, &c->flags); - clear_bit(BCH_FS_rw, &c->flags); - - if (!bch2_journal_error(&c->journal) && - !test_bit(BCH_FS_error, &c->flags) && - !test_bit(BCH_FS_emergency_ro, &c->flags) && - test_bit(BCH_FS_started, &c->flags) && - test_bit(BCH_FS_clean_shutdown, &c->flags) && - c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) { - BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); - BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); - BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); - BUG_ON(c->btree_write_buffer.inc.keys.nr); - BUG_ON(c->btree_write_buffer.flushing.keys.nr); - bch2_verify_accounting_clean(c); - - bch_verbose(c, "marking filesystem clean"); - bch2_fs_mark_clean(c); - } else { - /* Make sure error counts/counters are persisted */ - mutex_lock(&c->sb_lock); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - bch_verbose(c, "done going read-only, filesystem not clean"); - } -} - -static void bch2_fs_read_only_work(struct work_struct *work) -{ - struct bch_fs *c = - container_of(work, struct bch_fs, read_only_work); - - 
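
The shutdown loop in __bch2_fs_read_only() above keeps flushing until two consecutive passes make no progress, re-sampling the journal sequence number to detect work queued behind its back. A minimal standalone model of that idiom, with flush_once() and current_seq() as hypothetical stand-ins for the btree/journal flush helpers:

    /* Model of the "quiesce until two clean passes" loop in
     * __bch2_fs_read_only(); the two externs are hypothetical. */
    #include <stdbool.h>
    #include <stdint.h>

    extern bool flush_once(void);       /* returns true if it made progress */
    extern uint64_t current_seq(void);

    void quiesce(void)
    {
        unsigned clean_passes = 0;
        uint64_t seq = 0;

        do {
            clean_passes++;
            if (flush_once() || seq != current_seq()) {
                seq = current_seq();
                clean_passes = 0;   /* progress was made, start over */
            }
        } while (clean_passes < 2); /* two consecutive no-op passes */
    }
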
down_write(&c->state_lock); - bch2_fs_read_only(c); - up_write(&c->state_lock); -} - -static void bch2_fs_read_only_async(struct bch_fs *c) -{ - queue_work(system_long_wq, &c->read_only_work); -} - -bool bch2_fs_emergency_read_only(struct bch_fs *c) -{ - bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); - - bch2_journal_halt(&c->journal); - bch2_fs_read_only_async(c); - - wake_up(&bch2_read_only_wait); - return ret; -} - -static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out, - bool locked) -{ - bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); - - if (!locked) - bch2_journal_halt(&c->journal); - else - bch2_journal_halt_locked(&c->journal); - bch2_fs_read_only_async(c); - wake_up(&bch2_read_only_wait); - - if (ret) - prt_printf(out, "emergency read only at seq %llu\n", - journal_cur_seq(&c->journal)); - - return ret; -} - -bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out) -{ - return __bch2_fs_emergency_read_only2(c, out, false); -} - -bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) -{ - bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); - - bch2_journal_halt_locked(&c->journal); - bch2_fs_read_only_async(c); - - wake_up(&bch2_read_only_wait); - return ret; -} - -static int __bch2_fs_read_write(struct bch_fs *c, bool early) -{ - int ret; - - BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - - if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) - return bch_err_throw(c, erofs_no_alloc_info); - - if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { - bch_err(c, "cannot go rw, unfixed btree errors"); - return bch_err_throw(c, erofs_unfixed_errors); - } - - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { - bch_err(c, "cannot go rw, filesystem is an unresized image file"); - return bch_err_throw(c, erofs_filesystem_full); - } - - if (test_bit(BCH_FS_rw, &c->flags)) - return 0; - - bch_info(c, "going read-write"); - - ret = bch2_fs_init_rw(c); - if (ret) - goto err; - - ret = bch2_sb_members_v2_init(c); - if (ret) - goto err; - - clear_bit(BCH_FS_clean_shutdown, &c->flags); - - scoped_guard(rcu) - for_each_online_member_rcu(c, ca) - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - bch2_dev_allocator_add(c, ca); - enumerated_ref_start(&ca->io_ref[WRITE]); - } - - bch2_recalc_capacity(c); - - /* - * First journal write must be a flush write: after a clean shutdown we - * don't read the journal, so the first journal write may end up - * overwriting whatever was there previously, and there must always be - * at least one non-flush write in the journal or recovery will fail: - */ - spin_lock(&c->journal.lock); - set_bit(JOURNAL_need_flush_write, &c->journal.flags); - set_bit(JOURNAL_running, &c->journal.flags); - bch2_journal_space_available(&c->journal); - spin_unlock(&c->journal.lock); - - ret = bch2_fs_mark_dirty(c); - if (ret) - goto err; - - ret = bch2_journal_reclaim_start(&c->journal); - if (ret) - goto err; - - set_bit(BCH_FS_rw, &c->flags); - set_bit(BCH_FS_was_rw, &c->flags); - - enumerated_ref_start(&c->writes); - - ret = bch2_copygc_start(c); - if (ret) { - bch_err_msg(c, ret, "error starting copygc thread"); - goto err; - } - - ret = bch2_rebalance_start(c); - if (ret) { - bch_err_msg(c, ret, "error starting rebalance thread"); - goto err; - } - - bch2_do_discards(c); - bch2_do_invalidates(c); - bch2_do_stripe_deletes(c); - bch2_do_pending_node_rewrites(c); - return 0; -err: - if (test_bit(BCH_FS_rw, &c->flags)) - bch2_fs_read_only(c); - else - __bch2_fs_read_only(c); - 
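
bch2_fs_read_only() above is a handshake: set BCH_FS_going_ro so new writes fail with -EROFS, asynchronously drain the writes reference, then wait for bch2_writes_disabled() to set BCH_FS_write_disable_complete. A much-simplified model using C11 atomics (illustrative only; the kernel code uses enumerated_ref and wait_event(), and also handles emergency shutdown):

    /* Much-simplified model of the going-RO handshake; not the kernel API. */
    #include <errno.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool going_ro;
    static atomic_int  writes_outstanding;
    static atomic_bool write_disable_complete;

    int start_write(void)
    {
        if (atomic_load(&going_ro))
            return -EROFS;              /* new writes bounce immediately */
        atomic_fetch_add(&writes_outstanding, 1);
        return 0;
    }

    void end_write(void)
    {
        /* last writer out signals completion, cf. bch2_writes_disabled() */
        if (atomic_fetch_sub(&writes_outstanding, 1) == 1 &&
            atomic_load(&going_ro))
            atomic_store(&write_disable_complete, true);
    }

    void go_read_only(void)
    {
        atomic_store(&going_ro, true);
        if (!atomic_load(&writes_outstanding))
            atomic_store(&write_disable_complete, true);
        while (!atomic_load(&write_disable_complete))
            ;                           /* the kernel waits, it doesn't spin */
    }
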
return ret; -} - -int bch2_fs_read_write(struct bch_fs *c) -{ - if (c->opts.recovery_pass_last && - c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay) - return bch_err_throw(c, erofs_norecovery); - - if (c->opts.nochanges) - return bch_err_throw(c, erofs_nochanges); - - if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) - return bch_err_throw(c, erofs_no_alloc_info); - - return __bch2_fs_read_write(c, false); -} - -int bch2_fs_read_write_early(struct bch_fs *c) -{ - down_write(&c->state_lock); - int ret = __bch2_fs_read_write(c, true); - up_write(&c->state_lock); - - return ret; -} - -/* Filesystem startup/shutdown: */ - -static void __bch2_fs_free(struct bch_fs *c) -{ - for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++) - bch2_time_stats_exit(&c->times[i]); - -#ifdef CONFIG_UNICODE - utf8_unload(c->cf_encoding); -#endif - - bch2_find_btree_nodes_exit(&c->found_btree_nodes); - bch2_free_pending_node_rewrites(c); - bch2_free_fsck_errs(c); - bch2_fs_vfs_exit(c); - bch2_fs_snapshots_exit(c); - bch2_fs_sb_errors_exit(c); - bch2_fs_replicas_exit(c); - bch2_fs_rebalance_exit(c); - bch2_fs_quota_exit(c); - bch2_fs_nocow_locking_exit(c); - bch2_fs_journal_exit(&c->journal); - bch2_fs_fs_io_direct_exit(c); - bch2_fs_fs_io_buffered_exit(c); - bch2_fs_fsio_exit(c); - bch2_fs_io_write_exit(c); - bch2_fs_io_read_exit(c); - bch2_fs_encryption_exit(c); - bch2_fs_ec_exit(c); - bch2_fs_counters_exit(c); - bch2_fs_compress_exit(c); - bch2_io_clock_exit(&c->io_clock[WRITE]); - bch2_io_clock_exit(&c->io_clock[READ]); - bch2_fs_buckets_waiting_for_journal_exit(c); - bch2_fs_btree_write_buffer_exit(c); - bch2_fs_btree_key_cache_exit(&c->btree_key_cache); - bch2_fs_btree_iter_exit(c); - bch2_fs_btree_interior_update_exit(c); - bch2_fs_btree_cache_exit(c); - bch2_fs_accounting_exit(c); - bch2_fs_async_obj_exit(c); - bch2_journal_keys_put_initial(c); - bch2_find_btree_nodes_exit(&c->found_btree_nodes); - - BUG_ON(atomic_read(&c->journal_keys.ref)); - percpu_free_rwsem(&c->mark_lock); - if (c->online_reserved) { - u64 v = percpu_u64_get(c->online_reserved); - WARN(v, "online_reserved not 0 at shutdown: %lli", v); - free_percpu(c->online_reserved); - } - - darray_exit(&c->incompat_versions_requested); - darray_exit(&c->btree_roots_extra); - free_percpu(c->pcpu); - free_percpu(c->usage); - mempool_exit(&c->large_bkey_pool); - mempool_exit(&c->btree_bounce_pool); - bioset_exit(&c->btree_bio); - mempool_exit(&c->fill_iter); - enumerated_ref_exit(&c->writes); - kfree(rcu_dereference_protected(c->disk_groups, 1)); - kfree(c->journal_seq_blacklist_table); - - if (c->write_ref_wq) - destroy_workqueue(c->write_ref_wq); - if (c->btree_write_submit_wq) - destroy_workqueue(c->btree_write_submit_wq); - if (c->btree_read_complete_wq) - destroy_workqueue(c->btree_read_complete_wq); - if (c->copygc_wq) - destroy_workqueue(c->copygc_wq); - if (c->btree_write_complete_wq) - destroy_workqueue(c->btree_write_complete_wq); - if (c->btree_update_wq) - destroy_workqueue(c->btree_update_wq); - - bch2_free_super(&c->disk_sb); - kvfree(c); - module_put(THIS_MODULE); -} - -static void bch2_fs_release(struct kobject *kobj) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - __bch2_fs_free(c); -} - -void __bch2_fs_stop(struct bch_fs *c) -{ - bch_verbose(c, "shutting down"); - - set_bit(BCH_FS_stopping, &c->flags); - - down_write(&c->state_lock); - bch2_fs_read_only(c); - up_write(&c->state_lock); - - for (unsigned i = 0; i < c->sb.nr_devices; i++) { - struct bch_dev *ca = rcu_dereference_protected(c->devs[i], 
true); - if (ca) - bch2_dev_io_ref_stop(ca, READ); - } - - for_each_member_device(c, ca) - bch2_dev_unlink(ca); - - if (c->kobj.state_in_sysfs) - kobject_del(&c->kobj); - - bch2_fs_debug_exit(c); - bch2_fs_chardev_exit(c); - - bch2_ro_ref_put(c); - wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref)); - - kobject_put(&c->counters_kobj); - kobject_put(&c->time_stats); - kobject_put(&c->opts_dir); - kobject_put(&c->internal); - - /* btree prefetch might have kicked off reads in the background: */ - bch2_btree_flush_all_reads(c); - - for_each_member_device(c, ca) - cancel_work_sync(&ca->io_error_work); - - cancel_work_sync(&c->read_only_work); -} - -void bch2_fs_free(struct bch_fs *c) -{ - mutex_lock(&bch_fs_list_lock); - list_del(&c->list); - mutex_unlock(&bch_fs_list_lock); - - closure_sync(&c->cl); - closure_debug_destroy(&c->cl); - - for (unsigned i = 0; i < c->sb.nr_devices; i++) { - struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); - - if (ca) { - EBUG_ON(atomic_long_read(&ca->ref) != 1); - bch2_dev_io_ref_stop(ca, READ); - bch2_free_super(&ca->disk_sb); - bch2_dev_free(ca); - } - } - - bch_verbose(c, "shutdown complete"); - - kobject_put(&c->kobj); -} - -void bch2_fs_stop(struct bch_fs *c) -{ - __bch2_fs_stop(c); - bch2_fs_free(c); -} - -static int bch2_fs_online(struct bch_fs *c) -{ - int ret = 0; - - lockdep_assert_held(&bch_fs_list_lock); - - if (c->sb.multi_device && - __bch2_uuid_to_fs(c->sb.uuid)) { - bch_err(c, "filesystem UUID already open"); - return bch_err_throw(c, filesystem_uuid_already_open); - } - - ret = bch2_fs_chardev_init(c); - if (ret) { - bch_err(c, "error creating character device"); - return ret; - } - - bch2_fs_debug_init(c); - - ret = (c->sb.multi_device - ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) - : kobject_add(&c->kobj, NULL, "%s", c->name)) ?: - kobject_add(&c->internal, &c->kobj, "internal") ?: - kobject_add(&c->opts_dir, &c->kobj, "options") ?: -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: -#endif - kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: - bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS); - if (ret) { - bch_err(c, "error creating sysfs objects"); - return ret; - } - - down_write(&c->state_lock); - - for_each_member_device(c, ca) { - ret = bch2_dev_sysfs_online(c, ca); - if (ret) { - bch_err(c, "error creating sysfs objects"); - bch2_dev_put(ca); - goto err; - } - } - - BUG_ON(!list_empty(&c->list)); - list_add(&c->list, &bch_fs_list); -err: - up_write(&c->state_lock); - return ret; -} - -int bch2_fs_init_rw(struct bch_fs *c) -{ - if (test_bit(BCH_FS_rw_init_done, &c->flags)) - return 0; - - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || - !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_submit", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", - WQ_FREEZABLE, 0))) - return bch_err_throw(c, ENOMEM_fs_other_alloc); - - int ret = bch2_fs_btree_interior_update_init(c) ?: - bch2_fs_btree_write_buffer_init(c) ?: - bch2_fs_fs_io_buffered_init(c) ?: - bch2_fs_io_write_init(c) ?: - bch2_fs_journal_init(&c->journal); - if (ret) - return ret; - - set_bit(BCH_FS_rw_init_done, 
&c->flags); - return 0; -} - -static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, - bch_sb_handles *sbs) -{ - struct bch_fs *c; - struct printbuf name = PRINTBUF; - unsigned i, iter_size; - int ret = 0; - - c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); - if (!c) { - c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); - goto out; - } - - c->stdio = (void *)(unsigned long) opts->stdio; - - __module_get(THIS_MODULE); - - closure_init(&c->cl, NULL); - - c->kobj.kset = bcachefs_kset; - kobject_init(&c->kobj, &bch2_fs_ktype); - kobject_init(&c->internal, &bch2_fs_internal_ktype); - kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); - kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); - kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); - - c->minor = -1; - c->disk_sb.fs_sb = true; - - init_rwsem(&c->state_lock); - mutex_init(&c->sb_lock); - mutex_init(&c->replicas_gc_lock); - mutex_init(&c->btree_root_lock); - INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); - - refcount_set(&c->ro_ref, 1); - init_waitqueue_head(&c->ro_ref_wait); - - for (i = 0; i < BCH_TIME_STAT_NR; i++) - bch2_time_stats_init(&c->times[i]); - - bch2_fs_allocator_background_init(c); - bch2_fs_allocator_foreground_init(c); - bch2_fs_btree_cache_init_early(&c->btree_cache); - bch2_fs_btree_gc_init_early(c); - bch2_fs_btree_interior_update_init_early(c); - bch2_fs_btree_iter_init_early(c); - bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); - bch2_fs_btree_write_buffer_init_early(c); - bch2_fs_copygc_init(c); - bch2_fs_ec_init_early(c); - bch2_fs_journal_init_early(&c->journal); - bch2_fs_journal_keys_init(c); - bch2_fs_move_init(c); - bch2_fs_nocow_locking_init_early(c); - bch2_fs_quota_init(c); - bch2_fs_recovery_passes_init(c); - bch2_fs_sb_errors_init_early(c); - bch2_fs_snapshots_init_early(c); - bch2_fs_subvolumes_init_early(c); - - INIT_LIST_HEAD(&c->list); - - mutex_init(&c->bio_bounce_pages_lock); - mutex_init(&c->snapshot_table_lock); - init_rwsem(&c->snapshot_create_lock); - - spin_lock_init(&c->btree_write_error_lock); - - INIT_LIST_HEAD(&c->journal_iters); - - INIT_LIST_HEAD(&c->fsck_error_msgs); - mutex_init(&c->fsck_error_msgs_lock); - - seqcount_init(&c->usage_lock); - - sema_init(&c->io_in_flight, 128); - - INIT_LIST_HEAD(&c->vfs_inodes_list); - mutex_init(&c->vfs_inodes_lock); - - c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; - c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; - c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; - - mutex_init(&c->sectors_available_lock); - - ret = percpu_init_rwsem(&c->mark_lock); - if (ret) - goto err; - - mutex_lock(&c->sb_lock); - ret = bch2_sb_to_fs(c, sb); - mutex_unlock(&c->sb_lock); - - if (ret) - goto err; - - /* Compat: */ - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && - !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) - SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); - - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && - !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) - SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); - - c->opts = bch2_opts_default; - ret = bch2_opts_from_sb(&c->opts, sb); - if (ret) - goto err; - - bch2_opts_apply(&c->opts, *opts); - - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - c->opts.block_size > PAGE_SIZE) { - bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE"); - ret = -EINVAL; - goto err; - } - - c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; - if (c->opts.inodes_use_key_cache) - 
c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; - c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops; - - c->block_bits = ilog2(block_sectors(c)); - c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); - - if (bch2_fs_init_fault("fs_alloc")) { - bch_err(c, "fs_alloc fault injected"); - ret = -EFAULT; - goto err; - } - - if (c->sb.multi_device) - pr_uuid(&name, c->sb.user_uuid.b); - else - prt_bdevname(&name, sbs->data[0].bdev); - - ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; - if (ret) - goto err; - - strscpy(c->name, name.buf, sizeof(c->name)); - printbuf_exit(&name); - - iter_size = sizeof(struct sort_iter) + - (btree_blocks(c) + 1) * 2 * - sizeof(struct sort_iter_set); - - if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || - enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR, - bch2_writes_disabled) || - mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || - bioset_init(&c->btree_bio, 1, - max(offsetof(struct btree_read_bio, bio), - offsetof(struct btree_write_bio, wbio.bio)), - BIOSET_NEED_BVECS) || - !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || - !(c->usage = alloc_percpu(struct bch_fs_usage_base)) || - !(c->online_reserved = alloc_percpu(u64)) || - mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, - c->opts.btree_node_size) || - mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { - ret = bch_err_throw(c, ENOMEM_fs_other_alloc); - goto err; - } - - ret = - bch2_fs_async_obj_init(c) ?: - bch2_fs_btree_cache_init(c) ?: - bch2_fs_btree_iter_init(c) ?: - bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_buckets_waiting_for_journal_init(c) ?: - bch2_io_clock_init(&c->io_clock[READ]) ?: - bch2_io_clock_init(&c->io_clock[WRITE]) ?: - bch2_fs_compress_init(c) ?: - bch2_fs_counters_init(c) ?: - bch2_fs_ec_init(c) ?: - bch2_fs_encryption_init(c) ?: - bch2_fs_fsio_init(c) ?: - bch2_fs_fs_io_direct_init(c) ?: - bch2_fs_io_read_init(c) ?: - bch2_fs_rebalance_init(c) ?: - bch2_fs_sb_errors_init(c) ?: - bch2_fs_vfs_init(c); - if (ret) - goto err; - - if (go_rw_in_recovery(c)) { - /* - * start workqueues/kworkers early - kthread creation checks for - * pending signals, which is _very_ annoying - */ - ret = bch2_fs_init_rw(c); - if (ret) - goto err; - } - -#ifdef CONFIG_UNICODE - if (bch2_fs_casefold_enabled(c)) { - /* Default encoding until we can potentially have more as an option. */ - c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); - if (IS_ERR(c->cf_encoding)) { - printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. 
Version: %u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); - ret = -EINVAL; - goto err; - } - } -#else - if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { - printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); - ret = -EINVAL; - goto err; - } -#endif - - for (i = 0; i < c->sb.nr_devices; i++) { - if (!bch2_member_exists(c->disk_sb.sb, i)) - continue; - ret = bch2_dev_alloc(c, i); - if (ret) - goto err; - } - - bch2_journal_entry_res_resize(&c->journal, - &c->btree_root_journal_res, - BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); - bch2_journal_entry_res_resize(&c->journal, - &c->clock_journal_res, - (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); - - mutex_lock(&bch_fs_list_lock); - ret = bch2_fs_online(c); - mutex_unlock(&bch_fs_list_lock); - - if (ret) - goto err; -out: - return c; -err: - bch2_fs_free(c); - c = ERR_PTR(ret); - goto out; -} - -noinline_for_stack -static void print_mount_opts(struct bch_fs *c) -{ - enum bch_opt_id i; - CLASS(printbuf, p)(); - bch2_log_msg_start(c, &p); - - prt_str(&p, "starting version "); - bch2_version_to_text(&p, c->sb.version); - - bool first = true; - for (i = 0; i < bch2_opts_nr; i++) { - const struct bch_option *opt = &bch2_opt_table[i]; - u64 v = bch2_opt_get_by_id(&c->opts, i); - - if (!(opt->flags & OPT_MOUNT)) - continue; - - if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) - continue; - - prt_str(&p, first ? " opts=" : ","); - first = false; - bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); - } - - if (c->sb.version_incompat_allowed != c->sb.version) { - prt_printf(&p, "\nallowing incompatible features above "); - bch2_version_to_text(&p, c->sb.version_incompat_allowed); - } - - if (c->opts.verbose) { - prt_printf(&p, "\nfeatures: "); - prt_bitflags(&p, bch2_sb_features, c->sb.features); - } - - if (c->sb.multi_device) { - prt_printf(&p, "\nwith devices"); - for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) { - prt_char(&p, ' '); - prt_str(&p, ca->name); - } - } - - bch2_print_str(c, KERN_INFO, p.buf); -} - -static bool bch2_fs_may_start(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned flags = 0; - - switch (c->opts.degraded) { - case BCH_DEGRADED_very: - flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; - break; - case BCH_DEGRADED_yes: - flags |= BCH_FORCE_IF_DEGRADED; - break; - default: - mutex_lock(&c->sb_lock); - for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) { - if (!bch2_member_exists(c->disk_sb.sb, i)) - continue; - - ca = bch2_dev_locked(c, i); - - if (!bch2_dev_is_online(ca) && - (ca->mi.state == BCH_MEMBER_STATE_rw || - ca->mi.state == BCH_MEMBER_STATE_ro)) { - mutex_unlock(&c->sb_lock); - return false; - } - } - mutex_unlock(&c->sb_lock); - break; - } - - return bch2_have_enough_devs(c, c->online_devs, flags, true); -} - -int bch2_fs_start(struct bch_fs *c) -{ - time64_t now = ktime_get_real_seconds(); - int ret = 0; - - print_mount_opts(c); - - if (c->cf_encoding) - bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); - - if (!bch2_fs_may_start(c)) - return bch_err_throw(c, insufficient_devices_to_start); - - down_write(&c->state_lock); - mutex_lock(&c->sb_lock); - - BUG_ON(test_bit(BCH_FS_started, &c->flags)); - - if 
(!bch2_sb_field_get_minsize(&c->disk_sb, ext, - sizeof(struct bch_sb_field_ext) / sizeof(u64))) { - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); - ret = bch_err_throw(c, ENOSPC_sb); - goto err; - } - - ret = bch2_sb_members_v2_init(c); - if (ret) { - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); - goto err; - } - - scoped_guard(rcu) - for_each_online_member_rcu(c, ca) - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = - cpu_to_le64(now); - - /* - * Don't write superblock yet: recovery might have to downgrade - */ - mutex_unlock(&c->sb_lock); - - scoped_guard(rcu) - for_each_online_member_rcu(c, ca) - if (ca->mi.state == BCH_MEMBER_STATE_rw) - bch2_dev_allocator_add(c, ca); - bch2_recalc_capacity(c); - up_write(&c->state_lock); - - c->recovery_task = current; - ret = BCH_SB_INITIALIZED(c->disk_sb.sb) - ? bch2_fs_recovery(c) - : bch2_fs_initialize(c); - c->recovery_task = NULL; - - if (ret) - goto err; - - ret = bch2_opts_hooks_pre_set(c); - if (ret) - goto err; - - if (bch2_fs_init_fault("fs_start")) { - ret = bch_err_throw(c, injected_fs_start); - goto err; - } - - set_bit(BCH_FS_started, &c->flags); - wake_up(&c->ro_ref_wait); - - down_write(&c->state_lock); - if (c->opts.read_only) - bch2_fs_read_only(c); - else if (!test_bit(BCH_FS_rw, &c->flags)) - ret = bch2_fs_read_write(c); - up_write(&c->state_lock); - -err: - if (ret) - bch_err_msg(c, ret, "starting filesystem"); - else - bch_verbose(c, "done starting filesystem"); - return ret; -} - -static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) -{ - struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); - - if (le16_to_cpu(sb->block_size) != block_sectors(c)) - return bch_err_throw(c, mismatched_block_size); - - if (le16_to_cpu(m.bucket_size) < - BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) - return bch_err_throw(c, bucket_size_too_small); - - return 0; -} - -static int bch2_dev_in_fs(struct bch_sb_handle *fs, - struct bch_sb_handle *sb, - struct bch_opts *opts) -{ - if (fs == sb) - return 0; - - if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) - return -BCH_ERR_device_not_a_member_of_filesystem; - - if (!bch2_member_exists(fs->sb, sb->sb->dev_idx)) - return -BCH_ERR_device_has_been_removed; - - if (fs->sb->block_size != sb->sb->block_size) - return -BCH_ERR_mismatched_block_size; - - if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq || - le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq) - return 0; - - if (fs->sb->seq == sb->sb->seq && - fs->sb->write_time != sb->sb->write_time) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Split brain detected between "); - prt_bdevname(&buf, sb->bdev); - prt_str(&buf, " and "); - prt_bdevname(&buf, fs->bdev); - prt_char(&buf, ':'); - prt_newline(&buf); - prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq)); - prt_newline(&buf); - - prt_bdevname(&buf, fs->bdev); - prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time)); - prt_newline(&buf); - - prt_bdevname(&buf, sb->bdev); - prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time)); - prt_newline(&buf); - - if (!opts->no_splitbrain_check) - prt_printf(&buf, "Not using older sb"); - - pr_err("%s", buf.buf); - printbuf_exit(&buf); - - if (!opts->no_splitbrain_check) - return -BCH_ERR_device_splitbrain; - } - - struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); - u64 seq_from_fs = le64_to_cpu(m.seq); - u64 seq_from_member = le64_to_cpu(sb->sb->seq); - - if (seq_from_fs 
&& seq_from_fs < seq_from_member) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Split brain detected between "); - prt_bdevname(&buf, sb->bdev); - prt_str(&buf, " and "); - prt_bdevname(&buf, fs->bdev); - prt_char(&buf, ':'); - prt_newline(&buf); - - prt_bdevname(&buf, fs->bdev); - prt_str(&buf, " believes seq of "); - prt_bdevname(&buf, sb->bdev); - prt_printf(&buf, " to be %llu, but ", seq_from_fs); - prt_bdevname(&buf, sb->bdev); - prt_printf(&buf, " has %llu\n", seq_from_member); - - if (!opts->no_splitbrain_check) { - prt_str(&buf, "Not using "); - prt_bdevname(&buf, sb->bdev); - } - - pr_err("%s", buf.buf); - printbuf_exit(&buf); - - if (!opts->no_splitbrain_check) - return -BCH_ERR_device_splitbrain; - } - - return 0; -} - -/* Device startup/shutdown: */ - -static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) -{ - if (rw == READ) - clear_bit(ca->dev_idx, ca->fs->online_devs.d); - - if (!enumerated_ref_is_zero(&ca->io_ref[rw])) - enumerated_ref_stop(&ca->io_ref[rw], - rw == READ - ? bch2_dev_read_refs - : bch2_dev_write_refs); -} - -static void bch2_dev_release(struct kobject *kobj) -{ - struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); - - kfree(ca); -} - -static void bch2_dev_free(struct bch_dev *ca) -{ - WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); - WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); - - cancel_work_sync(&ca->io_error_work); - - bch2_dev_unlink(ca); - - if (ca->kobj.state_in_sysfs) - kobject_del(&ca->kobj); - - bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); - bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); - - bch2_free_super(&ca->disk_sb); - bch2_dev_allocator_background_exit(ca); - bch2_dev_journal_exit(ca); - - free_percpu(ca->io_done); - bch2_dev_buckets_free(ca); - kfree(ca->sb_read_scratch); - - bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); - bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); - - enumerated_ref_exit(&ca->io_ref[WRITE]); - enumerated_ref_exit(&ca->io_ref[READ]); -#ifndef CONFIG_BCACHEFS_DEBUG - percpu_ref_exit(&ca->ref); -#endif - kobject_put(&ca->kobj); -} - -static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) -{ - - lockdep_assert_held(&c->state_lock); - - if (enumerated_ref_is_zero(&ca->io_ref[READ])) - return; - - __bch2_dev_read_only(c, ca); - - bch2_dev_io_ref_stop(ca, READ); - - bch2_dev_unlink(ca); - - bch2_free_super(&ca->disk_sb); - bch2_dev_journal_exit(ca); -} - -#ifndef CONFIG_BCACHEFS_DEBUG -static void bch2_dev_ref_complete(struct percpu_ref *ref) -{ - struct bch_dev *ca = container_of(ref, struct bch_dev, ref); - - complete(&ca->ref_completion); -} -#endif - -static void bch2_dev_unlink(struct bch_dev *ca) -{ - struct kobject *b; - - /* - * This is racy w.r.t. the underlying block device being hot-removed, - * which removes it from sysfs. 
- * - * It'd be lovely if we had a way to handle this race, but the sysfs - * code doesn't appear to provide a good method and block/holder.c is - * susceptible as well: - */ - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev && - (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) { - sysfs_remove_link(b, "bcachefs"); - sysfs_remove_link(&ca->kobj, "block"); - } -} - -static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) -{ - int ret; - - if (!c->kobj.state_in_sysfs) - return 0; - - if (!ca->kobj.state_in_sysfs) { - ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?: - bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE); - if (ret) - return ret; - } - - if (ca->disk_sb.bdev) { - struct kobject *block = bdev_kobj(ca->disk_sb.bdev); - - ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); - if (ret) - return ret; - - ret = sysfs_create_link(&ca->kobj, block, "block"); - if (ret) - return ret; - } - - return 0; -} - -static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, - struct bch_member *member) -{ - struct bch_dev *ca; - unsigned i; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - return NULL; - - kobject_init(&ca->kobj, &bch2_dev_ktype); - init_completion(&ca->ref_completion); - - INIT_WORK(&ca->io_error_work, bch2_io_error_work); - - bch2_time_stats_quantiles_init(&ca->io_latency[READ]); - bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]); - - ca->mi = bch2_mi_to_cpu(member); - - for (i = 0; i < ARRAY_SIZE(member->errors); i++) - atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i])); - - ca->uuid = member->uuid; - - ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, - ca->mi.bucket_size / btree_sectors(c)); - -#ifndef CONFIG_BCACHEFS_DEBUG - if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL)) - goto err; -#else - atomic_long_set(&ca->ref, 1); -#endif - - mutex_init(&ca->bucket_backpointer_mismatch.lock); - mutex_init(&ca->bucket_backpointer_empty.lock); - - bch2_dev_allocator_background_init(ca); - - if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) || - enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) || - !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || - bch2_dev_buckets_alloc(c, ca) || - !(ca->io_done = alloc_percpu(*ca->io_done))) - goto err; - - return ca; -err: - bch2_dev_free(ca); - return NULL; -} - -static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, - unsigned dev_idx) -{ - ca->dev_idx = dev_idx; - __set_bit(ca->dev_idx, ca->self.d); - - if (!ca->name[0]) - scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); - - ca->fs = c; - rcu_assign_pointer(c->devs[ca->dev_idx], ca); - - if (bch2_dev_sysfs_online(c, ca)) - pr_warn("error creating sysfs objects"); -} - -static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) -{ - struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); - struct bch_dev *ca = NULL; - - if (bch2_fs_init_fault("dev_alloc")) - goto err; - - ca = __bch2_dev_alloc(c, &member); - if (!ca) - goto err; - - ca->fs = c; - - bch2_dev_attach(c, ca, dev_idx); - return 0; -err: - return bch_err_throw(c, ENOMEM_dev_alloc); -} - -static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) -{ - unsigned ret; - - if (bch2_dev_is_online(ca)) { - bch_err(ca, "already have device online in slot %u", - sb->sb->dev_idx); - return bch_err_throw(ca->fs, device_already_online); - } - - if (get_capacity(sb->bdev->bd_disk) < - ca->mi.bucket_size * ca->mi.nbuckets) { - bch_err(ca, 
"cannot online: device too small"); - return bch_err_throw(ca->fs, device_size_too_small); - } - - BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); - BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); - - ret = bch2_dev_journal_init(ca, sb->sb); - if (ret) - return ret; - - struct printbuf name = PRINTBUF; - prt_bdevname(&name, sb->bdev); - strscpy(ca->name, name.buf, sizeof(ca->name)); - printbuf_exit(&name); - - /* Commit: */ - ca->disk_sb = *sb; - memset(sb, 0, sizeof(*sb)); - - /* - * Stash pointer to the filesystem for blk_holder_ops - note that once - * attached to a filesystem, we will always close the block device - * before tearing down the filesystem object. - */ - ca->disk_sb.holder->c = ca->fs; - - ca->dev = ca->disk_sb.bdev->bd_dev; - - enumerated_ref_start(&ca->io_ref[READ]); - - return 0; -} - -static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) -{ - struct bch_dev *ca; - int ret; - - lockdep_assert_held(&c->state_lock); - - if (le64_to_cpu(sb->sb->seq) > - le64_to_cpu(c->disk_sb.sb->seq)) - bch2_sb_to_fs(c, sb->sb); - - BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx)); - - ca = bch2_dev_locked(c, sb->sb->dev_idx); - - ret = __bch2_dev_attach_bdev(ca, sb); - if (ret) - return ret; - - set_bit(ca->dev_idx, c->online_devs.d); - - bch2_dev_sysfs_online(c, ca); - - bch2_rebalance_wakeup(c); - return 0; -} - -/* Device management: */ - -/* - * Note: this function is also used by the error paths - when a particular - * device sees an error, we call it to determine whether we can just set the - * device RO, or - if this function returns false - we'll set the whole - * filesystem RO: - * - * XXX: maybe we should be more explicit about whether we're changing state - * because we got an error or what have you? - */ -bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, - enum bch_member_state new_state, int flags) -{ - struct bch_devs_mask new_online_devs; - int nr_rw = 0, required; - - lockdep_assert_held(&c->state_lock); - - switch (new_state) { - case BCH_MEMBER_STATE_rw: - return true; - case BCH_MEMBER_STATE_ro: - if (ca->mi.state != BCH_MEMBER_STATE_rw) - return true; - - /* do we have enough devices to write to? */ - for_each_member_device(c, ca2) - if (ca2 != ca) - nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; - - required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) - ? c->opts.metadata_replicas - : metadata_replicas_required(c), - !(flags & BCH_FORCE_IF_DATA_DEGRADED) - ? c->opts.data_replicas - : data_replicas_required(c)); - - return nr_rw >= required; - case BCH_MEMBER_STATE_failed: - case BCH_MEMBER_STATE_spare: - if (ca->mi.state != BCH_MEMBER_STATE_rw && - ca->mi.state != BCH_MEMBER_STATE_ro) - return true; - - /* do we have enough devices to read from? 
*/ - new_online_devs = c->online_devs; - __clear_bit(ca->dev_idx, new_online_devs.d); - - return bch2_have_enough_devs(c, new_online_devs, flags, false); - default: - BUG(); - } -} - -static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) -{ - bch2_dev_io_ref_stop(ca, WRITE); - - /* - * The allocator thread itself allocates btree nodes, so stop it first: - */ - bch2_dev_allocator_remove(c, ca); - bch2_recalc_capacity(c); - bch2_dev_journal_stop(&c->journal, ca); -} - -static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) -{ - lockdep_assert_held(&c->state_lock); - - BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); - - bch2_dev_allocator_add(c, ca); - bch2_recalc_capacity(c); - - if (enumerated_ref_is_zero(&ca->io_ref[WRITE])) - enumerated_ref_start(&ca->io_ref[WRITE]); - - bch2_dev_do_discards(ca); -} - -int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, - enum bch_member_state new_state, int flags) -{ - struct bch_member *m; - int ret = 0; - - if (ca->mi.state == new_state) - return 0; - - if (!bch2_dev_state_allowed(c, ca, new_state, flags)) - return bch_err_throw(c, device_state_not_allowed); - - if (new_state != BCH_MEMBER_STATE_rw) - __bch2_dev_read_only(c, ca); - - bch_notice(ca, "%s", bch2_member_states[new_state]); - - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_STATE(m, new_state); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (new_state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); - - bch2_rebalance_wakeup(c); - - return ret; -} - -int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, - enum bch_member_state new_state, int flags) -{ - int ret; - - down_write(&c->state_lock); - ret = __bch2_dev_set_state(c, ca, new_state, flags); - up_write(&c->state_lock); - - return ret; -} - -/* Device add/removal: */ - -int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) -{ - struct bch_member *m; - unsigned dev_idx = ca->dev_idx, data; - bool fast_device_removal = !bch2_request_incompat_feature(c, - bcachefs_metadata_version_fast_device_removal); - int ret; - - down_write(&c->state_lock); - - /* - * We consume a reference to ca->ref, regardless of whether we succeed - * or fail: - */ - bch2_dev_put(ca); - - if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { - bch_err(ca, "Cannot remove without losing data"); - ret = bch_err_throw(c, device_state_not_allowed); - goto err; - } - - __bch2_dev_read_only(c, ca); - - ret = fast_device_removal - ? 
bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags) - : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?: - bch2_dev_remove_stripes(c, ca->dev_idx, flags)); - if (ret) - goto err; - - /* Check if device still has data before blowing away alloc info */ - struct bch_dev_usage usage = bch2_dev_usage_read(ca); - for (unsigned i = 0; i < BCH_DATA_NR; i++) - if (!data_type_is_empty(i) && - !data_type_is_hidden(i) && - usage.buckets[i]) { - bch_err(ca, "Remove failed: still has data (%s, %llu buckets)", - __bch2_data_types[i], usage.buckets[i]); - ret = -EBUSY; - goto err; - } - - ret = bch2_dev_remove_alloc(c, ca); - bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); - if (ret) - goto err; - - /* - * We need to flush the entire journal to get rid of keys that reference - * the device being removed before removing the superblock entry - */ - bch2_journal_flush_all_pins(&c->journal); - - /* - * this is really just needed for the bch2_replicas_gc_(start|end) - * calls, and could be cleaned up: - */ - ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); - bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); - if (ret) - goto err; - - ret = bch2_journal_flush(&c->journal); - bch_err_msg(ca, ret, "bch2_journal_flush()"); - if (ret) - goto err; - - ret = bch2_replicas_gc2(c); - bch_err_msg(ca, ret, "bch2_replicas_gc2()"); - if (ret) - goto err; - - data = bch2_dev_has_data(c, ca); - if (data) { - struct printbuf data_has = PRINTBUF; - - prt_bitflags(&data_has, __bch2_data_types, data); - bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); - printbuf_exit(&data_has); - ret = -EBUSY; - goto err; - } - - __bch2_dev_offline(c, ca); - - mutex_lock(&c->sb_lock); - rcu_assign_pointer(c->devs[ca->dev_idx], NULL); - mutex_unlock(&c->sb_lock); - -#ifndef CONFIG_BCACHEFS_DEBUG - percpu_ref_kill(&ca->ref); -#else - ca->dying = true; - bch2_dev_put(ca); -#endif - wait_for_completion(&ca->ref_completion); - - bch2_dev_free(ca); - - /* - * Free this device's slot in the bch_member array - all pointers to - * this device must be gone: - */ - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); - - if (fast_device_removal) - m->uuid = BCH_SB_MEMBER_DELETED_UUID; - else - memset(&m->uuid, 0, sizeof(m->uuid)); - - bch2_write_super(c); - - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); - return 0; -err: - if (test_bit(BCH_FS_rw, &c->flags) && - ca->mi.state == BCH_MEMBER_STATE_rw && - !enumerated_ref_is_zero(&ca->io_ref[READ])) - __bch2_dev_read_write(c, ca); - up_write(&c->state_lock); - return ret; -} - -/* Add new device to running filesystem: */ -int bch2_dev_add(struct bch_fs *c, const char *path) -{ - struct bch_opts opts = bch2_opts_empty(); - struct bch_sb_handle sb = {}; - struct bch_dev *ca = NULL; - struct printbuf errbuf = PRINTBUF; - struct printbuf label = PRINTBUF; - int ret = 0; - - ret = bch2_read_super(path, &opts, &sb); - bch_err_msg(c, ret, "reading super"); - if (ret) - goto err; - - struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); - - if (BCH_MEMBER_GROUP(&dev_mi)) { - bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); - if (label.allocation_failure) { - ret = -ENOMEM; - goto err; - } - } - - if (list_empty(&c->list)) { - mutex_lock(&bch_fs_list_lock); - if (__bch2_uuid_to_fs(c->sb.uuid)) - ret = bch_err_throw(c, filesystem_uuid_already_open); - else - list_add(&c->list, &bch_fs_list); - mutex_unlock(&bch_fs_list_lock); - - if (ret) { - bch_err(c, "filesystem UUID already open"); - goto err; - } 
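
Before bch2_dev_remove() above frees the member slot, it verifies that no user-visible data type still owns buckets on the device; free buckets and fs-internal ("hidden") types don't block removal. A standalone sketch of that emptiness check, with an illustrative data-type enum standing in for the real BCH_DATA_* list:

    /* Standalone sketch of the "device still has data?" check in
     * bch2_dev_remove(); the enum below is illustrative, not BCH_DATA_*. */
    #include <errno.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    enum data_type { DT_FREE, DT_SB, DT_JOURNAL, DT_BTREE, DT_USER, DT_NR };

    /* free buckets and fs-internal types don't block removal, loosely
     * mirroring the !data_type_is_empty() && !data_type_is_hidden() test */
    static bool blocks_removal(enum data_type t)
    {
        return t != DT_FREE && t != DT_SB && t != DT_JOURNAL;
    }

    static int check_dev_empty(const uint64_t buckets[DT_NR])
    {
        for (int t = 0; t < DT_NR; t++)
            if (blocks_removal(t) && buckets[t]) {
                fprintf(stderr, "still has data: type %d, %llu buckets\n",
                        t, (unsigned long long) buckets[t]);
                return -EBUSY;  /* same errno the original returns */
            }
        return 0;
    }
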
- } - - ret = bch2_dev_may_add(sb.sb, c); - if (ret) - goto err; - - ca = __bch2_dev_alloc(c, &dev_mi); - if (!ca) { - ret = -ENOMEM; - goto err; - } - - ret = __bch2_dev_attach_bdev(ca, &sb); - if (ret) - goto err; - - down_write(&c->state_lock); - mutex_lock(&c->sb_lock); - SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true); - - ret = bch2_sb_from_fs(c, ca); - bch_err_msg(c, ret, "setting up new superblock"); - if (ret) - goto err_unlock; - - if (dynamic_fault("bcachefs:add:no_slot")) - goto err_unlock; - - ret = bch2_sb_member_alloc(c); - if (ret < 0) { - bch_err_msg(c, ret, "setting up new superblock"); - goto err_unlock; - } - unsigned dev_idx = ret; - ret = 0; - - /* success: */ - - dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds()); - *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi; - - ca->disk_sb.sb->dev_idx = dev_idx; - bch2_dev_attach(c, ca, dev_idx); - - if (BCH_MEMBER_GROUP(&dev_mi)) { - ret = __bch2_dev_group_set(c, ca, label.buf); - bch_err_msg(c, ret, "creating new label"); - if (ret) - goto err_unlock; - } - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (test_bit(BCH_FS_started, &c->flags)) { - ret = bch2_dev_usage_init(ca, false); - if (ret) - goto err_late; - - ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); - bch_err_msg(ca, ret, "marking new superblock"); - if (ret) - goto err_late; - - ret = bch2_fs_freespace_init(c); - bch_err_msg(ca, ret, "initializing free space"); - if (ret) - goto err_late; - - if (ca->mi.state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); - - ret = bch2_dev_journal_alloc(ca, false); - bch_err_msg(c, ret, "allocating journal"); - if (ret) - goto err_late; - } - - /* - * We just changed the superblock UUID, invalidate cache and send a - * uevent to update /dev/disk/by-uuid - */ - invalidate_bdev(ca->disk_sb.bdev); - - char uuid_str[37]; - snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid); - - char *envp[] = { - "CHANGE=uuid", - uuid_str, - NULL, - }; - kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp); - - up_write(&c->state_lock); -out: - printbuf_exit(&label); - printbuf_exit(&errbuf); - bch_err_fn(c, ret); - return ret; - -err_unlock: - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); -err: - if (ca) - bch2_dev_free(ca); - bch2_free_super(&sb); - goto out; -err_late: - up_write(&c->state_lock); - ca = NULL; - goto err; -} - -/* Hot add existing device to running filesystem: */ -int bch2_dev_online(struct bch_fs *c, const char *path) -{ - struct bch_opts opts = bch2_opts_empty(); - struct bch_sb_handle sb = { NULL }; - struct bch_dev *ca; - unsigned dev_idx; - int ret; - - down_write(&c->state_lock); - - ret = bch2_read_super(path, &opts, &sb); - if (ret) { - up_write(&c->state_lock); - return ret; - } - - dev_idx = sb.sb->dev_idx; - - ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts); - bch_err_msg(c, ret, "bringing %s online", path); - if (ret) - goto err; - - ret = bch2_dev_attach_bdev(c, &sb); - if (ret) - goto err; - - ca = bch2_dev_locked(c, dev_idx); - - ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); - bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); - if (ret) - goto err; - - if (ca->mi.state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); - - if (!ca->mi.freespace_initialized) { - ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); - bch_err_msg(ca, ret, "initializing free space"); - if (ret) - goto err; - } - - if (!ca->journal.nr) { - ret = bch2_dev_journal_alloc(ca, false); - 
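
bch2_dev_add() above asks bch2_sb_member_alloc() for a free member slot before stamping the new member's last_mount. A sketch of slot allocation under the assumption, suggested by the removal path's memset of m->uuid, that an all-zero UUID marks an unused slot (the real helper also resizes the superblock):

    /* Illustrative member-slot allocation; not the real
     * bch2_sb_member_alloc(), and the zero-UUID convention is an
     * assumption drawn from the removal path above. */
    #include <errno.h>
    #include <string.h>

    #define MAX_DEVS 64

    struct member { unsigned char uuid[16]; };

    static int member_slot_alloc(struct member members[MAX_DEVS])
    {
        static const unsigned char unused[16];

        for (int i = 0; i < MAX_DEVS; i++)
            if (!memcmp(members[i].uuid, unused, sizeof(unused)))
                return i;       /* index doubles as the new dev_idx */
        return -ENOSPC;         /* no free slot in the member array */
    }
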
bch_err_msg(ca, ret, "allocating journal"); - if (ret) - goto err; - } - - mutex_lock(&c->sb_lock); - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = - cpu_to_le64(ktime_get_real_seconds()); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - up_write(&c->state_lock); - return 0; -err: - up_write(&c->state_lock); - bch2_free_super(&sb); - return ret; -} - -int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) -{ - down_write(&c->state_lock); - - if (!bch2_dev_is_online(ca)) { - bch_err(ca, "Already offline"); - up_write(&c->state_lock); - return 0; - } - - if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { - bch_err(ca, "Cannot offline required disk"); - up_write(&c->state_lock); - return bch_err_throw(c, device_state_not_allowed); - } - - __bch2_dev_offline(c, ca); - - up_write(&c->state_lock); - return 0; -} - -static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets) -{ - struct bch_fs *c = ca->fs; - u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 }; - - return bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod2(trans, false, v, dev_data_type, - .dev = ca->dev_idx, - .data_type = BCH_DATA_free)) ?: - bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets); -} - -int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -{ - struct bch_member *m; - u64 old_nbuckets; - int ret = 0; - - down_write(&c->state_lock); - old_nbuckets = ca->mi.nbuckets; - - if (nbuckets < ca->mi.nbuckets) { - bch_err(ca, "Cannot shrink yet"); - ret = -EINVAL; - goto err; - } - - if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { - bch_err(ca, "New device size too big (%llu greater than max %u)", - nbuckets, BCH_MEMBER_NBUCKETS_MAX); - ret = bch_err_throw(c, device_size_too_big); - goto err; - } - - if (bch2_dev_is_online(ca) && - get_capacity(ca->disk_sb.bdev->bd_disk) < - ca->mi.bucket_size * nbuckets) { - bch_err(ca, "New size larger than device"); - ret = bch_err_throw(c, device_size_too_small); - goto err; - } - - ret = bch2_dev_buckets_resize(c, ca, nbuckets); - bch_err_msg(ca, ret, "resizing buckets"); - if (ret) - goto err; - - ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); - if (ret) - goto err; - - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - m->nbuckets = cpu_to_le64(nbuckets); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (ca->mi.freespace_initialized) { - ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets); - if (ret) - goto err; - } - - bch2_recalc_capacity(c); -err: - up_write(&c->state_lock); - return ret; -} - -int bch2_fs_resize_on_mount(struct bch_fs *c) -{ - for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) { - u64 old_nbuckets = ca->mi.nbuckets; - u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk), - ca->mi.bucket_size); - - if (ca->mi.resize_on_mount && - new_nbuckets > ca->mi.nbuckets) { - bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size); - int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets); - bch_err_fn(ca, ret); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_fs_resize_on_mount); - up_write(&c->state_lock); - return ret; - } - - mutex_lock(&c->sb_lock); - struct bch_member *m = - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - m->nbuckets = cpu_to_le64(new_nbuckets); - SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false); - - c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image)); - 
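/*
 * Editorial sketch (not part of the removed code): the resize paths above
 * validate in a fixed order before touching any state: shrinking is rejected
 * outright, the new bucket count must fit the superblock member field, and an
 * online device must really have get_capacity() >= bucket_size * nbuckets
 * sectors. A condensed sketch of those guards; validate_resize() is a
 * hypothetical distillation, not the bcachefs API:
 */
#include <linux/blkdev.h>

static int validate_resize(struct gendisk *disk, u64 bucket_size_sectors,
			   u64 cur_nbuckets, u64 new_nbuckets, u64 max_nbuckets)
{
	if (new_nbuckets < cur_nbuckets)
		return -EINVAL;		/* "Cannot shrink yet" */

	if (new_nbuckets > max_nbuckets)
		return -ERANGE;		/* member field would overflow */

	/* get_capacity() and bucket sizes are both in 512-byte sectors */
	if (get_capacity(disk) < bucket_size_sectors * new_nbuckets)
		return -ENOSPC;		/* member would outgrow the device */

	return 0;
}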
bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (ca->mi.freespace_initialized) { - ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_fs_resize_on_mount); - up_write(&c->state_lock); - return ret; - } - } - } - } - return 0; -} - -/* return with ref on ca->ref: */ -struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) -{ - if (!strncmp(name, "/dev/", strlen("/dev/"))) - name += strlen("/dev/"); - - for_each_member_device(c, ca) - if (!strcmp(name, ca->name)) - return ca; - return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); -} - -/* blk_holder_ops: */ - -static struct bch_fs *bdev_get_fs(struct block_device *bdev) - __releases(&bdev->bd_holder_lock) -{ - struct bch_sb_handle_holder *holder = bdev->bd_holder; - struct bch_fs *c = holder->c; - - if (c && !bch2_ro_ref_tryget(c)) - c = NULL; - - mutex_unlock(&bdev->bd_holder_lock); - - if (c) - wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags)); - return c; -} - -/* returns with ref on ca->ref */ -static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev) -{ - for_each_member_device(c, ca) - if (ca->disk_sb.bdev == bdev) - return ca; - return NULL; -} - -static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) -{ - struct bch_fs *c = bdev_get_fs(bdev); - if (!c) - return; - - struct super_block *sb = c->vfs_sb; - if (sb) { - /* - * Not necessary, c->ro_ref guards against the filesystem being - * unmounted - we only take this to avoid a warning in - * sync_filesystem: - */ - down_read(&sb->s_umount); - } - - down_write(&c->state_lock); - struct bch_dev *ca = bdev_to_bch_dev(c, bdev); - if (!ca) - goto unlock; - - bool dev = bch2_dev_state_allowed(c, ca, - BCH_MEMBER_STATE_failed, - BCH_FORCE_IF_DEGRADED); - - if (!dev && sb) { - if (!surprise) - sync_filesystem(sb); - shrink_dcache_sb(sb); - evict_inodes(sb); - } - - struct printbuf buf = PRINTBUF; - __bch2_log_msg_start(ca->name, &buf); - - prt_printf(&buf, "offline from block layer"); - - if (dev) { - __bch2_dev_offline(c, ca); - } else { - bch2_journal_flush(&c->journal); - bch2_fs_emergency_read_only2(c, &buf); - } - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - bch2_dev_put(ca); -unlock: - if (sb) - up_read(&sb->s_umount); - up_write(&c->state_lock); - bch2_ro_ref_put(c); -} - -static void bch2_fs_bdev_sync(struct block_device *bdev) -{ - struct bch_fs *c = bdev_get_fs(bdev); - if (!c) - return; - - struct super_block *sb = c->vfs_sb; - if (sb) { - /* - * Not necessary, c->ro_ref guards against the filesystem being - * unmounted - we only take this to avoid a warning in - * sync_filesystem: - */ - down_read(&sb->s_umount); - sync_filesystem(sb); - up_read(&sb->s_umount); - } - - bch2_ro_ref_put(c); -} - -const struct blk_holder_ops bch2_sb_handle_bdev_ops = { - .mark_dead = bch2_fs_bdev_mark_dead, - .sync = bch2_fs_bdev_sync, -}; - -/* Filesystem open: */ - -static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) -{ - return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?: - cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); -} - -struct bch_fs *bch2_fs_open(darray_const_str *devices, - struct bch_opts *opts) -{ - bch_sb_handles sbs = {}; - struct bch_fs *c = NULL; - struct bch_sb_handle *best = NULL; - struct printbuf errbuf = PRINTBUF; - int ret = 0; - - if (!try_module_get(THIS_MODULE)) - return ERR_PTR(-ENODEV); - - if (!devices->nr) { - ret = -EINVAL; - goto err; - } - - ret = 
darray_make_room(&sbs, devices->nr); - if (ret) - goto err; - - darray_for_each(*devices, i) { - struct bch_sb_handle sb = { NULL }; - - ret = bch2_read_super(*i, opts, &sb); - if (ret) - goto err; - - BUG_ON(darray_push(&sbs, sb)); - } - - if (opts->nochanges && !opts->read_only) { - ret = bch_err_throw(c, erofs_nochanges); - goto err_print; - } - - darray_for_each(sbs, sb) - if (!best || sb_cmp(sb->sb, best->sb) > 0) - best = sb; - - darray_for_each_reverse(sbs, sb) { - ret = bch2_dev_in_fs(best, sb, opts); - - if (ret == -BCH_ERR_device_has_been_removed || - ret == -BCH_ERR_device_splitbrain) { - bch2_free_super(sb); - darray_remove_item(&sbs, sb); - best -= best > sb; - ret = 0; - continue; - } - - if (ret) - goto err_print; - } - - c = bch2_fs_alloc(best->sb, opts, &sbs); - ret = PTR_ERR_OR_ZERO(c); - if (ret) - goto err; - - down_write(&c->state_lock); - darray_for_each(sbs, sb) { - ret = bch2_dev_attach_bdev(c, sb); - if (ret) { - up_write(&c->state_lock); - goto err; - } - } - up_write(&c->state_lock); - - if (!c->opts.nostart) { - ret = bch2_fs_start(c); - if (ret) - goto err; - } -out: - darray_for_each(sbs, sb) - bch2_free_super(sb); - darray_exit(&sbs); - printbuf_exit(&errbuf); - module_put(THIS_MODULE); - return c; -err_print: - pr_err("bch_fs_open err opening %s: %s", - devices->data[0], bch2_err_str(ret)); -err: - if (!IS_ERR_OR_NULL(c)) - bch2_fs_stop(c); - c = ERR_PTR(ret); - goto out; -} - -/* Global interfaces/init */ - -static void bcachefs_exit(void) -{ - bch2_debug_exit(); - bch2_vfs_exit(); - bch2_chardev_exit(); - bch2_btree_key_cache_exit(); - if (bcachefs_kset) - kset_unregister(bcachefs_kset); -} - -static int __init bcachefs_init(void) -{ - bch2_bkey_pack_test(); - - if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || - bch2_btree_key_cache_init() || - bch2_chardev_init() || - bch2_vfs_init() || - bch2_debug_init()) - goto err; - - return 0; -err: - bcachefs_exit(); - return -ENOMEM; -} - -#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name); -BCH_DEBUG_PARAMS_ALL() -#undef BCH_DEBUG_PARAM - -static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp) -{ - /* Match bool exactly, by re-using it. */ - struct static_key *key = kp->arg; - struct kernel_param boolkp = *kp; - bool v; - int ret; - - boolkp.arg = &v; - - ret = param_set_bool(val, &boolkp); - if (ret) - return ret; - if (v) - static_key_enable(key); - else - static_key_disable(key); - return 0; -} - -static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp) -{ - struct static_key *key = kp->arg; - return sprintf(buffer, "%c\n", static_key_enabled(key) ? 
'N' : 'Y'); -} - -static const struct kernel_param_ops bch2_param_ops_static_key_t = { - .flags = KERNEL_PARAM_OPS_FL_NOARG, - .set = bch2_param_set_static_key_t, - .get = bch2_param_get_static_key_t, -}; - -#define BCH_DEBUG_PARAM(name, description) \ - module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\ - __MODULE_PARM_TYPE(name, "static_key_t"); \ - MODULE_PARM_DESC(name, description); -BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - -__maybe_unused -static unsigned bch2_metadata_version = bcachefs_metadata_version_current; -module_param_named(version, bch2_metadata_version, uint, 0444); - -module_exit(bcachefs_exit); -module_init(bcachefs_init); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h deleted file mode 100644 index e90bab9afe78..000000000000 --- a/fs/bcachefs/super.h +++ /dev/null @@ -1,55 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUPER_H -#define _BCACHEFS_SUPER_H - -#include "extents.h" - -#include "bcachefs_ioctl.h" - -#include <linux/math64.h> - -extern const char * const bch2_fs_flag_strs[]; -extern const char * const bch2_write_refs[]; -extern const char * const bch2_dev_read_refs[]; -extern const char * const bch2_dev_write_refs[]; - -struct bch_fs *bch2_dev_to_fs(dev_t); -struct bch_fs *bch2_uuid_to_fs(__uuid_t); - -bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, - enum bch_member_state, int); -int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, - enum bch_member_state, int); -int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, - enum bch_member_state, int); - -int bch2_dev_fail(struct bch_dev *, int); -int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); -int bch2_dev_add(struct bch_fs *, const char *); -int bch2_dev_online(struct bch_fs *, const char *); -int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); -int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); -struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); - -bool bch2_fs_emergency_read_only(struct bch_fs *); -bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *); - -bool bch2_fs_emergency_read_only_locked(struct bch_fs *); -void bch2_fs_read_only(struct bch_fs *); - -int bch2_fs_read_write(struct bch_fs *); -int bch2_fs_read_write_early(struct bch_fs *); - -int bch2_fs_resize_on_mount(struct bch_fs *); - -void __bch2_fs_stop(struct bch_fs *); -void bch2_fs_free(struct bch_fs *); -void bch2_fs_stop(struct bch_fs *); - -int bch2_fs_init_rw(struct bch_fs *); -int bch2_fs_start(struct bch_fs *); -struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *); - -extern const struct blk_holder_ops bch2_sb_handle_bdev_ops; - -#endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h deleted file mode 100644 index 3a899f799d1d..000000000000 --- a/fs/bcachefs/super_types.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUPER_TYPES_H -#define _BCACHEFS_SUPER_TYPES_H - -struct bch_fs; - -struct bch_sb_handle_holder { - struct bch_fs *c; -}; - -struct bch_sb_handle { - struct bch_sb *sb; - struct file *s_bdev_file; - struct block_device *bdev; - char *sb_name; - struct bio *bio; - struct bch_sb_handle_holder *holder; - size_t buffer_size; - blk_mode_t mode; - unsigned have_layout:1; - unsigned have_bio:1; - unsigned fs_sb:1; - u64 seq; -}; - -struct bch_devs_mask { - unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; -}; - -struct bch_devs_list { - u8 nr; - u8 data[BCH_BKEY_PTRS_MAX]; -}; - -#endif /* 
_BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c deleted file mode 100644 index 05848375cea2..000000000000 --- a/fs/bcachefs/sysfs.c +++ /dev/null @@ -1,914 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bcache sysfs interfaces - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. - */ - -#ifndef NO_BCACHEFS_SYSFS - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "sysfs.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_gc.h" -#include "buckets.h" -#include "clock.h" -#include "compress.h" -#include "disk_accounting.h" -#include "disk_groups.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "inode.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "keylist.h" -#include "move.h" -#include "movinggc.h" -#include "nocow_locking.h" -#include "opts.h" -#include "rebalance.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-errors.h" -#include "super-io.h" -#include "tests.h" - -#include <linux/blkdev.h> -#include <linux/sort.h> -#include <linux/sched/clock.h> - -#include "util.h" - -#define SYSFS_OPS(type) \ -const struct sysfs_ops type ## _sysfs_ops = { \ - .show = type ## _show, \ - .store = type ## _store \ -} - -#define SHOW(fn) \ -static ssize_t fn ## _to_text(struct printbuf *, \ - struct kobject *, struct attribute *); \ - \ -static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ - char *buf) \ -{ \ - struct printbuf out = PRINTBUF; \ - ssize_t ret = fn ## _to_text(&out, kobj, attr); \ - \ - if (out.pos && out.buf[out.pos - 1] != '\n') \ - prt_newline(&out); \ - \ - if (!ret && out.allocation_failure) \ - ret = -ENOMEM; \ - \ - if (!ret) { \ - ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ - memcpy(buf, out.buf, ret); \ - } \ - printbuf_exit(&out); \ - return bch2_err_class(ret); \ -} \ - \ -static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ - struct attribute *attr) - -#define STORE(fn) \ -static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\ - const char *, size_t); \ - \ -static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ - const char *buf, size_t size) \ -{ \ - return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \ -} \ - \ -static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\ - const char *buf, size_t size) - -#define __sysfs_attribute(_name, _mode) \ - static struct attribute sysfs_##_name = \ - { .name = #_name, .mode = _mode } - -#define write_attribute(n) __sysfs_attribute(n, 0200) -#define read_attribute(n) __sysfs_attribute(n, 0444) -#define rw_attribute(n) __sysfs_attribute(n, 0644) - -#define sysfs_printf(file, fmt, ...) 
\ -do { \ - if (attr == &sysfs_ ## file) \ - prt_printf(out, fmt "\n", __VA_ARGS__); \ -} while (0) - -#define sysfs_print(file, var) \ -do { \ - if (attr == &sysfs_ ## file) \ - snprint(out, var); \ -} while (0) - -#define sysfs_hprint(file, val) \ -do { \ - if (attr == &sysfs_ ## file) \ - prt_human_readable_s64(out, val); \ -} while (0) - -#define sysfs_strtoul(file, var) \ -do { \ - if (attr == &sysfs_ ## file) \ - return strtoul_safe(buf, var) ?: (ssize_t) size; \ -} while (0) - -#define sysfs_strtoul_clamp(file, var, min, max) \ -do { \ - if (attr == &sysfs_ ## file) \ - return strtoul_safe_clamp(buf, var, min, max) \ - ?: (ssize_t) size; \ -} while (0) - -#define strtoul_or_return(cp) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (_r) \ - return _r; \ - _v; \ -}) - -write_attribute(trigger_gc); -write_attribute(trigger_discards); -write_attribute(trigger_invalidates); -write_attribute(trigger_journal_commit); -write_attribute(trigger_journal_flush); -write_attribute(trigger_journal_writes); -write_attribute(trigger_btree_cache_shrink); -write_attribute(trigger_btree_key_cache_shrink); -write_attribute(trigger_btree_updates); -write_attribute(trigger_freelist_wakeup); -write_attribute(trigger_recalc_capacity); -write_attribute(trigger_delete_dead_snapshots); -write_attribute(trigger_emergency_read_only); -read_attribute(gc_gens_pos); - -read_attribute(uuid); -read_attribute(minor); -read_attribute(flags); -read_attribute(first_bucket); -read_attribute(nbuckets); -read_attribute(io_done); -read_attribute(io_errors); -write_attribute(io_errors_reset); - -read_attribute(io_latency_read); -read_attribute(io_latency_write); -read_attribute(io_latency_stats_read); -read_attribute(io_latency_stats_write); -read_attribute(congested); - -read_attribute(btree_write_stats); - -read_attribute(btree_cache_size); -read_attribute(compression_stats); -read_attribute(errors); -read_attribute(journal_debug); -read_attribute(btree_cache); -read_attribute(btree_key_cache); -read_attribute(btree_reserve_cache); -read_attribute(open_buckets); -read_attribute(open_buckets_partial); -read_attribute(nocow_lock_table); - -read_attribute(read_refs); -read_attribute(write_refs); - -read_attribute(internal_uuid); -read_attribute(disk_groups); - -read_attribute(has_data); -read_attribute(alloc_debug); -read_attribute(usage_base); - -#define x(t, n, ...) 
read_attribute(t); -BCH_PERSISTENT_COUNTERS() -#undef x - -rw_attribute(label); - -read_attribute(copy_gc_wait); - -sysfs_pd_controller_attribute(rebalance); -read_attribute(rebalance_status); -read_attribute(snapshot_delete_status); -read_attribute(recovery_status); - -read_attribute(new_stripes); - -read_attribute(io_timers_read); -read_attribute(io_timers_write); - -read_attribute(moving_ctxts); - -#ifdef CONFIG_BCACHEFS_TESTS -write_attribute(perf_test); -#endif /* CONFIG_BCACHEFS_TESTS */ - -#define x(_name) \ - static struct attribute sysfs_time_stat_##_name = \ - { .name = #_name, .mode = 0644 }; - BCH_TIME_STATS() -#undef x - -static size_t bch2_btree_cache_size(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - size_t ret = 0; - struct btree *b; - - mutex_lock(&bc->lock); - list_for_each_entry(b, &bc->live[0].list, list) - ret += btree_buf_bytes(b); - list_for_each_entry(b, &bc->live[1].list, list) - ret += btree_buf_bytes(b); - list_for_each_entry(b, &bc->freeable, list) - ret += btree_buf_bytes(b); - mutex_unlock(&bc->lock); - return ret; -} - -static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) -{ - prt_str(out, "type"); - printbuf_tabstop_push(out, 12); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 24); - prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n"); - - for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) { - struct disk_accounting_pos a; - disk_accounting_key_init(a, compression, .type = i); - struct bpos p = disk_accounting_pos_to_bpos(&a); - u64 v[3]; - bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v)); - - u64 nr_extents = v[0]; - u64 sectors_uncompressed = v[1]; - u64 sectors_compressed = v[2]; - - bch2_prt_compression_type(out, i); - prt_tab(out); - - prt_human_readable_u64(out, sectors_compressed << 9); - prt_tab_rjust(out); - - prt_human_readable_u64(out, sectors_uncompressed << 9); - prt_tab_rjust(out); - - prt_human_readable_u64(out, nr_extents - ? 
div64_u64(sectors_uncompressed << 9, nr_extents) - : 0); - prt_tab_rjust(out); - prt_newline(out); - } - - return 0; -} - -static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) -{ - bch2_btree_id_to_text(out, c->gc_gens_btree); - prt_printf(out, ": "); - bch2_bpos_to_text(out, c->gc_gens_pos); - prt_printf(out, "\n"); -} - -static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_fs_usage_base b = {}; - - acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64)); - - prt_printf(out, "hidden:\t\t%llu\n", b.hidden); - prt_printf(out, "btree:\t\t%llu\n", b.btree); - prt_printf(out, "data:\t\t%llu\n", b.data); - prt_printf(out, "cached:\t%llu\n", b.cached); - prt_printf(out, "reserved:\t\t%llu\n", b.reserved); - prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes); -} - -SHOW(bch2_fs) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - sysfs_print(minor, c->minor); - sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); - - if (attr == &sysfs_flags) - prt_bitflags(out, bch2_fs_flag_strs, c->flags); - - sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); - - if (attr == &sysfs_btree_write_stats) - bch2_btree_write_stats_to_text(out, c); - - if (attr == &sysfs_gc_gens_pos) - bch2_gc_gens_pos_to_text(out, c); - - sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ - - if (attr == &sysfs_copy_gc_wait) - bch2_copygc_wait_to_text(out, c); - - if (attr == &sysfs_rebalance_status) - bch2_rebalance_status_to_text(out, c); - - if (attr == &sysfs_snapshot_delete_status) - bch2_snapshot_delete_status_to_text(out, c); - - if (attr == &sysfs_recovery_status) - bch2_recovery_pass_status_to_text(out, c); - - /* Debugging: */ - - if (attr == &sysfs_journal_debug) - bch2_journal_debug_to_text(out, &c->journal); - - if (attr == &sysfs_btree_cache) - bch2_btree_cache_to_text(out, &c->btree_cache); - - if (attr == &sysfs_btree_key_cache) - bch2_btree_key_cache_to_text(out, &c->btree_key_cache); - - if (attr == &sysfs_btree_reserve_cache) - bch2_btree_reserve_cache_to_text(out, c); - - if (attr == &sysfs_open_buckets) - bch2_open_buckets_to_text(out, c, NULL); - - if (attr == &sysfs_open_buckets_partial) - bch2_open_buckets_partial_to_text(out, c); - - if (attr == &sysfs_compression_stats) - bch2_compression_stats_to_text(out, c); - - if (attr == &sysfs_errors) - bch2_fs_errors_to_text(out, c); - - if (attr == &sysfs_new_stripes) - bch2_new_stripes_to_text(out, c); - - if (attr == &sysfs_io_timers_read) - bch2_io_timers_to_text(out, &c->io_clock[READ]); - - if (attr == &sysfs_io_timers_write) - bch2_io_timers_to_text(out, &c->io_clock[WRITE]); - - if (attr == &sysfs_moving_ctxts) - bch2_fs_moving_ctxts_to_text(out, c); - - if (attr == &sysfs_write_refs) - enumerated_ref_to_text(out, &c->writes, bch2_write_refs); - - if (attr == &sysfs_nocow_lock_table) - bch2_nocow_locks_to_text(out, &c->nocow_locks); - - if (attr == &sysfs_disk_groups) - bch2_disk_groups_to_text(out, c); - - if (attr == &sysfs_alloc_debug) - bch2_fs_alloc_debug_to_text(out, c); - - if (attr == &sysfs_usage_base) - bch2_fs_usage_base_to_text(out, c); - - return 0; -} - -STORE(bch2_fs) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - sysfs_pd_controller_store(rebalance, &c->rebalance.pd); - - /* Debugging: */ - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EPERM; - - /* Debugging: */ - - if (attr == &sysfs_trigger_btree_updates) - queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); - - if 
(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)) - return -EROFS; - - if (attr == &sysfs_trigger_btree_cache_shrink) { - struct btree_cache *bc = &c->btree_cache; - struct shrink_control sc; - - sc.gfp_mask = GFP_KERNEL; - sc.nr_to_scan = strtoul_or_return(buf); - bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc); - } - - if (attr == &sysfs_trigger_btree_key_cache_shrink) { - struct shrink_control sc; - - sc.gfp_mask = GFP_KERNEL; - sc.nr_to_scan = strtoul_or_return(buf); - c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc); - } - - if (attr == &sysfs_trigger_gc) - bch2_gc_gens(c); - - if (attr == &sysfs_trigger_discards) - bch2_do_discards(c); - - if (attr == &sysfs_trigger_invalidates) - bch2_do_invalidates(c); - - if (attr == &sysfs_trigger_journal_commit) - bch2_journal_flush(&c->journal); - - if (attr == &sysfs_trigger_journal_flush) { - bch2_journal_flush_all_pins(&c->journal); - bch2_journal_meta(&c->journal); - } - - if (attr == &sysfs_trigger_journal_writes) - bch2_journal_do_writes(&c->journal); - - if (attr == &sysfs_trigger_freelist_wakeup) - closure_wake_up(&c->freelist_wait); - - if (attr == &sysfs_trigger_recalc_capacity) { - down_read(&c->state_lock); - bch2_recalc_capacity(c); - up_read(&c->state_lock); - } - - if (attr == &sysfs_trigger_delete_dead_snapshots) - __bch2_delete_dead_snapshots(c); - - if (attr == &sysfs_trigger_emergency_read_only) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "shutdown by sysfs\n"); - bch2_fs_emergency_read_only2(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - -#ifdef CONFIG_BCACHEFS_TESTS - if (attr == &sysfs_perf_test) { - char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; - char *test = strsep(&p, " \t\n"); - char *nr_str = strsep(&p, " \t\n"); - char *threads_str = strsep(&p, " \t\n"); - unsigned threads; - u64 nr; - int ret = -EINVAL; - - if (threads_str && - !(ret = kstrtouint(threads_str, 10, &threads)) && - !(ret = bch2_strtoull_h(nr_str, &nr))) - ret = bch2_btree_perf_test(c, test, nr, threads); - kfree(tmp); - - if (ret) - size = ret; - } -#endif - enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); - return size; -} -SYSFS_OPS(bch2_fs); - -struct attribute *bch2_fs_files[] = { - &sysfs_minor, - &sysfs_btree_cache_size, - &sysfs_btree_write_stats, - - &sysfs_rebalance_status, - &sysfs_snapshot_delete_status, - &sysfs_recovery_status, - - &sysfs_compression_stats, - &sysfs_errors, - -#ifdef CONFIG_BCACHEFS_TESTS - &sysfs_perf_test, -#endif - NULL -}; - -/* counters dir */ - -SHOW(bch2_fs_counters) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj); - u64 counter = 0; - u64 counter_since_mount = 0; - - printbuf_tabstop_push(out, 32); - - #define x(t, n, f, ...) \ - if (attr == &sysfs_##t) { \ - counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ - counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ - if (f & TYPE_SECTORS) { \ - counter <<= 9; \ - counter_since_mount <<= 9; \ - } \ - \ - prt_printf(out, "since mount:\t"); \ - (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\ - prt_human_readable_u64(out, counter_since_mount); \ - prt_newline(out); \ - \ - prt_printf(out, "since filesystem creation:\t"); \ - (f & TYPE_COUNTER) ? 
prt_u64(out, counter) : \ - prt_human_readable_u64(out, counter); \ - prt_newline(out); \ - } - BCH_PERSISTENT_COUNTERS() - #undef x - return 0; -} - -STORE(bch2_fs_counters) { - return 0; -} - -SYSFS_OPS(bch2_fs_counters); - -struct attribute *bch2_fs_counters_files[] = { -#define x(t, ...) \ - &sysfs_##t, - BCH_PERSISTENT_COUNTERS() -#undef x - NULL -}; -/* internal dir - just a wrapper */ - -SHOW(bch2_fs_internal) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, internal); - - return bch2_fs_to_text(out, &c->kobj, attr); -} - -STORE(bch2_fs_internal) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, internal); - - return bch2_fs_store(&c->kobj, attr, buf, size); -} -SYSFS_OPS(bch2_fs_internal); - -struct attribute *bch2_fs_internal_files[] = { - &sysfs_flags, - &sysfs_journal_debug, - &sysfs_btree_cache, - &sysfs_btree_key_cache, - &sysfs_btree_reserve_cache, - &sysfs_new_stripes, - &sysfs_open_buckets, - &sysfs_open_buckets_partial, - &sysfs_write_refs, - &sysfs_nocow_lock_table, - &sysfs_io_timers_read, - &sysfs_io_timers_write, - - &sysfs_trigger_gc, - &sysfs_trigger_discards, - &sysfs_trigger_invalidates, - &sysfs_trigger_journal_commit, - &sysfs_trigger_journal_flush, - &sysfs_trigger_journal_writes, - &sysfs_trigger_btree_cache_shrink, - &sysfs_trigger_btree_key_cache_shrink, - &sysfs_trigger_btree_updates, - &sysfs_trigger_freelist_wakeup, - &sysfs_trigger_recalc_capacity, - &sysfs_trigger_delete_dead_snapshots, - &sysfs_trigger_emergency_read_only, - - &sysfs_gc_gens_pos, - - &sysfs_copy_gc_wait, - - sysfs_pd_controller_files(rebalance), - - &sysfs_moving_ctxts, - - &sysfs_internal_uuid, - - &sysfs_disk_groups, - &sysfs_alloc_debug, - &sysfs_usage_base, - NULL -}; - -/* options */ - -static ssize_t sysfs_opt_show(struct bch_fs *c, - struct bch_dev *ca, - enum bch_opt_id id, - struct printbuf *out) -{ - const struct bch_option *opt = bch2_opt_table + id; - u64 v; - - if (opt->flags & OPT_FS) { - v = bch2_opt_get_by_id(&c->opts, id); - } else if ((opt->flags & OPT_DEVICE) && opt->get_member) { - v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx); - } else { - return -EINVAL; - } - - bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); - prt_char(out, '\n'); - return 0; -} - -static ssize_t sysfs_opt_store(struct bch_fs *c, - struct bch_dev *ca, - enum bch_opt_id id, - const char *buf, size_t size) -{ - const struct bch_option *opt = bch2_opt_table + id; - int ret = 0; - - /* - * We don't need to take c->writes for correctness, but it eliminates an - * unsightly error message in the dmesg log when we're RO: - */ - if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))) - return -EROFS; - - char *tmp = kstrdup(buf, GFP_KERNEL); - if (!tmp) { - ret = -ENOMEM; - goto err; - } - - u64 v; - ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: - bch2_opt_hook_pre_set(c, ca, id, v); - kfree(tmp); - - if (ret < 0) - goto err; - - bool is_sb = opt->get_sb || opt->get_member; - bool changed = false; - - if (is_sb) { - changed = bch2_opt_set_sb(c, ca, opt, v); - } else if (!ca) { - changed = bch2_opt_get_by_id(&c->opts, id) != v; - } else { - /* device options that aren't superblock options aren't - * supported */ - BUG(); - } - - if (!ca) - bch2_opt_set_by_id(&c->opts, id, v); - - if (changed) - bch2_opt_hook_post_set(c, ca, 0, &c->opts, id); - - ret = size; -err: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); - return ret; -} - -SHOW(bch2_fs_opts_dir) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - int id = 
bch2_opt_lookup(attr->name); - if (id < 0) - return 0; - - return sysfs_opt_show(c, NULL, id, out); -} - -STORE(bch2_fs_opts_dir) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - int id = bch2_opt_lookup(attr->name); - if (id < 0) - return 0; - - return sysfs_opt_store(c, NULL, id, buf, size); -} -SYSFS_OPS(bch2_fs_opts_dir); - -struct attribute *bch2_fs_opts_dir_files[] = { NULL }; - -int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) -{ - for (const struct bch_option *i = bch2_opt_table; - i < bch2_opt_table + bch2_opts_nr; - i++) { - if (i->flags & OPT_HIDDEN) - continue; - if (!(i->flags & type)) - continue; - - int ret = sysfs_create_file(kobj, &i->attr); - if (ret) - return ret; - } - - return 0; -} - -/* time stats */ - -SHOW(bch2_fs_time_stats) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); - -#define x(name) \ - if (attr == &sysfs_time_stat_##name) \ - bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); - BCH_TIME_STATS() -#undef x - - return 0; -} - -STORE(bch2_fs_time_stats) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); - -#define x(name) \ - if (attr == &sysfs_time_stat_##name) \ - bch2_time_stats_reset(&c->times[BCH_TIME_##name]); - BCH_TIME_STATS() -#undef x - return size; -} -SYSFS_OPS(bch2_fs_time_stats); - -struct attribute *bch2_fs_time_stats_files[] = { -#define x(name) \ - &sysfs_time_stat_##name, - BCH_TIME_STATS() -#undef x - NULL -}; - -static const char * const bch2_rw[] = { - "read", - "write", - NULL -}; - -static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca) -{ - int rw, i; - - for (rw = 0; rw < 2; rw++) { - prt_printf(out, "%s:\n", bch2_rw[rw]); - - for (i = 1; i < BCH_DATA_NR; i++) - prt_printf(out, "%-12s:%12llu\n", - bch2_data_type_str(i), - percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); - } -} - -SHOW(bch2_dev) -{ - struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); - struct bch_fs *c = ca->fs; - - sysfs_printf(uuid, "%pU\n", ca->uuid.b); - - sysfs_print(first_bucket, ca->mi.first_bucket); - sysfs_print(nbuckets, ca->mi.nbuckets); - - if (attr == &sysfs_label) { - if (ca->mi.group) - bch2_disk_path_to_text(out, c, ca->mi.group - 1); - prt_char(out, '\n'); - } - - if (attr == &sysfs_has_data) { - prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca)); - prt_char(out, '\n'); - } - - if (attr == &sysfs_io_done) - dev_io_done_to_text(out, ca); - - if (attr == &sysfs_io_errors) - bch2_dev_io_errors_to_text(out, ca); - - sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); - sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); - - if (attr == &sysfs_io_latency_stats_read) - bch2_time_stats_to_text(out, &ca->io_latency[READ].stats); - - if (attr == &sysfs_io_latency_stats_write) - bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats); - - sysfs_printf(congested, "%u%%", - clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) - * 100 / CONGESTED_MAX); - - if (attr == &sysfs_alloc_debug) - bch2_dev_alloc_debug_to_text(out, ca); - - if (attr == &sysfs_open_buckets) - bch2_open_buckets_to_text(out, c, ca); - - int opt_id = bch2_opt_lookup(attr->name); - if (opt_id >= 0) - return sysfs_opt_show(c, ca, opt_id, out); - - if (attr == &sysfs_read_refs) - enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs); - - if (attr == &sysfs_write_refs) - enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs); - - return 0; -} - -STORE(bch2_dev) -{ - struct bch_dev *ca = 
container_of(kobj, struct bch_dev, kobj); - struct bch_fs *c = ca->fs; - - if (attr == &sysfs_label) { - char *tmp; - int ret; - - tmp = kstrdup(buf, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - - ret = bch2_dev_group_set(c, ca, strim(tmp)); - kfree(tmp); - if (ret) - return ret; - } - - if (attr == &sysfs_io_errors_reset) - bch2_dev_errors_reset(ca); - - int opt_id = bch2_opt_lookup(attr->name); - if (opt_id >= 0) - return sysfs_opt_store(c, ca, opt_id, buf, size); - - return size; -} -SYSFS_OPS(bch2_dev); - -struct attribute *bch2_dev_files[] = { - &sysfs_uuid, - &sysfs_first_bucket, - &sysfs_nbuckets, - - /* settings: */ - &sysfs_label, - - &sysfs_has_data, - &sysfs_io_done, - &sysfs_io_errors, - &sysfs_io_errors_reset, - - &sysfs_io_latency_read, - &sysfs_io_latency_write, - &sysfs_io_latency_stats_read, - &sysfs_io_latency_stats_write, - &sysfs_congested, - - /* debug: */ - &sysfs_alloc_debug, - &sysfs_open_buckets, - - &sysfs_read_refs, - &sysfs_write_refs, - NULL -}; - -#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h deleted file mode 100644 index 303e0433c702..000000000000 --- a/fs/bcachefs/sysfs.h +++ /dev/null @@ -1,49 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SYSFS_H_ -#define _BCACHEFS_SYSFS_H_ - -#include <linux/sysfs.h> - -#ifndef NO_BCACHEFS_SYSFS - -struct attribute; -struct sysfs_ops; - -extern struct attribute *bch2_fs_files[]; -extern struct attribute *bch2_fs_counters_files[]; -extern struct attribute *bch2_fs_internal_files[]; -extern struct attribute *bch2_fs_opts_dir_files[]; -extern struct attribute *bch2_fs_time_stats_files[]; -extern struct attribute *bch2_dev_files[]; - -extern const struct sysfs_ops bch2_fs_sysfs_ops; -extern const struct sysfs_ops bch2_fs_counters_sysfs_ops; -extern const struct sysfs_ops bch2_fs_internal_sysfs_ops; -extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -extern const struct sysfs_ops bch2_dev_sysfs_ops; - -int bch2_opts_create_sysfs_files(struct kobject *, unsigned); - -#else - -static struct attribute *bch2_fs_files[] = {}; -static struct attribute *bch2_fs_counters_files[] = {}; -static struct attribute *bch2_fs_internal_files[] = {}; -static struct attribute *bch2_fs_opts_dir_files[] = {}; -static struct attribute *bch2_fs_time_stats_files[] = {}; -static struct attribute *bch2_dev_files[] = {}; - -static const struct sysfs_ops bch2_fs_sysfs_ops; -static const struct sysfs_ops bch2_fs_counters_sysfs_ops; -static const struct sysfs_ops bch2_fs_internal_sysfs_ops; -static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -static const struct sysfs_ops bch2_dev_sysfs_ops; - -static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) -{ return 0; } - -#endif /* NO_BCACHEFS_SYSFS */ - -#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c deleted file mode 100644 index 782a05fe7656..000000000000 --- a/fs/bcachefs/tests.c +++ /dev/null @@ -1,891 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifdef CONFIG_BCACHEFS_TESTS - -#include "bcachefs.h" -#include "btree_update.h" -#include "journal_reclaim.h" -#include "snapshot.h" -#include "tests.h" - -#include "linux/kthread.h" -#include "linux/random.h" - -static void delete_test_keys(struct bch_fs *c) -{ - int ret; - - ret = bch2_btree_delete_range(c, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), - POS(0, U64_MAX), - 0, NULL); - BUG_ON(ret); - - 
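/*
 * Editorial sketch (not part of the removed code): the sysfs.h header deleted
 * above is a textbook config-stub idiom. With sysfs support compiled out,
 * every extern declaration is replaced by an empty static object or an inline
 * no-op, so call sites build unchanged with no #ifdefs of their own. Reduced
 * to a single interface (the foo_* names are placeholders):
 */
#ifndef _FOO_SYSFS_H
#define _FOO_SYSFS_H

#include <linux/sysfs.h>

#ifdef CONFIG_FOO_SYSFS

extern struct attribute *foo_files[];
int foo_create_sysfs_files(struct kobject *kobj);

#else /* stubbed out: callers compile unchanged */

static struct attribute *foo_files[] = {};

static inline int foo_create_sysfs_files(struct kobject *kobj)
{ return 0; }

#endif

#endif /* _FOO_SYSFS_H */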
ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - POS(0, U64_MAX), - 0, NULL); - BUG_ON(ret); -} - -/* unit tests */ - -static int test_delete(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_i_cookie k; - int ret; - - bkey_cookie_init(&k.k_i); - k.k.p.snapshot = U32_MAX; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_intent); - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, &k.k_i, 0)); - bch_err_msg(c, ret, "update error"); - if (ret) - goto err; - - pr_info("deleting once"); - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, 0)); - bch_err_msg(c, ret, "delete error (first)"); - if (ret) - goto err; - - pr_info("deleting twice"); - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, 0)); - bch_err_msg(c, ret, "delete error (second)"); - if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int test_delete_written(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_i_cookie k; - int ret; - - bkey_cookie_init(&k.k_i); - k.k.p.snapshot = U32_MAX; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_intent); - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, &k.k_i, 0)); - bch_err_msg(c, ret, "update error"); - if (ret) - goto err; - - bch2_trans_unlock(trans); - bch2_journal_flush_all_pins(&c->journal); - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, 0)); - bch_err_msg(c, ret, "delete error"); - if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int test_iterate(struct bch_fs *c, u64 nr) -{ - u64 i; - int ret = 0; - - delete_test_keys(c); - - pr_info("inserting test keys"); - - for (i = 0; i < nr; i++) { - struct bkey_i_cookie ck; - - bkey_cookie_init(&ck.k_i); - ck.k.p.offset = i; - ck.k.p.snapshot = U32_MAX; - - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "insert error"); - if (ret) - return ret; - } - - pr_info("iterating forwards"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i++); - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards"); - if (ret) - return ret; - - BUG_ON(i != nr); - - pr_info("iterating backwards"); - - ret = bch2_trans_run(c, - for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, - SPOS(0, U64_MAX, U32_MAX), 0, k, ({ - BUG_ON(k.k->p.offset != --i); - 0; - }))); - bch_err_msg(c, ret, "error iterating backwards"); - if (ret) - return ret; - - BUG_ON(i); - return 0; -} - -static int test_iterate_extents(struct bch_fs *c, u64 nr) -{ - u64 i; - int ret = 0; - - delete_test_keys(c); - - pr_info("inserting test extents"); - - for (i = 0; i < nr; i += 8) { - struct bkey_i_cookie ck; - - bkey_cookie_init(&ck.k_i); - ck.k.p.offset = i + 8; - ck.k.p.snapshot = U32_MAX; - ck.k.size = 8; - - ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "insert error"); - if (ret) - return ret; - } 
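/*
 * Editorial sketch (not part of the removed code): worth spelling out for
 * readers of the extent tests here: bcachefs extents are keyed by their *end*
 * position, so the insert loop above stores the range [i, i + 8) as
 * p.offset = i + 8 with size = 8. That is why the forward pass below asserts
 * on bkey_start_offset(k.k) while the backward pass asserts on k.k->p.offset.
 */
/* an extent key describes [p.offset - size, p.offset); the start is derived */
static inline u64 extent_start_offset(u64 end_offset, u32 size)
{
	return end_offset - size;	/* what bkey_start_offset() computes */
}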
- - pr_info("iterating forwards"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i); - i = k.k->p.offset; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards"); - if (ret) - return ret; - - BUG_ON(i != nr); - - pr_info("iterating backwards"); - - ret = bch2_trans_run(c, - for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, - SPOS(0, U64_MAX, U32_MAX), 0, k, ({ - BUG_ON(k.k->p.offset != i); - i = bkey_start_offset(k.k); - 0; - }))); - bch_err_msg(c, ret, "error iterating backwards"); - if (ret) - return ret; - - BUG_ON(i); - return 0; -} - -static int test_iterate_slots(struct bch_fs *c, u64 nr) -{ - u64 i; - int ret = 0; - - delete_test_keys(c); - - pr_info("inserting test keys"); - - for (i = 0; i < nr; i++) { - struct bkey_i_cookie ck; - - bkey_cookie_init(&ck.k_i); - ck.k.p.offset = i * 2; - ck.k.p.snapshot = U32_MAX; - - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "insert error"); - if (ret) - return ret; - } - - pr_info("iterating forwards"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i); - i += 2; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards"); - if (ret) - return ret; - - BUG_ON(i != nr * 2); - - pr_info("iterating forwards by slots"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_slots, k, ({ - if (i >= nr * 2) - break; - - BUG_ON(k.k->p.offset != i); - BUG_ON(bkey_deleted(k.k) != (i & 1)); - - i++; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards by slots"); - return ret; -} - -static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) -{ - u64 i; - int ret = 0; - - delete_test_keys(c); - - pr_info("inserting test keys"); - - for (i = 0; i < nr; i += 16) { - struct bkey_i_cookie ck; - - bkey_cookie_init(&ck.k_i); - ck.k.p.offset = i + 16; - ck.k.p.snapshot = U32_MAX; - ck.k.size = 8; - - ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "insert error"); - if (ret) - return ret; - } - - pr_info("iterating forwards"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i + 8); - BUG_ON(k.k->size != 8); - i += 16; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards"); - if (ret) - return ret; - - BUG_ON(i != nr); - - pr_info("iterating forwards by slots"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_slots, k, ({ - if (i == nr) - break; - BUG_ON(bkey_deleted(k.k) != !(i % 16)); - - BUG_ON(bkey_start_offset(k.k) != i); - BUG_ON(k.k->size != 8); - i = k.k->p.offset; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards by slots"); - return ret; -} - -/* - * XXX: we really want to make sure we've got a btree with depth > 0 for these - * tests - */ -static int test_peek_end(struct bch_fs *c, u64 nr) -{ - delete_test_keys(c); - - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); - - lockrestart_do(trans, bkey_err(k = 
bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return 0; -} - -static int test_peek_end_extents(struct bch_fs *c, u64 nr) -{ - delete_test_keys(c); - - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return 0; -} - -/* extent unit tests */ - -static u64 test_version; - -static int insert_test_extent(struct bch_fs *c, - u64 start, u64 end) -{ - struct bkey_i_cookie k; - int ret; - - bkey_cookie_init(&k.k_i); - k.k_i.k.p.offset = end; - k.k_i.k.p.snapshot = U32_MAX; - k.k_i.k.size = end - start; - k.k_i.k.bversion.lo = test_version++; - - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0); - bch_err_fn(c, ret); - return ret; -} - -static int __test_extent_overwrite(struct bch_fs *c, - u64 e1_start, u64 e1_end, - u64 e2_start, u64 e2_end) -{ - int ret; - - ret = insert_test_extent(c, e1_start, e1_end) ?: - insert_test_extent(c, e2_start, e2_end); - - delete_test_keys(c); - return ret; -} - -static int test_extent_overwrite_front(struct bch_fs *c, u64 nr) -{ - return __test_extent_overwrite(c, 0, 64, 0, 32) ?: - __test_extent_overwrite(c, 8, 64, 0, 32); -} - -static int test_extent_overwrite_back(struct bch_fs *c, u64 nr) -{ - return __test_extent_overwrite(c, 0, 64, 32, 64) ?: - __test_extent_overwrite(c, 0, 64, 32, 72); -} - -static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr) -{ - return __test_extent_overwrite(c, 0, 64, 32, 40); -} - -static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) -{ - return __test_extent_overwrite(c, 32, 64, 0, 64) ?: - __test_extent_overwrite(c, 32, 64, 0, 128) ?: - __test_extent_overwrite(c, 32, 64, 32, 64) ?: - __test_extent_overwrite(c, 32, 64, 32, 128); -} - -static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid) -{ - struct bkey_i_cookie k; - int ret; - - bkey_cookie_init(&k.k_i); - k.k_i.k.p.inode = inum; - k.k_i.k.p.offset = start + len; - k.k_i.k.p.snapshot = snapid; - k.k_i.k.size = len; - - ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, - BTREE_UPDATE_internal_snapshot_node)); - bch_err_fn(c, ret); - return ret; -} - -static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) -{ - return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */ - insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?: - insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?: - insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */ - insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?: - insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?: - insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX); -} - -/* snapshot unit tests */ - -/* Test skipping over keys in unrelated snapshots: */ -static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) -{ - struct btree_trans *trans; - struct btree_iter 
iter; - struct bkey_s_c k; - struct bkey_i_cookie cookie; - int ret; - - bkey_cookie_init(&cookie.k_i); - cookie.k.p.snapshot = snapid_hi; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); - if (ret) - return ret; - - trans = bch2_trans_get(c); - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, snapid_lo), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - - BUG_ON(k.k->p.snapshot != U32_MAX); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int test_snapshots(struct bch_fs *c, u64 nr) -{ - struct bkey_i_cookie cookie; - u32 snapids[2]; - u32 snapid_subvols[2] = { 1, 1 }; - int ret; - - bkey_cookie_init(&cookie.k_i); - cookie.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); - if (ret) - return ret; - - ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_snapshot_node_create(trans, U32_MAX, - snapids, - snapid_subvols, - 2)); - if (ret) - return ret; - - if (snapids[0] > snapids[1]) - swap(snapids[0], snapids[1]); - - ret = test_snapshot_filter(c, snapids[0], snapids[1]); - bch_err_msg(c, ret, "from test_snapshot_filter"); - return ret; -} - -/* perf tests */ - -static u64 test_rand(void) -{ - u64 v; - - get_random_bytes(&v, sizeof(v)); - return v; -} - -static int rand_insert(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bkey_i_cookie k; - int ret = 0; - u64 i; - - for (i = 0; i < nr; i++) { - bkey_cookie_init(&k.k_i); - k.k.p.offset = test_rand(); - k.k.p.snapshot = U32_MAX; - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0)); - if (ret) - break; - } - - bch2_trans_put(trans); - return ret; -} - -static int rand_insert_multi(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bkey_i_cookie k[8]; - int ret = 0; - unsigned j; - u64 i; - - for (i = 0; i < nr; i += ARRAY_SIZE(k)) { - for (j = 0; j < ARRAY_SIZE(k); j++) { - bkey_cookie_init(&k[j].k_i); - k[j].k.p.offset = test_rand(); - k[j].k.p.snapshot = U32_MAX; - } - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0)); - if (ret) - break; - } - - bch2_trans_put(trans); - return ret; -} - -static int rand_lookup(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - u64 i; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); - - for (i = 0; i < nr; i++) { - bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX)); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter))); - ret = bkey_err(k); - if (ret) - break; - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int rand_mixed_trans(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i_cookie *cookie, - u64 i, u64 pos) -{ - struct bkey_s_c k; - int ret; - - 
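/*
 * Editorial sketch (not part of the removed code): every lookup in these perf
 * tests runs under lockrestart_do() or commit_do(), because btree operations
 * can fail with a transaction-restart error class meaning "locks were
 * dropped, retry from the top" rather than a real error. A schematic version
 * of the idiom, assuming bch2_trans_begin()/bch2_err_matches() semantics; it
 * is not the exact bcachefs macro:
 */
#define retry_on_restart(_trans, _do)					\
({									\
	int _ret;							\
	do {								\
		bch2_trans_begin(_trans);	/* reset the transaction */ \
		_ret = (_do);						\
	} while (bch2_err_matches(_ret, BCH_ERR_transaction_restart));	\
	_ret;								\
})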
bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX)); - - k = bch2_btree_iter_peek(trans, iter); - ret = bkey_err(k); - bch_err_msg(trans->c, ret, "lookup error"); - if (ret) - return ret; - - if (!(i & 3) && k.k) { - bkey_cookie_init(&cookie->k_i); - cookie->k.p = iter->pos; - ret = bch2_trans_update(trans, iter, &cookie->k_i, 0); - } - - return ret; -} - -static int rand_mixed(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_i_cookie cookie; - int ret = 0; - u64 i, rand; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); - - for (i = 0; i < nr; i++) { - rand = test_rand(); - ret = commit_do(trans, NULL, NULL, 0, - rand_mixed_trans(trans, &iter, &cookie, i, rand)); - if (ret) - break; - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int __do_delete(struct btree_trans *trans, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, - BTREE_ITER_intent); - k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k) - goto err; - - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int rand_delete(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - u64 i; - - for (i = 0; i < nr; i++) { - struct bpos pos = SPOS(0, test_rand(), U32_MAX); - - ret = commit_do(trans, NULL, NULL, 0, - __do_delete(trans, pos)); - if (ret) - break; - } - - bch2_trans_put(trans); - return ret; -} - -static int seq_insert(struct bch_fs *c, u64 nr) -{ - struct bkey_i_cookie insert; - - bkey_cookie_init(&insert.k_i); - - return bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - BTREE_ITER_slots|BTREE_ITER_intent, k, - NULL, NULL, 0, ({ - if (iter.pos.offset >= nr) - break; - insert.k.p = iter.pos; - bch2_trans_update(trans, &iter, &insert.k_i, 0); - }))); -} - -static int seq_lookup(struct bch_fs *c, u64 nr) -{ - return bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, - 0)); -} - -static int seq_overwrite(struct bch_fs *c, u64 nr) -{ - return bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - BTREE_ITER_intent, k, - NULL, NULL, 0, ({ - struct bkey_i_cookie u; - - bkey_reassemble(&u.k_i, k); - bch2_trans_update(trans, &iter, &u.k_i, 0); - }))); -} - -static int seq_delete(struct bch_fs *c, u64 nr) -{ - return bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - POS(0, U64_MAX), - 0, NULL); -} - -typedef int (*perf_test_fn)(struct bch_fs *, u64); - -struct test_job { - struct bch_fs *c; - u64 nr; - unsigned nr_threads; - perf_test_fn fn; - - atomic_t ready; - wait_queue_head_t ready_wait; - - atomic_t done; - struct completion done_completion; - - u64 start; - u64 finish; - int ret; -}; - -static int btree_perf_test_thread(void *data) -{ - struct test_job *j = data; - int ret; - - if (atomic_dec_and_test(&j->ready)) { - wake_up(&j->ready_wait); - j->start = sched_clock(); - } else { - wait_event(j->ready_wait, !atomic_read(&j->ready)); - } - - ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); - if (ret) { - bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret)); - j->ret = ret; - } - - if (atomic_dec_and_test(&j->done)) { - j->finish 
= sched_clock(); - complete(&j->done_completion); - } - - return 0; -} - -int bch2_btree_perf_test(struct bch_fs *c, const char *testname, - u64 nr, unsigned nr_threads) -{ - struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; - char name_buf[20]; - struct printbuf nr_buf = PRINTBUF; - struct printbuf per_sec_buf = PRINTBUF; - unsigned i; - u64 time; - - if (nr == 0 || nr_threads == 0) { - pr_err("nr of iterations or threads is not allowed to be 0"); - return -EINVAL; - } - - atomic_set(&j.ready, nr_threads); - init_waitqueue_head(&j.ready_wait); - - atomic_set(&j.done, nr_threads); - init_completion(&j.done_completion); - -#define perf_test(_test) \ - if (!strcmp(testname, #_test)) j.fn = _test - - perf_test(rand_insert); - perf_test(rand_insert_multi); - perf_test(rand_lookup); - perf_test(rand_mixed); - perf_test(rand_delete); - - perf_test(seq_insert); - perf_test(seq_lookup); - perf_test(seq_overwrite); - perf_test(seq_delete); - - /* a unit test, not a perf test: */ - perf_test(test_delete); - perf_test(test_delete_written); - perf_test(test_iterate); - perf_test(test_iterate_extents); - perf_test(test_iterate_slots); - perf_test(test_iterate_slots_extents); - perf_test(test_peek_end); - perf_test(test_peek_end_extents); - - perf_test(test_extent_overwrite_front); - perf_test(test_extent_overwrite_back); - perf_test(test_extent_overwrite_middle); - perf_test(test_extent_overwrite_all); - perf_test(test_extent_create_overlapping); - - perf_test(test_snapshots); - - if (!j.fn) { - pr_err("unknown test %s", testname); - return -EINVAL; - } - - //pr_info("running test %s:", testname); - - if (nr_threads == 1) - btree_perf_test_thread(&j); - else - for (i = 0; i < nr_threads; i++) - kthread_run(btree_perf_test_thread, &j, - "bcachefs perf test[%u]", i); - - while (wait_for_completion_interruptible(&j.done_completion)) - ; - - time = j.finish - j.start; - - scnprintf(name_buf, sizeof(name_buf), "%s:", testname); - prt_human_readable_u64(&nr_buf, nr); - prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); - printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", - name_buf, nr_buf.buf, nr_threads, - div_u64(time, NSEC_PER_SEC), - div_u64(time * nr_threads, nr), - per_sec_buf.buf); - printbuf_exit(&per_sec_buf); - printbuf_exit(&nr_buf); - return j.ret; -} - -#endif /* CONFIG_BCACHEFS_TESTS */ diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h deleted file mode 100644 index c73b18aea7e0..000000000000 --- a/fs/bcachefs/tests.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_TEST_H -#define _BCACHEFS_TEST_H - -struct bch_fs; - -#ifdef CONFIG_BCACHEFS_TESTS - -int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); - -#else - -#endif /* CONFIG_BCACHEFS_TESTS */ - -#endif /* _BCACHEFS_TEST_H */ diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c deleted file mode 100644 index 314a24d15d4e..000000000000 --- a/fs/bcachefs/thread_with_file.c +++ /dev/null @@ -1,494 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "thread_with_file.h" - -#include <linux/anon_inodes.h> -#include <linux/file.h> -#include <linux/kthread.h> -#include <linux/pagemap.h> -#include <linux/poll.h> -#include <linux/sched/sysctl.h> - -void bch2_thread_with_file_exit(struct thread_with_file *thr) -{ - if (thr->task) { - kthread_stop(thr->task); - put_task_struct(thr->task); - } -} - -int 
bch2_run_thread_with_file(struct thread_with_file *thr, - const struct file_operations *fops, - int (*fn)(void *)) -{ - struct file *file = NULL; - int ret, fd = -1; - unsigned fd_flags = O_CLOEXEC; - - if (fops->read && fops->write) - fd_flags |= O_RDWR; - else if (fops->read) - fd_flags |= O_RDONLY; - else if (fops->write) - fd_flags |= O_WRONLY; - - char name[TASK_COMM_LEN]; - get_task_comm(name, current); - - thr->ret = 0; - thr->task = kthread_create(fn, thr, "%s", name); - ret = PTR_ERR_OR_ZERO(thr->task); - if (ret) - return ret; - - ret = get_unused_fd_flags(fd_flags); - if (ret < 0) - goto err; - fd = ret; - - file = anon_inode_getfile(name, fops, thr, fd_flags); - ret = PTR_ERR_OR_ZERO(file); - if (ret) - goto err; - - get_task_struct(thr->task); - wake_up_process(thr->task); - fd_install(fd, file); - return fd; -err: - if (fd >= 0) - put_unused_fd(fd); - if (thr->task) - kthread_stop(thr->task); - return ret; -} - -/* stdio_redirect */ - -static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen) -{ - return stdio->input.buf.nr > seen || stdio->done; -} - -static bool stdio_redirect_has_input(struct stdio_redirect *stdio) -{ - return stdio_redirect_has_more_input(stdio, 0); -} - -static bool stdio_redirect_has_output(struct stdio_redirect *stdio) -{ - return stdio->output.buf.nr || stdio->done; -} - -#define STDIO_REDIRECT_BUFSIZE 4096 - -static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) -{ - return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; -} - -static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) -{ - return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; -} - -static void stdio_buf_init(struct stdio_buf *buf) -{ - spin_lock_init(&buf->lock); - init_waitqueue_head(&buf->wait); - darray_init(&buf->buf); -} - -/* thread_with_stdio */ - -static void thread_with_stdio_done(struct thread_with_stdio *thr) -{ - thr->thr.done = true; - thr->stdio.done = true; - wake_up(&thr->stdio.input.wait); - wake_up(&thr->stdio.output.wait); -} - -static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, - size_t len, loff_t *ppos) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - struct stdio_buf *buf = &thr->stdio.output; - size_t copied = 0, b; - int ret = 0; - - if (!(file->f_flags & O_NONBLOCK)) { - ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio)); - if (ret) - return ret; - } else if (!stdio_redirect_has_output(&thr->stdio)) - return -EAGAIN; - - while (len && buf->buf.nr) { - if (fault_in_writeable(ubuf, len) == len) { - ret = -EFAULT; - break; - } - - spin_lock_irq(&buf->lock); - b = min_t(size_t, len, buf->buf.nr); - - if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) { - ubuf += b; - len -= b; - copied += b; - buf->buf.nr -= b; - memmove(buf->buf.data, - buf->buf.data + b, - buf->buf.nr); - } - spin_unlock_irq(&buf->lock); - } - - return copied ?: ret; -} - -static int thread_with_stdio_release(struct inode *inode, struct file *file) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - thread_with_stdio_done(thr); - bch2_thread_with_file_exit(&thr->thr); - darray_exit(&thr->stdio.input.buf); - darray_exit(&thr->stdio.output.buf); - thr->ops->exit(thr); - return 0; -} - -static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, - size_t len, loff_t *ppos) -{ - struct thread_with_stdio *thr = - 
container_of(file->private_data, struct thread_with_stdio, thr); - struct stdio_buf *buf = &thr->stdio.input; - size_t copied = 0; - ssize_t ret = 0; - - while (len) { - if (thr->thr.done) { - ret = -EPIPE; - break; - } - - size_t b = len - fault_in_readable(ubuf, len); - if (!b) { - ret = -EFAULT; - break; - } - - spin_lock(&buf->lock); - size_t makeroom = b; - if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr)) - makeroom = min_t(ssize_t, makeroom, - max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr, - 0)); - darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT); - - b = min(len, darray_room(buf->buf)); - - if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { - buf->buf.nr += b; - ubuf += b; - len -= b; - copied += b; - } - spin_unlock(&buf->lock); - - if (b) { - wake_up(&buf->wait); - } else { - if ((file->f_flags & O_NONBLOCK)) { - ret = -EAGAIN; - break; - } - - ret = wait_event_interruptible(buf->wait, - stdio_redirect_has_input_space(&thr->stdio)); - if (ret) - break; - } - } - - return copied ?: ret; -} - -static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - poll_wait(file, &thr->stdio.output.wait, wait); - poll_wait(file, &thr->stdio.input.wait, wait); - - __poll_t mask = 0; - - if (stdio_redirect_has_output(&thr->stdio)) - mask |= EPOLLIN; - if (stdio_redirect_has_input_space(&thr->stdio)) - mask |= EPOLLOUT; - if (thr->thr.done) - mask |= EPOLLHUP|EPOLLERR; - return mask; -} - -static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - poll_wait(file, &thr->stdio.output.wait, wait); - - __poll_t mask = 0; - - if (stdio_redirect_has_output(&thr->stdio)) - mask |= EPOLLIN; - if (thr->thr.done) - mask |= EPOLLHUP|EPOLLERR; - return mask; -} - -static int thread_with_stdio_flush(struct file *file, fl_owner_t id) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - return thr->thr.ret; -} - -static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - if (thr->ops->unlocked_ioctl) - return thr->ops->unlocked_ioctl(thr, cmd, p); - return -ENOTTY; -} - -static const struct file_operations thread_with_stdio_fops = { - .read = thread_with_stdio_read, - .write = thread_with_stdio_write, - .poll = thread_with_stdio_poll, - .flush = thread_with_stdio_flush, - .release = thread_with_stdio_release, - .unlocked_ioctl = thread_with_stdio_ioctl, -}; - -static const struct file_operations thread_with_stdout_fops = { - .read = thread_with_stdio_read, - .poll = thread_with_stdout_poll, - .flush = thread_with_stdio_flush, - .release = thread_with_stdio_release, - .unlocked_ioctl = thread_with_stdio_ioctl, -}; - -static int thread_with_stdio_fn(void *arg) -{ - struct thread_with_stdio *thr = arg; - - thr->thr.ret = thr->ops->fn(thr); - - thread_with_stdio_done(thr); - return 0; -} - -void bch2_thread_with_stdio_init(struct thread_with_stdio *thr, - const struct thread_with_stdio_ops *ops) -{ - stdio_buf_init(&thr->stdio.input); - stdio_buf_init(&thr->stdio.output); - thr->ops = ops; -} - -int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr) -{ - return 
bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); -} - -int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, - const struct thread_with_stdio_ops *ops) -{ - bch2_thread_with_stdio_init(thr, ops); - - return __bch2_run_thread_with_stdio(thr); -} - -int bch2_run_thread_with_stdout(struct thread_with_stdio *thr, - const struct thread_with_stdio_ops *ops) -{ - stdio_buf_init(&thr->stdio.input); - stdio_buf_init(&thr->stdio.output); - thr->ops = ops; - - return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn); -} -EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout); - -int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) -{ - struct stdio_buf *buf = &stdio->input; - - /* - * we're waiting on user input (or for the file descriptor to be - * closed), don't want a hung task warning: - */ - do { - wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), - sysctl_hung_task_timeout_secs * HZ / 2); - } while (!stdio_redirect_has_input(stdio)); - - if (stdio->done) - return -1; - - spin_lock(&buf->lock); - int ret = min(len, buf->buf.nr); - buf->buf.nr -= ret; - memcpy(ubuf, buf->buf.data, ret); - memmove(buf->buf.data, - buf->buf.data + ret, - buf->buf.nr); - spin_unlock(&buf->lock); - - wake_up(&buf->wait); - return ret; -} - -int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio, - darray_char *line, - unsigned long timeout) -{ - unsigned long until = jiffies + timeout, t; - struct stdio_buf *buf = &stdio->input; - size_t seen = 0; -again: - t = timeout != MAX_SCHEDULE_TIMEOUT - ? max_t(long, until - jiffies, 0) - : timeout; - - t = min(t, sysctl_hung_task_timeout_secs * HZ / 2); - - wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t); - - if (stdio->done) - return -1; - - spin_lock(&buf->lock); - seen = buf->buf.nr; - char *n = memchr(buf->buf.data, '\n', seen); - - if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) { - spin_unlock(&buf->lock); - return -ETIME; - } - - if (!n) { - buf->waiting_for_line = true; - spin_unlock(&buf->lock); - goto again; - } - - size_t b = n + 1 - buf->buf.data; - if (b > line->size) { - spin_unlock(&buf->lock); - int ret = darray_resize(line, b); - if (ret) - return ret; - seen = 0; - goto again; - } - - buf->buf.nr -= b; - memcpy(line->data, buf->buf.data, b); - memmove(buf->buf.data, - buf->buf.data + b, - buf->buf.nr); - line->nr = b; - - buf->waiting_for_line = false; - spin_unlock(&buf->lock); - - wake_up(&buf->wait); - return 0; -} - -int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line) -{ - return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT); -} - -__printf(3, 0) -static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) -{ - ssize_t ret; - - do { - va_list args2; - size_t len; - - va_copy(args2, args); - len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); - va_end(args2); - - if (len + 1 <= darray_room(*out)) { - out->nr += len; - return len; - } - - ret = darray_make_room_gfp(out, len + 1, gfp); - } while (ret == 0); - - return ret; -} - -ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, - const char *fmt, va_list args) -{ - struct stdio_buf *buf = &stdio->output; - unsigned long flags; - ssize_t ret; -again: - if (stdio->done) - return -EPIPE; - - spin_lock_irqsave(&buf->lock, flags); - ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, 
args); - spin_unlock_irqrestore(&buf->lock, flags); - - if (ret < 0) { - if (nonblocking) - return -EAGAIN; - - ret = wait_event_interruptible(buf->wait, - stdio_redirect_has_output_space(stdio)); - if (ret) - return ret; - goto again; - } - - wake_up(&buf->wait); - return ret; -} - -ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, - const char *fmt, ...) -{ - va_list args; - ssize_t ret; - - va_start(args, fmt); - ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args); - va_end(args); - - return ret; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h deleted file mode 100644 index 72497b921911..000000000000 --- a/fs/bcachefs/thread_with_file.h +++ /dev/null @@ -1,81 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_THREAD_WITH_FILE_H -#define _BCACHEFS_THREAD_WITH_FILE_H - -#include "thread_with_file_types.h" - -/* - * Thread with file: Run a kthread and connect it to a file descriptor, so that - * it can be interacted with via fd read/write methods and closing the file - * descriptor stops the kthread. - * - * We have two different APIs: - * - * thread_with_file, the low level version. - * You get to define the full file_operations, including your release function, - * which means that you must call bch2_thread_with_file_exit() from your - * .release method - * - * thread_with_stdio, the higher level version - * This implements full piping of input and output, including .poll. - * - * Notes on behaviour: - * - kthread shutdown behaves like writing or reading from a pipe that has been - * closed - * - Input and output buffers are 4096 bytes, although buffers may in some - * situations slightly exceed that limit so as to avoid chopping off a - * message in the middle in nonblocking mode. - * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations - - * should be fine but might change in future revisions. - * - Output buffer may grow past 4096 bytes to deal with messages that are - * bigger than 4096 bytes - * - Writing may be done blocking or nonblocking; in nonblocking mode, we only - * drop entire messages. 
- * - * To write, use stdio_redirect_printf() - * To read, use stdio_redirect_read() or stdio_redirect_readline() - */ - -struct task_struct; - -struct thread_with_file { - struct task_struct *task; - int ret; - bool done; -}; - -void bch2_thread_with_file_exit(struct thread_with_file *); -int bch2_run_thread_with_file(struct thread_with_file *, - const struct file_operations *, - int (*fn)(void *)); - -struct thread_with_stdio; - -struct thread_with_stdio_ops { - void (*exit)(struct thread_with_stdio *); - int (*fn)(struct thread_with_stdio *); - long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long); -}; - -struct thread_with_stdio { - struct thread_with_file thr; - struct stdio_redirect stdio; - const struct thread_with_stdio_ops *ops; -}; - -void bch2_thread_with_stdio_init(struct thread_with_stdio *, - const struct thread_with_stdio_ops *); -int __bch2_run_thread_with_stdio(struct thread_with_stdio *); -int bch2_run_thread_with_stdio(struct thread_with_stdio *, - const struct thread_with_stdio_ops *); -int bch2_run_thread_with_stdout(struct thread_with_stdio *, - const struct thread_with_stdio_ops *); -int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); - -int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long); -int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *); - -__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); -__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); - -#endif /* _BCACHEFS_THREAD_WITH_FILE_H */ diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h deleted file mode 100644 index f4d484d44f63..000000000000 --- a/fs/bcachefs/thread_with_file_types.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H -#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H - -#include "darray.h" - -struct stdio_buf { - spinlock_t lock; - wait_queue_head_t wait; - darray_char buf; - bool waiting_for_line; -}; - -struct stdio_redirect { - struct stdio_buf input; - struct stdio_buf output; - bool done; -}; - -#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */ diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c deleted file mode 100644 index 2c34fe4be912..000000000000 --- a/fs/bcachefs/time_stats.c +++ /dev/null @@ -1,191 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/jiffies.h> -#include <linux/module.h> -#include <linux/percpu.h> -#include <linux/preempt.h> -#include <linux/time.h> -#include <linux/spinlock.h> - -#include "eytzinger.h" -#include "time_stats.h" - -/* disable automatic switching to percpu mode */ -#define TIME_STATS_NONPCPU ((unsigned long) 1) - -static const struct time_unit time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "s", NSEC_PER_SEC }, - { "m", (u64) NSEC_PER_SEC * 60}, - { "h", (u64) NSEC_PER_SEC * 3600}, - { "d", (u64) NSEC_PER_SEC * 3600 * 24}, - { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7}, - { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */ - { "eon", U64_MAX }, -}; - -const struct time_unit *bch2_pick_time_units(u64 ns) -{ - const struct time_unit *u; - - for (u = time_units; - u + 1 < time_units + ARRAY_SIZE(time_units) && - ns >= u[1].nsecs << 1; - u++) - ; - - return u; -} - -static void quantiles_update(struct quantiles *q, u64 v) -{ - unsigned i = 0; - - 
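The two deleted headers above define the complete thread_with_file/thread_with_stdio interface. For orientation, a minimal consumer of the higher-level API would have looked roughly like the sketch below; struct my_cmd and the my_cmd_* names are illustrative, not taken from the patch:

	struct my_cmd {
		struct thread_with_stdio	thr;
	};

	static void my_cmd_exit(struct thread_with_stdio *thr)
	{
		/* called from .release, after the kthread has been stopped */
		kfree(container_of(thr, struct my_cmd, thr));
	}

	static int my_cmd_fn(struct thread_with_stdio *thr)
	{
		/* runs in the kthread; I/O goes through the stdio redirect */
		bch2_stdio_redirect_printf(&thr->stdio, false, "hello\n");
		return 0;
	}

	static const struct thread_with_stdio_ops my_cmd_ops = {
		.exit	= my_cmd_exit,
		.fn	= my_cmd_fn,
	};

	/* in the caller: returns an fd; closing the fd stops the kthread */
	return bch2_run_thread_with_stdio(&cmd->thr, &my_cmd_ops);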
while (i < ARRAY_SIZE(q->entries)) { - struct quantile_entry *e = q->entries + i; - - if (unlikely(!e->step)) { - e->m = v; - e->step = max_t(unsigned, v / 2, 1024); - } else if (e->m > v) { - e->m = e->m >= e->step - ? e->m - e->step - : 0; - } else if (e->m < v) { - e->m = e->m + e->step > e->m - ? e->m + e->step - : U32_MAX; - } - - if ((e->m > v ? e->m - v : v - e->m) < e->step) - e->step = max_t(unsigned, e->step / 2, 1); - - if (v >= e->m) - break; - - i = eytzinger0_child(i, v > e->m); - } -} - -static inline void time_stats_update_one(struct bch2_time_stats *stats, - u64 start, u64 end) -{ - u64 duration, freq; - bool initted = stats->last_event != 0; - - if (time_after64(end, start)) { - struct quantiles *quantiles = time_stats_to_quantiles(stats); - - duration = end - start; - mean_and_variance_update(&stats->duration_stats, duration); - mean_and_variance_weighted_update(&stats->duration_stats_weighted, - duration, initted, TIME_STATS_MV_WEIGHT); - stats->max_duration = max(stats->max_duration, duration); - stats->min_duration = min(stats->min_duration, duration); - stats->total_duration += duration; - - if (quantiles) - quantiles_update(quantiles, duration); - } - - if (stats->last_event && time_after64(end, stats->last_event)) { - freq = end - stats->last_event; - mean_and_variance_update(&stats->freq_stats, freq); - mean_and_variance_weighted_update(&stats->freq_stats_weighted, - freq, initted, TIME_STATS_MV_WEIGHT); - stats->max_freq = max(stats->max_freq, freq); - stats->min_freq = min(stats->min_freq, freq); - } - - stats->last_event = end; -} - -void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct time_stat_buffer *b) -{ - for (struct time_stat_buffer_entry *i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - time_stats_update_one(stats, i->start, i->end); - b->nr = 0; -} - -static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats, - struct time_stat_buffer *b) -{ - unsigned long flags; - - spin_lock_irqsave(&stats->lock, flags); - __bch2_time_stats_clear_buffer(stats, b); - spin_unlock_irqrestore(&stats->lock, flags); -} - -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) -{ - unsigned long flags; - - if ((unsigned long) stats->buffer <= TIME_STATS_NONPCPU) { - spin_lock_irqsave(&stats->lock, flags); - time_stats_update_one(stats, start, end); - - if (!stats->buffer && - mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && - stats->duration_stats.n > 1024) - stats->buffer = - alloc_percpu_gfp(struct time_stat_buffer, - GFP_ATOMIC); - spin_unlock_irqrestore(&stats->lock, flags); - } else { - struct time_stat_buffer *b; - - preempt_disable(); - b = this_cpu_ptr(stats->buffer); - - BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); - b->entries[b->nr++] = (struct time_stat_buffer_entry) { - .start = start, - .end = end - }; - - if (unlikely(b->nr == ARRAY_SIZE(b->entries))) - time_stats_clear_buffer(stats, b); - preempt_enable(); - } -} - -void bch2_time_stats_reset(struct bch2_time_stats *stats) -{ - spin_lock_irq(&stats->lock); - unsigned offset = offsetof(struct bch2_time_stats, min_duration); - memset((void *) stats + offset, 0, sizeof(*stats) - offset); - - if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) { - int cpu; - for_each_possible_cpu(cpu) - per_cpu_ptr(stats->buffer, cpu)->nr = 0; - } - spin_unlock_irq(&stats->lock); -} - -void bch2_time_stats_exit(struct bch2_time_stats *stats) -{ - if ((unsigned long) stats->buffer > 
TIME_STATS_NONPCPU) - free_percpu(stats->buffer); - stats->buffer = NULL; -} - -void bch2_time_stats_init(struct bch2_time_stats *stats) -{ - memset(stats, 0, sizeof(*stats)); - stats->min_duration = U64_MAX; - stats->min_freq = U64_MAX; - spin_lock_init(&stats->lock); -} - -void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats) -{ - bch2_time_stats_init(stats); - stats->buffer = (struct time_stat_buffer __percpu *) TIME_STATS_NONPCPU; -} diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h deleted file mode 100644 index eddb0985bab4..000000000000 --- a/fs/bcachefs/time_stats.h +++ /dev/null @@ -1,161 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * bch2_time_stats - collect statistics on events that have a duration, with nicely - * formatted textual output on demand - * - * - percpu buffering of event collection: cheap enough to shotgun - * everywhere without worrying about overhead - * - * tracks: - * - number of events - * - maximum event duration ever seen - * - sum of all event durations - * - average event duration, standard and weighted - * - standard deviation of event durations, standard and weighted - * and analogous statistics for the frequency of events - * - * We provide both mean and weighted mean (exponentially weighted), and standard - * deviation and weighted standard deviation, to give an efficient-to-compute - * view of current behaviour versus average behaviour - "did this event source - * just become wonky, or is this typical?". - * - * Particularly useful for tracking down latency issues. - */ -#ifndef _BCACHEFS_TIME_STATS_H -#define _BCACHEFS_TIME_STATS_H - -#include <linux/sched/clock.h> -#include <linux/spinlock_types.h> -#include <linux/string.h> - -#include "mean_and_variance.h" - -struct time_unit { - const char *name; - u64 nsecs; -}; - -/* - * given a nanosecond value, pick the preferred time units for printing: - */ -const struct time_unit *bch2_pick_time_units(u64 ns); - -/* - * quantiles - do not use: - * - * Only enabled if bch2_time_stats->have_quantiles has been manually set - don't - * use in new code. - */ - -#define NR_QUANTILES 15 -#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) - -struct quantiles { - struct quantile_entry { - u64 m; - u64 step; - } entries[NR_QUANTILES]; -}; - -struct time_stat_buffer { - unsigned nr; - struct time_stat_buffer_entry { - u64 start; - u64 end; - } entries[31]; }; - -struct bch2_time_stats { - spinlock_t lock; - bool have_quantiles; - struct time_stat_buffer __percpu *buffer; - /* all fields are in nanoseconds */ - u64 min_duration; - u64 max_duration; - u64 total_duration; - u64 max_freq; - u64 min_freq; - u64 last_event; - u64 last_event_start; - - struct mean_and_variance duration_stats; - struct mean_and_variance freq_stats; - -/* default weight for weighted mean and variance calculations */ -#define TIME_STATS_MV_WEIGHT 8 - - struct mean_and_variance_weighted duration_stats_weighted; - struct mean_and_variance_weighted freq_stats_weighted; -}; - -struct bch2_time_stats_quantiles { - struct bch2_time_stats stats; - struct quantiles quantiles; -}; - -static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats) -{ - return stats->have_quantiles - ? 
&container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles - : NULL; -} - -void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *); -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); - -/** - * bch2_time_stats_update - collect a new event being tracked - * - * @stats - bch2_time_stats to update - * @start - start time of event, recorded with local_clock() - * - * The end time of the event is taken to be the current time - */ -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) -{ - __bch2_time_stats_update(stats, start, local_clock()); -} - -/** - * track_event_change - track state change events - * - * @stats - bch2_time_stats to update - * @v - new state, true or false - * - * Use this when tracking time stats for state changes, e.g. resource X becoming - * blocked/unblocked. - */ -static inline bool track_event_change(struct bch2_time_stats *stats, bool v) -{ - if (v != !!stats->last_event_start) { - if (!v) { - bch2_time_stats_update(stats, stats->last_event_start); - stats->last_event_start = 0; - } else { - stats->last_event_start = local_clock() ?: 1; - return true; - } - } - - return false; -} - -void bch2_time_stats_reset(struct bch2_time_stats *); -void bch2_time_stats_exit(struct bch2_time_stats *); -void bch2_time_stats_init(struct bch2_time_stats *); -void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *); - -static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq) -{ - bch2_time_stats_exit(&statq->stats); -} -static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq) -{ - bch2_time_stats_init(&statq->stats); - statq->stats.have_quantiles = true; - memset(&statq->quantiles, 0, sizeof(statq->quantiles)); -} - -#endif /* _BCACHEFS_TIME_STATS_H */ diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c deleted file mode 100644 index dfad1d06633d..000000000000 --- a/fs/bcachefs/trace.c +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "alloc_types.h" -#include "buckets.h" -#include "btree_cache.h" -#include "btree_iter.h" -#include "btree_key_cache.h" -#include "btree_locking.h" -#include "btree_update_interior.h" -#include "keylist.h" -#include "move_types.h" -#include "opts.h" -#include "six.h" - -#include <linux/blktrace_api.h> - -#define CREATE_TRACE_POINTS -#include "trace.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h deleted file mode 100644 index 9c5a9c551f03..000000000000 --- a/fs/bcachefs/trace.h +++ /dev/null @@ -1,1883 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM bcachefs - -#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ) - -#include <linux/tracepoint.h> - -#define TRACE_BPOS_entries(name) \ - __field(u64, name##_inode ) \ - __field(u64, name##_offset ) \ - __field(u32, name##_snapshot ) - -#define TRACE_BPOS_assign(dst, src) \ - __entry->dst##_inode = (src).inode; \ - __entry->dst##_offset = (src).offset; \ - __entry->dst##_snapshot = (src).snapshot - -DECLARE_EVENT_CLASS(bpos, - TP_PROTO(const struct bpos *p), - TP_ARGS(p), - - TP_STRUCT__entry( - TRACE_BPOS_entries(p) - ), - - TP_fast_assign( - TRACE_BPOS_assign(p, *p); - ), - - TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot) -); - -DECLARE_EVENT_CLASS(fs_str, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __string(str, str ) - ), 
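The time_stats interface deleted above paired local_clock() timestamps with bch2_time_stats_update(); a minimal sketch of the intended usage, with an illustrative caller-owned stats object, assuming the API exactly as declared in the removed header:

	struct bch2_time_stats stats;

	bch2_time_stats_init(&stats);

	u64 start = local_clock();
	/* ... the event being timed ... */
	bch2_time_stats_update(&stats, start);	/* end time is local_clock() at the call */

	bch2_time_stats_exit(&stats);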
- - TP_fast_assign( - __entry->dev = c->dev; - __assign_str(str); - ), - - TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) -); - -DECLARE_EVENT_CLASS(trans_str, - TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str), - TP_ARGS(trans, caller_ip, str), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __string(str, str ) - ), - - TP_fast_assign( - __entry->dev = trans->c->dev; - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __assign_str(str); - ), - - TP_printk("%d,%d %s %pS %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str)) -); - -DECLARE_EVENT_CLASS(trans_str_nocaller, - TP_PROTO(struct btree_trans *trans, const char *str), - TP_ARGS(trans, str), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, trans_fn, 32 ) - __string(str, str ) - ), - - TP_fast_assign( - __entry->dev = trans->c->dev; - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __assign_str(str); - ), - - TP_printk("%d,%d %s %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->trans_fn, __get_str(str)) -); - -DECLARE_EVENT_CLASS(btree_node_nofs, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u8, level ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->level = b->c.level; - __entry->btree_id = b->c.btree_id; - TRACE_BPOS_assign(pos, b->key.k.p); - ), - - TP_printk("%d,%d %u %s %llu:%llu:%u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->level, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) -); - -DECLARE_EVENT_CLASS(btree_node, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, trans_fn, 32 ) - __field(u8, level ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->dev = trans->c->dev; - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->level = b->c.level; - __entry->btree_id = b->c.btree_id; - TRACE_BPOS_assign(pos, b->key.k.p); - ), - - TP_printk("%d,%d %s %u %s %llu:%llu:%u", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn, - __entry->level, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) -); - -DECLARE_EVENT_CLASS(bch_fs, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c), - - TP_STRUCT__entry( - __field(dev_t, dev ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - ), - - TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) -); - -DECLARE_EVENT_CLASS(btree_trans, - TP_PROTO(struct btree_trans *trans), - TP_ARGS(trans), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, trans_fn, 32 ) - ), - - TP_fast_assign( - __entry->dev = trans->c->dev; - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - ), - - TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn) -); - -DECLARE_EVENT_CLASS(bio, - TP_PROTO(struct bio *bio), - TP_ARGS(bio), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(sector_t, sector ) - __field(unsigned int, nr_sector ) - __array(char, rwbs, 6 ) - ), - - TP_fast_assign( - __entry->dev = bio->bi_bdev ? 
bio_dev(bio) : 0; - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf); - ), - - TP_printk("%d,%d %s %llu + %u", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - (unsigned long long)__entry->sector, __entry->nr_sector) -); - -/* errors */ - -TRACE_EVENT(error_throw, - TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip), - TP_ARGS(c, bch_err, ip), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(int, err ) - __array(char, err_str, 32 ) - __array(char, ip, 32 ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->err = bch_err; - strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str)); - snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); - ), - - TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ip, __entry->err_str) -); - -TRACE_EVENT(error_downcast, - TP_PROTO(int bch_err, int std_err, unsigned long ip), - TP_ARGS(bch_err, std_err, ip), - - TP_STRUCT__entry( - __array(char, bch_err, 32 ) - __array(char, std_err, 32 ) - __array(char, ip, 32 ) - ), - - TP_fast_assign( - strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); - strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); - snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); - ), - - TP_printk("%s ret %s -> %s %s", __entry->ip, - __entry->bch_err, __entry->std_err, __entry->ip) -); - -/* disk_accounting.c */ - -TRACE_EVENT(accounting_mem_insert, - TP_PROTO(struct bch_fs *c, const char *acc), - TP_ARGS(c, acc), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(unsigned, new_nr ) - __string(acc, acc ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->new_nr = c->accounting.k.nr; - __assign_str(acc); - ), - - TP_printk("%d,%d entries %u added %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->new_nr, - __get_str(acc)) -); - -/* fs.c: */ -TRACE_EVENT(bch2_sync_fs, - TP_PROTO(struct super_block *sb, int wait), - - TP_ARGS(sb, wait), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( int, wait ) - - ), - - TP_fast_assign( - __entry->dev = sb->s_dev; - __entry->wait = wait; - ), - - TP_printk("dev %d,%d wait %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->wait) -); - -/* fs-io.c: */ -TRACE_EVENT(bch2_fsync, - TP_PROTO(struct file *file, int datasync), - - TP_ARGS(file, datasync), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ino_t, parent ) - __field( int, datasync ) - ), - - TP_fast_assign( - struct dentry *dentry = file->f_path.dentry; - - __entry->dev = dentry->d_sb->s_dev; - __entry->ino = d_inode(dentry)->i_ino; - __entry->parent = d_inode(dentry->d_parent)->i_ino; - __entry->datasync = datasync; - ), - - TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned long) __entry->parent, __entry->datasync) -); - -/* super-io.c: */ -TRACE_EVENT(write_super, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(unsigned long, ip ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->ip = ip; - ), - - TP_printk("%d,%d for %pS", - MAJOR(__entry->dev), MINOR(__entry->dev), - (void *) __entry->ip) -); - -/* io.c: */ - -DEFINE_EVENT(bio, io_read_promote, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -TRACE_EVENT(io_read_nopromote, - TP_PROTO(struct bch_fs *c, int ret), - 
TP_ARGS(c, ret), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, ret, 32 ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); - ), - - TP_printk("%d,%d ret %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ret) -); - -DEFINE_EVENT(bio, io_read_bounce, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -DEFINE_EVENT(bio, io_read_split, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -DEFINE_EVENT(bio, io_read_retry, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -DEFINE_EVENT(bio, io_read_reuse_race, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -DEFINE_EVENT(bio, io_read_fail_and_poison, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -/* ec.c */ - -TRACE_EVENT(stripe_create, - TP_PROTO(struct bch_fs *c, u64 idx, int ret), - TP_ARGS(c, idx, ret), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, idx ) - __field(int, ret ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->idx = idx; - __entry->ret = ret; - ), - - TP_printk("%d,%d idx %llu ret %i", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->idx, - __entry->ret) -); - -/* Journal */ - -DEFINE_EVENT(bch_fs, journal_full, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(fs_str, journal_entry_full, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, journal_entry_close, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(bio, journal_write, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) -); - -TRACE_EVENT(journal_reclaim_start, - TP_PROTO(struct bch_fs *c, bool direct, bool kicked, - u64 min_nr, u64 min_key_cache, - u64 btree_cache_dirty, u64 btree_cache_total, - u64 btree_key_cache_dirty, u64 btree_key_cache_total), - TP_ARGS(c, direct, kicked, min_nr, min_key_cache, - btree_cache_dirty, btree_cache_total, - btree_key_cache_dirty, btree_key_cache_total), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(bool, direct ) - __field(bool, kicked ) - __field(u64, min_nr ) - __field(u64, min_key_cache ) - __field(u64, btree_cache_dirty ) - __field(u64, btree_cache_total ) - __field(u64, btree_key_cache_dirty ) - __field(u64, btree_key_cache_total ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->direct = direct; - __entry->kicked = kicked; - __entry->min_nr = min_nr; - __entry->min_key_cache = min_key_cache; - __entry->btree_cache_dirty = btree_cache_dirty; - __entry->btree_cache_total = btree_cache_total; - __entry->btree_key_cache_dirty = btree_key_cache_dirty; - __entry->btree_key_cache_total = btree_key_cache_total; - ), - - TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->direct, - __entry->kicked, - __entry->min_nr, - __entry->min_key_cache, - __entry->btree_cache_dirty, - __entry->btree_cache_total, - __entry->btree_key_cache_dirty, - __entry->btree_key_cache_total) -); - -TRACE_EVENT(journal_reclaim_finish, - TP_PROTO(struct bch_fs *c, u64 nr_flushed), - TP_ARGS(c, nr_flushed), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, nr_flushed ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->nr_flushed = nr_flushed; - ), - - TP_printk("%d,%d flushed %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->nr_flushed) -); - -/* bset.c: */ - -DEFINE_EVENT(bpos, bkey_pack_pos_fail, - TP_PROTO(const struct bpos *p), - TP_ARGS(p) -); - -/* Btree cache: */ - -TRACE_EVENT(btree_cache_scan, 
- TP_PROTO(long nr_to_scan, long can_free, long ret), - TP_ARGS(nr_to_scan, can_free, ret), - - TP_STRUCT__entry( - __field(long, nr_to_scan ) - __field(long, can_free ) - __field(long, ret ) - ), - - TP_fast_assign( - __entry->nr_to_scan = nr_to_scan; - __entry->can_free = can_free; - __entry->ret = ret; - ), - - TP_printk("scanned for %li nodes, can free %li, ret %li", - __entry->nr_to_scan, __entry->can_free, __entry->ret) -); - -DEFINE_EVENT(btree_node_nofs, btree_cache_reap, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) -); - -DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail, - TP_PROTO(struct btree_trans *trans), - TP_ARGS(trans) -); - -DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock, - TP_PROTO(struct btree_trans *trans), - TP_ARGS(trans) -); - -DEFINE_EVENT(btree_trans, btree_cache_cannibalize, - TP_PROTO(struct btree_trans *trans), - TP_ARGS(trans) -); - -DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock, - TP_PROTO(struct btree_trans *trans), - TP_ARGS(trans) -); - -/* Btree */ - -DEFINE_EVENT(btree_node, btree_node_read, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -TRACE_EVENT(btree_node_write, - TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), - TP_ARGS(b, bytes, sectors), - - TP_STRUCT__entry( - __field(enum btree_node_type, type) - __field(unsigned, bytes ) - __field(unsigned, sectors ) - ), - - TP_fast_assign( - __entry->type = btree_node_type(b); - __entry->bytes = bytes; - __entry->sectors = sectors; - ), - - TP_printk("bkey type %u bytes %u sectors %u", - __entry->type , __entry->bytes, __entry->sectors) -); - -DEFINE_EVENT(btree_node, btree_node_alloc, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -DEFINE_EVENT(btree_node, btree_node_free, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -TRACE_EVENT(btree_reserve_get_fail, - TP_PROTO(const char *trans_fn, - unsigned long caller_ip, - size_t required, - int ret), - TP_ARGS(trans_fn, caller_ip, required, ret), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(size_t, required ) - __array(char, ret, 32 ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->required = required; - strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); - ), - - TP_printk("%s %pS required %zu ret %s", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->required, - __entry->ret) -); - -DEFINE_EVENT(btree_node, btree_node_compact, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -DEFINE_EVENT(btree_node, btree_node_merge, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -DEFINE_EVENT(btree_node, btree_node_split, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -DEFINE_EVENT(btree_node, btree_node_rewrite, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -DEFINE_EVENT(btree_node, btree_node_set_root, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) -); - -TRACE_EVENT(btree_path_relock_fail, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path, - unsigned level), - TP_ARGS(trans, caller_ip, path, level), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u8, level ) - 
__field(u8, path_idx) - TRACE_BPOS_entries(pos) - __array(char, node, 24 ) - __field(u8, self_read_count ) - __field(u8, self_intent_count) - __field(u8, read_count ) - __field(u8, intent_count ) - __field(u32, iter_lock_seq ) - __field(u32, node_lock_seq ) - ), - - TP_fast_assign( - struct btree *b = btree_path_node(path, level); - struct six_lock_count c; - - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = path->btree_id; - __entry->level = level; - __entry->path_idx = path - trans->paths; - TRACE_BPOS_assign(pos, path->pos); - - c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level); - __entry->self_read_count = c.n[SIX_LOCK_read]; - __entry->self_intent_count = c.n[SIX_LOCK_intent]; - - if (IS_ERR(b)) { - strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); - } else { - c = six_lock_counts(&path->l[level].b->c.lock); - __entry->read_count = c.n[SIX_LOCK_read]; - __entry->intent_count = c.n[SIX_LOCK_intent]; - scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c); - } - __entry->iter_lock_seq = path->l[level].lock_seq; - __entry->node_lock_seq = is_btree_node(path, level) - ? six_lock_seq(&path->l[level].b->c.lock) - : 0; - ), - - TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->path_idx, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->level, - __entry->node, - __entry->self_read_count, - __entry->self_intent_count, - __entry->read_count, - __entry->intent_count, - __entry->iter_lock_seq, - __entry->node_lock_seq) -); - -TRACE_EVENT(btree_path_upgrade_fail, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path, - unsigned level), - TP_ARGS(trans, caller_ip, path, level), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u8, level ) - __field(u8, path_idx) - TRACE_BPOS_entries(pos) - __field(u8, locked ) - __field(u8, self_read_count ) - __field(u8, self_intent_count) - __field(u8, read_count ) - __field(u8, intent_count ) - __field(u32, iter_lock_seq ) - __field(u32, node_lock_seq ) - ), - - TP_fast_assign( - struct six_lock_count c; - - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = path->btree_id; - __entry->level = level; - __entry->path_idx = path - trans->paths; - TRACE_BPOS_assign(pos, path->pos); - __entry->locked = btree_node_locked(path, level); - - c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level); - __entry->self_read_count = c.n[SIX_LOCK_read]; - __entry->self_intent_count = c.n[SIX_LOCK_intent]; - c = six_lock_counts(&path->l[level].b->c.lock); - __entry->read_count = c.n[SIX_LOCK_read]; - __entry->intent_count = c.n[SIX_LOCK_intent]; - __entry->iter_lock_seq = path->l[level].lock_seq; - __entry->node_lock_seq = is_btree_node(path, level) - ? 
six_lock_seq(&path->l[level].b->c.lock) - : 0; - ), - - TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->path_idx, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->level, - __entry->locked, - __entry->self_read_count, - __entry->self_intent_count, - __entry->read_count, - __entry->intent_count, - __entry->iter_lock_seq, - __entry->node_lock_seq) -); - -/* Garbage collection */ - -DEFINE_EVENT(bch_fs, gc_gens_start, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, gc_gens_end, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -/* Allocator */ - -DEFINE_EVENT(fs_str, bucket_alloc, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, bucket_alloc_fail, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DECLARE_EVENT_CLASS(discard_buckets_class, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, const char *err), - TP_ARGS(c, seen, open, need_journal_commit, discarded, err), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, seen ) - __field(u64, open ) - __field(u64, need_journal_commit ) - __field(u64, discarded ) - __array(char, err, 16 ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->seen = seen; - __entry->open = open; - __entry->need_journal_commit = need_journal_commit; - __entry->discarded = discarded; - strscpy(__entry->err, err, sizeof(__entry->err)); - ), - - TP_printk("%d,%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->seen, - __entry->open, - __entry->need_journal_commit, - __entry->discarded, - __entry->err) -); - -DEFINE_EVENT(discard_buckets_class, discard_buckets, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, const char *err), - TP_ARGS(c, seen, open, need_journal_commit, discarded, err) -); - -DEFINE_EVENT(discard_buckets_class, discard_buckets_fast, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, const char *err), - TP_ARGS(c, seen, open, need_journal_commit, discarded, err) -); - -TRACE_EVENT(bucket_invalidate, - TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), - TP_ARGS(c, dev, bucket, sectors), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u32, dev_idx ) - __field(u32, sectors ) - __field(u64, bucket ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->dev_idx = dev; - __entry->sectors = sectors; - __entry->bucket = bucket; - ), - - TP_printk("%d:%d invalidated %u:%llu cached sectors %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->dev_idx, __entry->bucket, - __entry->sectors) -); - -/* Moving IO */ - -DEFINE_EVENT(fs_str, io_move, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_read, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_write, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_finish, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_fail, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_write_fail, - TP_PROTO(struct bch_fs *c, const char *str), 
- TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_start_fail, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -TRACE_EVENT(move_data, - TP_PROTO(struct bch_fs *c, - struct bch_move_stats *stats), - TP_ARGS(c, stats), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, keys_moved ) - __field(u64, keys_raced ) - __field(u64, sectors_seen ) - __field(u64, sectors_moved ) - __field(u64, sectors_raced ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->keys_moved = atomic64_read(&stats->keys_moved); - __entry->keys_raced = atomic64_read(&stats->keys_raced); - __entry->sectors_seen = atomic64_read(&stats->sectors_seen); - __entry->sectors_moved = atomic64_read(&stats->sectors_moved); - __entry->sectors_raced = atomic64_read(&stats->sectors_raced); - ), - - TP_printk("%d,%d keys moved %llu raced %llu " - "sectors seen %llu moved %llu raced %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->keys_moved, - __entry->keys_raced, - __entry->sectors_seen, - __entry->sectors_moved, - __entry->sectors_raced) -); - -TRACE_EVENT(copygc, - TP_PROTO(struct bch_fs *c, - u64 buckets, - u64 sectors_seen, - u64 sectors_moved), - TP_ARGS(c, buckets, sectors_seen, sectors_moved), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, buckets ) - __field(u64, sectors_seen ) - __field(u64, sectors_moved ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->buckets = buckets; - __entry->sectors_seen = sectors_seen; - __entry->sectors_moved = sectors_moved; - ), - - TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->buckets, - __entry->sectors_seen, - __entry->sectors_moved) -); - -TRACE_EVENT(copygc_wait, - TP_PROTO(struct bch_fs *c, - u64 wait_amount, u64 until), - TP_ARGS(c, wait_amount, until), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, wait_amount ) - __field(u64, until ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->wait_amount = wait_amount; - __entry->until = until; - ), - - TP_printk("%d,%d waiting for %llu sectors until %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->wait_amount, __entry->until) -); - -/* btree transactions: */ - -DECLARE_EVENT_CLASS(transaction_event, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - ), - - TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) -); - -DEFINE_EVENT(transaction_event, transaction_commit, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - -DEFINE_EVENT(transaction_event, trans_restart_injected, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - -TRACE_EVENT(trans_restart_split_race, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree *b), - TP_ARGS(trans, caller_ip, b), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, level ) - __field(u16, written ) - __field(u16, blocks ) - __field(u16, u64s_remaining ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->level = b->c.level; - __entry->written = b->written; - __entry->blocks = 
btree_blocks(trans->c); - __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b); - ), - - TP_printk("%s %pS l=%u written %u/%u u64s remaining %u", - __entry->trans_fn, (void *) __entry->caller_ip, - __entry->level, - __entry->written, __entry->blocks, - __entry->u64s_remaining) -); - -TRACE_EVENT(trans_blocked_journal_reclaim, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - - __field(unsigned long, key_cache_nr_keys ) - __field(unsigned long, key_cache_nr_dirty ) - __field(long, must_wait ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->key_cache_nr_keys = atomic_long_read(&trans->c->btree_key_cache.nr_keys); - __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty); - __entry->must_wait = __bch2_btree_key_cache_must_wait(trans->c); - ), - - TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li", - __entry->trans_fn, (void *) __entry->caller_ip, - __entry->key_cache_nr_keys, - __entry->key_cache_nr_dirty, - __entry->must_wait) -); - -#if 0 -/* todo: bring back dynamic fault injection */ -DEFINE_EVENT(transaction_event, trans_restart_fault_inject, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); -#endif - -DEFINE_EVENT(transaction_event, trans_traverse_all, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - -DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - -DEFINE_EVENT(trans_str, trans_restart_too_many_iters, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - const char *paths), - TP_ARGS(trans, caller_ip, paths) -); - -DECLARE_EVENT_CLASS(transaction_restart_iter, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(pos, path->pos) - ), - - TP_printk("%s %pS btree %s pos %llu:%llu:%u", - __entry->trans_fn, - (void *) __entry->caller_ip, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(fs_str, trans_restart_upgrade, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(trans_str, trans_restart_relock, - TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str), - TP_ARGS(trans, caller_ip, str) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - 
TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock, - TP_PROTO(struct btree_trans *trans, - const char *cycle), - TP_ARGS(trans, cycle) -); - -DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - -TRACE_EVENT(trans_restart_would_deadlock_write, - TP_PROTO(struct btree_trans *trans), - TP_ARGS(trans), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - ), - - TP_printk("%s", __entry->trans_fn) -); - -TRACE_EVENT(trans_restart_mem_realloced, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - unsigned long bytes), - TP_ARGS(trans, caller_ip, bytes), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(unsigned long, bytes ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->bytes = bytes; - ), - - TP_printk("%s %pS bytes %lu", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->bytes) -); - -DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - -TRACE_EVENT(path_downgrade, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path, - unsigned old_locks_want), - TP_ARGS(trans, caller_ip, path, old_locks_want), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(unsigned, old_locks_want ) - __field(unsigned, new_locks_want ) - __field(unsigned, btree ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->old_locks_want = old_locks_want; - __entry->new_locks_want = path->locks_want; - __entry->btree = path->btree_id; - TRACE_BPOS_assign(pos, path->pos); - ), - - TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->old_locks_want, - __entry->new_locks_want, - bch2_btree_id_str(__entry->btree), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot) -); - -TRACE_EVENT(key_cache_fill, - TP_PROTO(struct btree_trans *trans, const char *key), - TP_ARGS(trans, key), - - 
TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __string(key, key ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __assign_str(key); - ), - - TP_printk("%s %s", __entry->trans_fn, __get_str(key)) -); - -TRACE_EVENT(write_buffer_flush, - TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size), - TP_ARGS(trans, nr, skipped, fast, size), - - TP_STRUCT__entry( - __field(size_t, nr ) - __field(size_t, skipped ) - __field(size_t, fast ) - __field(size_t, size ) - ), - - TP_fast_assign( - __entry->nr = nr; - __entry->skipped = skipped; - __entry->fast = fast; - __entry->size = size; - ), - - TP_printk("%zu/%zu skipped %zu fast %zu", - __entry->nr, __entry->size, __entry->skipped, __entry->fast) -); - -TRACE_EVENT(write_buffer_flush_sync, - TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans, caller_ip), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - ), - - TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) -); - -TRACE_EVENT(write_buffer_flush_slowpath, - TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total), - TP_ARGS(trans, slowpath, total), - - TP_STRUCT__entry( - __field(size_t, slowpath ) - __field(size_t, total ) - ), - - TP_fast_assign( - __entry->slowpath = slowpath; - __entry->total = total; - ), - - TP_printk("%zu/%zu", __entry->slowpath, __entry->total) -); - -TRACE_EVENT(write_buffer_maybe_flush, - TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key), - TP_ARGS(trans, caller_ip, key), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __string(key, key ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __assign_str(key); - ), - - TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key)) -); - -DEFINE_EVENT(fs_str, rebalance_extent, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, data_update, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_pred, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_created_rebalance, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, io_move_evacuate_bucket, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, extent_trim_atomic, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, btree_iter_peek_slot, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, __btree_iter_peek, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, btree_iter_peek_max, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -DEFINE_EVENT(fs_str, btree_iter_peek_prev_min, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - -#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS - -TRACE_EVENT(update_by_path, - TP_PROTO(struct btree_trans *trans, struct btree_path *path, - struct btree_insert_entry *i, bool overwrite), - TP_ARGS(trans, path, i, overwrite), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(btree_path_idx_t, path_idx ) - __field(u8, btree_id ) - 
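/*
 * [Editor's note] TRACE_BPOS_entries()/TRACE_BPOS_assign() -- defined
 * earlier in this header, not shown in this hunk -- evidently expand
 * to the pos_inode, pos_offset and pos_snapshot fields that the
 * TP_printk() format strings below render as "%llu:%llu:%u", i.e. one
 * btree position per named prefix.
 */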
TRACE_BPOS_entries(pos) - __field(u8, overwrite ) - __field(btree_path_idx_t, update_idx ) - __field(btree_path_idx_t, nr_updates ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->path_idx = path - trans->paths; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(pos, path->pos); - __entry->overwrite = overwrite; - __entry->update_idx = i - trans->updates; - __entry->nr_updates = trans->nr_updates; - ), - - TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u", - __entry->trans_fn, - __entry->path_idx, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->overwrite, - __entry->update_idx, - __entry->nr_updates) -); - -TRACE_EVENT(btree_path_lock, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_bkey_cached_common *b), - TP_ARGS(trans, caller_ip, b), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u8, level ) - __array(char, node, 24 ) - __field(u32, lock_seq ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = b->btree_id; - __entry->level = b->level; - - scnprintf(__entry->node, sizeof(__entry->node), "%px", b); - __entry->lock_seq = six_lock_seq(&b->lock); - ), - - TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u", - __entry->trans_fn, - (void *) __entry->caller_ip, - bch2_btree_id_str(__entry->btree_id), - __entry->level, - __entry->node, - __entry->lock_seq) -); - -DECLARE_EVENT_CLASS(btree_path_ev, - TP_PROTO(struct btree_trans *trans, struct btree_path *path), - TP_ARGS(trans, path), - - TP_STRUCT__entry( - __field(u16, idx ) - __field(u8, ref ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->idx = path - trans->paths; - __entry->ref = path->ref; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(pos, path->pos); - ), - - TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u", - __entry->idx, __entry->ref, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot) -); - -DEFINE_EVENT(btree_path_ev, btree_path_get_ll, - TP_PROTO(struct btree_trans *trans, struct btree_path *path), - TP_ARGS(trans, path) -); - -DEFINE_EVENT(btree_path_ev, btree_path_put_ll, - TP_PROTO(struct btree_trans *trans, struct btree_path *path), - TP_ARGS(trans, path) -); - -DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked, - TP_PROTO(struct btree_trans *trans, struct btree_path *path), - TP_ARGS(trans, path) -); - -TRACE_EVENT(btree_path_alloc, - TP_PROTO(struct btree_trans *trans, struct btree_path *path), - TP_ARGS(trans, path), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - __field(u8, locks_want ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->idx = path - trans->paths; - __entry->locks_want = path->locks_want; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(pos, path->pos); - ), - - TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u", - __entry->idx, - bch2_btree_id_str(__entry->btree_id), - __entry->locks_want, - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot) -); - -TRACE_EVENT(btree_path_get, - TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos), - TP_ARGS(trans, path, new_pos), - - TP_STRUCT__entry( - 
__field(btree_path_idx_t, idx ) - __field(u8, ref ) - __field(u8, preserve ) - __field(u8, locks_want ) - __field(u8, btree_id ) - TRACE_BPOS_entries(old_pos) - TRACE_BPOS_entries(new_pos) - ), - - TP_fast_assign( - __entry->idx = path - trans->paths; - __entry->ref = path->ref; - __entry->preserve = path->preserve; - __entry->locks_want = path->locks_want; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(old_pos, path->pos); - TRACE_BPOS_assign(new_pos, *new_pos); - ), - - TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u", - __entry->idx, - __entry->ref, - __entry->preserve, - bch2_btree_id_str(__entry->btree_id), - __entry->locks_want, - __entry->old_pos_inode, - __entry->old_pos_offset, - __entry->old_pos_snapshot, - __entry->new_pos_inode, - __entry->new_pos_offset, - __entry->new_pos_snapshot) -); - -DECLARE_EVENT_CLASS(btree_path_clone, - TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), - TP_ARGS(trans, path, new), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - __field(u8, new_idx ) - __field(u8, btree_id ) - __field(u8, ref ) - __field(u8, preserve ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->idx = path - trans->paths; - __entry->new_idx = new - trans->paths; - __entry->btree_id = path->btree_id; - __entry->ref = path->ref; - __entry->preserve = path->preserve; - TRACE_BPOS_assign(pos, path->pos); - ), - - TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u", - __entry->idx, - __entry->ref, - __entry->preserve, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->new_idx) -); - -DEFINE_EVENT(btree_path_clone, btree_path_clone, - TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), - TP_ARGS(trans, path, new) -); - -DEFINE_EVENT(btree_path_clone, btree_path_save_pos, - TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), - TP_ARGS(trans, path, new) -); - -DECLARE_EVENT_CLASS(btree_path_traverse, - TP_PROTO(struct btree_trans *trans, - struct btree_path *path), - TP_ARGS(trans, path), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(btree_path_idx_t, idx ) - __field(u8, ref ) - __field(u8, preserve ) - __field(u8, should_be_locked ) - __field(u8, btree_id ) - __field(u8, level ) - TRACE_BPOS_entries(pos) - __field(u8, locks_want ) - __field(u8, nodes_locked ) - __array(char, node0, 24 ) - __array(char, node1, 24 ) - __array(char, node2, 24 ) - __array(char, node3, 24 ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - - __entry->idx = path - trans->paths; - __entry->ref = path->ref; - __entry->preserve = path->preserve; - __entry->btree_id = path->btree_id; - __entry->level = path->level; - TRACE_BPOS_assign(pos, path->pos); - - __entry->locks_want = path->locks_want; - __entry->nodes_locked = path->nodes_locked; - struct btree *b = path->l[0].b; - if (IS_ERR(b)) - strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c); - b = path->l[1].b; - if (IS_ERR(b)) - strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c); - b = path->l[2].b; - if (IS_ERR(b)) - strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node2, 
sizeof(__entry->node0), "%px", &b->c); - b = path->l[3].b; - if (IS_ERR(b)) - strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c); - ), - - TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n" - "locks %u %u %u %u node %s %s %s %s", - __entry->trans_fn, - __entry->idx, - __entry->ref, - __entry->preserve, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->level, - __entry->locks_want, - (__entry->nodes_locked >> 6) & 3, - (__entry->nodes_locked >> 4) & 3, - (__entry->nodes_locked >> 2) & 3, - (__entry->nodes_locked >> 0) & 3, - __entry->node3, - __entry->node2, - __entry->node1, - __entry->node0) -); - -DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start, - TP_PROTO(struct btree_trans *trans, - struct btree_path *path), - TP_ARGS(trans, path) -); - -DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end, - TP_PROTO(struct btree_trans *trans, struct btree_path *path), - TP_ARGS(trans, path) -); - -TRACE_EVENT(btree_path_set_pos, - TP_PROTO(struct btree_trans *trans, - struct btree_path *path, - struct bpos *new_pos), - TP_ARGS(trans, path, new_pos), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - __field(u8, ref ) - __field(u8, preserve ) - __field(u8, btree_id ) - TRACE_BPOS_entries(old_pos) - TRACE_BPOS_entries(new_pos) - __field(u8, locks_want ) - __field(u8, nodes_locked ) - __array(char, node0, 24 ) - __array(char, node1, 24 ) - __array(char, node2, 24 ) - __array(char, node3, 24 ) - ), - - TP_fast_assign( - __entry->idx = path - trans->paths; - __entry->ref = path->ref; - __entry->preserve = path->preserve; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(old_pos, path->pos); - TRACE_BPOS_assign(new_pos, *new_pos); - - __entry->nodes_locked = path->nodes_locked; - struct btree *b = path->l[0].b; - if (IS_ERR(b)) - strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c); - b = path->l[1].b; - if (IS_ERR(b)) - strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c); - b = path->l[2].b; - if (IS_ERR(b)) - strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c); - b = path->l[3].b; - if (IS_ERR(b)) - strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c); - ), - - TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n" - "locks %u %u %u %u node %s %s %s %s", - __entry->idx, - __entry->ref, - __entry->preserve, - bch2_btree_id_str(__entry->btree_id), - __entry->old_pos_inode, - __entry->old_pos_offset, - __entry->old_pos_snapshot, - __entry->new_pos_inode, - __entry->new_pos_offset, - __entry->new_pos_snapshot, - (__entry->nodes_locked >> 6) & 3, - (__entry->nodes_locked >> 4) & 3, - (__entry->nodes_locked >> 2) & 3, - (__entry->nodes_locked >> 0) & 3, - __entry->node3, - __entry->node2, - __entry->node1, - __entry->node0) -); - -TRACE_EVENT(btree_path_free, - TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup), - TP_ARGS(trans, path, dup), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - __field(u8, preserve ) - __field(u8, 
should_be_locked) - __field(s8, dup ) - __field(u8, dup_locked ) - ), - - TP_fast_assign( - __entry->idx = path; - __entry->preserve = trans->paths[path].preserve; - __entry->should_be_locked = trans->paths[path].should_be_locked; - __entry->dup = dup ? dup - trans->paths : -1; - __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0; - ), - - TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx, - __entry->preserve ? 'P' : ' ', - __entry->should_be_locked ? 'S' : ' ', - __entry->dup, - __entry->dup_locked) -); - -#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ -#ifndef _TRACE_BCACHEFS_H - -static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path, - struct btree_insert_entry *i, bool overwrite) {} -static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {} -static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} -static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {} -static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {} -static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} -static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {} - -#endif -#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ - -#define _TRACE_BCACHEFS_H -#endif /* _TRACE_BCACHEFS_H */ - -/* This part must be outside protection */ -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../fs/bcachefs - -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -#include <trace/define_trace.h> diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c deleted file mode 100644 index 9764c2e6a910..000000000000 --- a/fs/bcachefs/two_state_shared_lock.c +++ /dev/null @@ -1,8 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "two_state_shared_lock.h" - -void __bch2_two_state_lock(two_state_lock_t *lock, int s) -{ - __wait_event(lock->wait, bch2_two_state_trylock(lock, s)); -} diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h deleted file mode 100644 index 7f647846b511..000000000000 --- a/fs/bcachefs/two_state_shared_lock.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_TWO_STATE_LOCK_H -#define _BCACHEFS_TWO_STATE_LOCK_H - -#include <linux/atomic.h> -#include <linux/sched.h> -#include <linux/wait.h> - -#include "util.h" - -/* - * Two-state lock - can be taken for add or block - both states are shared, - * like read side of rwsem, but conflict with other state: - */ -typedef struct { - atomic_long_t v; - wait_queue_head_t wait; -} two_state_lock_t; - -static inline void 
two_state_lock_init(two_state_lock_t *lock) -{ - atomic_long_set(&lock->v, 0); - init_waitqueue_head(&lock->wait); -} - -static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s) -{ - long i = s ? 1 : -1; - - EBUG_ON(atomic_long_read(&lock->v) == 0); - - if (atomic_long_sub_return_release(i, &lock->v) == 0) - wake_up_all(&lock->wait); -} - -static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s) -{ - long i = s ? 1 : -1; - long old; - - old = atomic_long_read(&lock->v); - do { - if (i > 0 ? old < 0 : old > 0) - return false; - } while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i)); - - return true; -} - -void __bch2_two_state_lock(two_state_lock_t *, int); - -static inline void bch2_two_state_lock(two_state_lock_t *lock, int s) -{ - if (!bch2_two_state_trylock(lock, s)) - __bch2_two_state_lock(lock, s); -} - -#endif /* _BCACHEFS_TWO_STATE_LOCK_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c deleted file mode 100644 index df9a6071fe18..000000000000 --- a/fs/bcachefs/util.c +++ /dev/null @@ -1,1047 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * random utility code, for bcache but in theory not specific to bcache - * - * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> - * Copyright 2012 Google, Inc. - */ - -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/console.h> -#include <linux/ctype.h> -#include <linux/debugfs.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/log2.h> -#include <linux/math64.h> -#include <linux/percpu.h> -#include <linux/preempt.h> -#include <linux/random.h> -#include <linux/seq_file.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/sched/clock.h> - -#include "eytzinger.h" -#include "mean_and_variance.h" -#include "util.h" - -static const char si_units[] = "?kMGTPEZY"; - -/* string_get_size units: */ -static const char *const units_2[] = { - "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" -}; -static const char *const units_10[] = { - "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" -}; - -static int parse_u64(const char *cp, u64 *res) -{ - const char *start = cp; - u64 v = 0; - - if (!isdigit(*cp)) - return -EINVAL; - - do { - if (v > U64_MAX / 10) - return -ERANGE; - v *= 10; - if (v > U64_MAX - (*cp - '0')) - return -ERANGE; - v += *cp - '0'; - cp++; - } while (isdigit(*cp)); - - *res = v; - return cp - start; -} - -static int bch2_pow(u64 n, u64 p, u64 *res) -{ - *res = 1; - - while (p--) { - if (*res > div64_u64(U64_MAX, n)) - return -ERANGE; - *res *= n; - } - return 0; -} - -static int parse_unit_suffix(const char *cp, u64 *res) -{ - const char *start = cp; - u64 base = 1024; - unsigned u; - int ret; - - if (*cp == ' ') - cp++; - - for (u = 1; u < strlen(si_units); u++) - if (*cp == si_units[u]) { - cp++; - goto got_unit; - } - - for (u = 0; u < ARRAY_SIZE(units_2); u++) - if (!strncmp(cp, units_2[u], strlen(units_2[u]))) { - cp += strlen(units_2[u]); - goto got_unit; - } - - for (u = 0; u < ARRAY_SIZE(units_10); u++) - if (!strncmp(cp, units_10[u], strlen(units_10[u]))) { - cp += strlen(units_10[u]); - base = 1000; - goto got_unit; - } - - *res = 1; - return 0; -got_unit: - ret = bch2_pow(base, u, res); - if (ret) - return ret; - - return cp - start; -} - -#define parse_or_ret(cp, _f) \ -do { \ - int _ret = _f; \ - if (_ret < 0) \ - return _ret; \ - cp += _ret; \ -} while (0) - -static int __bch2_strtou64_h(const char *cp, u64 *res) -{ - const char *start = cp; - u64 v = 0, b, f_n = 0, f_d = 1; - int ret; - 
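/*
 * [Editor's aside on the two_state_shared_lock deleted above] A single
 * signed counter encodes both shared states: state-1 holders increment
 * it, state-0 holders decrement it, so holders of the same state share
 * while opposite states exclude each other. A minimal userspace model,
 * assuming C11 atomics and a spin in place of the kernel waitqueue:
 */
#include <stdatomic.h>
#include <stdbool.h>

typedef struct { atomic_long v; } two_state_lock;

static bool two_state_trylock(two_state_lock *l, int s)
{
	long i = s ? 1 : -1;
	long old = atomic_load_explicit(&l->v, memory_order_relaxed);

	do {
		/* refuse if currently held in the conflicting state */
		if (i > 0 ? old < 0 : old > 0)
			return false;
	} while (!atomic_compare_exchange_weak_explicit(&l->v, &old, old + i,
			memory_order_acquire, memory_order_relaxed));
	return true;
}

static void two_state_lock_acquire(two_state_lock *l, int s)
{
	while (!two_state_trylock(l, s))
		;	/* the kernel version sleeps on lock->wait instead */
}

static void two_state_unlock(two_state_lock *l, int s)
{
	/* release here pairs with the acquire in trylock */
	atomic_fetch_sub_explicit(&l->v, s ? 1 : -1, memory_order_release);
}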
- parse_or_ret(cp, parse_u64(cp, &v)); - - if (*cp == '.') { - cp++; - ret = parse_u64(cp, &f_n); - if (ret < 0) - return ret; - cp += ret; - - ret = bch2_pow(10, ret, &f_d); - if (ret) - return ret; - } - - parse_or_ret(cp, parse_unit_suffix(cp, &b)); - - if (v > div64_u64(U64_MAX, b)) - return -ERANGE; - v *= b; - - if (f_n > div64_u64(U64_MAX, b)) - return -ERANGE; - - f_n = div64_u64(f_n * b, f_d); - if (v + f_n < v) - return -ERANGE; - v += f_n; - - *res = v; - return cp - start; -} - -static int __bch2_strtoh(const char *cp, u64 *res, - u64 t_max, bool t_signed) -{ - bool positive = *cp != '-'; - u64 v = 0; - - if (*cp == '+' || *cp == '-') - cp++; - - parse_or_ret(cp, __bch2_strtou64_h(cp, &v)); - - if (*cp == '\n') - cp++; - if (*cp) - return -EINVAL; - - if (positive) { - if (v > t_max) - return -ERANGE; - } else { - if (v && !t_signed) - return -ERANGE; - - if (v > t_max + 1) - return -ERANGE; - v = -v; - } - - *res = v; - return 0; -} - -#define STRTO_H(name, type) \ -int bch2_ ## name ## _h(const char *cp, type *res) \ -{ \ - u64 v = 0; \ - int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ - ANYSINT_MAX(type) != ((type) ~0ULL)); \ - *res = v; \ - return ret; \ -} - -STRTO_H(strtoint, int) -STRTO_H(strtouint, unsigned int) -STRTO_H(strtoll, long long) -STRTO_H(strtoull, unsigned long long) -STRTO_H(strtou64, u64) - -u64 bch2_read_flag_list(const char *opt, const char * const list[]) -{ - u64 ret = 0; - char *p, *s, *d = kstrdup(opt, GFP_KERNEL); - - if (!d) - return -ENOMEM; - - s = strim(d); - - while ((p = strsep(&s, ",;"))) { - int flag = match_string(list, -1, p); - - if (flag < 0) { - ret = -1; - break; - } - - ret |= BIT_ULL(flag); - } - - kfree(d); - - return ret; -} - -bool bch2_is_zero(const void *_p, size_t n) -{ - const char *p = _p; - size_t i; - - for (i = 0; i < n; i++) - if (p[i]) - return false; - return true; -} - -void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits) -{ - while (nr_bits) - prt_char(out, '0' + ((v >> --nr_bits) & 1)); -} - -void bch2_prt_u64_base2(struct printbuf *out, u64 v) -{ - bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); -} - -static bool string_is_spaces(const char *str) -{ - while (*str) { - if (*str != ' ') - return false; - str++; - } - return true; -} - -void bch2_print_string_as_lines(const char *prefix, const char *lines) -{ - bool locked = false; - const char *p; - - if (!lines) { - printk("%s (null)\n", prefix); - return; - } - - locked = console_trylock(); - - while (*lines) { - p = strchrnul(lines, '\n'); - if (!*p && string_is_spaces(lines)) - break; - - printk("%s%.*s\n", prefix, (int) (p - lines), lines); - if (!*p) - break; - lines = p + 1; - } - if (locked) - console_unlock(); -} - -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, - gfp_t gfp) -{ -#ifdef CONFIG_STACKTRACE - unsigned nr_entries = 0; - - stack->nr = 0; - int ret = darray_make_room_gfp(stack, 32, gfp); - if (ret) - return ret; - - if (!down_read_trylock(&task->signal->exec_update_lock)) - return -1; - - do { - nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1); - } while (nr_entries == stack->size && - !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp))); - - stack->nr = nr_entries; - up_read(&task->signal->exec_update_lock); - - return ret; -#else - return 0; -#endif -} - -void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) -{ - darray_for_each(*stack, i) { - prt_printf(out, "[<0>] %pB", (void *) *i); - prt_newline(out); - } -} - -int 
bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp) -{ - bch_stacktrace stack = { 0 }; - int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp); - - bch2_prt_backtrace(out, &stack); - darray_exit(&stack); - return ret; -} - -#ifndef __KERNEL__ -#include <time.h> -void bch2_prt_datetime(struct printbuf *out, time64_t sec) -{ - time_t t = sec; - char buf[64]; - ctime_r(&t, buf); - strim(buf); - prt_str(out, buf); -} -#else -void bch2_prt_datetime(struct printbuf *out, time64_t sec) -{ - char buf[64]; - snprintf(buf, sizeof(buf), "%ptT", &sec); - prt_u64(out, sec); -} -#endif - -void bch2_pr_time_units(struct printbuf *out, u64 ns) -{ - const struct time_unit *u = bch2_pick_time_units(ns); - - prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name); -} - -static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) -{ - const struct time_unit *u = bch2_pick_time_units(ns); - - prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name); -} - -static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) -{ - prt_printf(out, "%s\t", name); - bch2_pr_time_units_aligned(out, ns); - prt_newline(out); -} - -#define TABSTOP_SIZE 12 - -void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) -{ - struct quantiles *quantiles = time_stats_to_quantiles(stats); - s64 f_mean = 0, d_mean = 0; - u64 f_stddev = 0, d_stddev = 0; - - if (stats->buffer) { - int cpu; - - spin_lock_irq(&stats->lock); - for_each_possible_cpu(cpu) - __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); - spin_unlock_irq(&stats->lock); - } - - /* - * avoid divide by zero - */ - if (stats->freq_stats.n) { - f_mean = mean_and_variance_get_mean(stats->freq_stats); - f_stddev = mean_and_variance_get_stddev(stats->freq_stats); - d_mean = mean_and_variance_get_mean(stats->duration_stats); - d_stddev = mean_and_variance_get_stddev(stats->duration_stats); - } - - printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); - prt_printf(out, "count:\t%llu\n", stats->duration_stats.n); - printbuf_tabstop_pop(out); - - printbuf_tabstops_reset(out); - - printbuf_tabstop_push(out, out->indent + 20); - printbuf_tabstop_push(out, TABSTOP_SIZE + 2); - printbuf_tabstop_push(out, 0); - printbuf_tabstop_push(out, TABSTOP_SIZE + 2); - - prt_printf(out, "\tsince mount\r\trecent\r\n"); - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, out->indent + 20); - printbuf_tabstop_push(out, TABSTOP_SIZE); - printbuf_tabstop_push(out, 2); - printbuf_tabstop_push(out, TABSTOP_SIZE); - - prt_printf(out, "duration of events\n"); - printbuf_indent_add(out, 2); - - pr_name_and_units(out, "min:", stats->min_duration); - pr_name_and_units(out, "max:", stats->max_duration); - pr_name_and_units(out, "total:", stats->total_duration); - - prt_printf(out, "mean:\t"); - bch2_pr_time_units_aligned(out, d_mean); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); - prt_newline(out); - - prt_printf(out, "stddev:\t"); - bch2_pr_time_units_aligned(out, d_stddev); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); - - printbuf_indent_sub(out, 2); - prt_newline(out); - - prt_printf(out, "time between events\n"); - printbuf_indent_add(out, 2); - - pr_name_and_units(out, "min:", stats->min_freq); - pr_name_and_units(out, "max:", stats->max_freq); - - 
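/*
 * [Editor's sketch] A hypothetical in-kernel caller of the backtrace
 * helpers deleted above -- e.g. dumping a stuck thread from a debugfs
 * handler. skipnr trims that many innermost frames; GFP_KERNEL is fine
 * from sleepable context:
 *
 *	static void dump_stuck_task(struct printbuf *out,
 *				    struct task_struct *t)
 *	{
 *		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
 *	}
 */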
prt_printf(out, "mean:\t"); - bch2_pr_time_units_aligned(out, f_mean); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); - prt_newline(out); - - prt_printf(out, "stddev:\t"); - bch2_pr_time_units_aligned(out, f_stddev); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); - - printbuf_indent_sub(out, 2); - prt_newline(out); - - printbuf_tabstops_reset(out); - - if (quantiles) { - int i = eytzinger0_first(NR_QUANTILES); - const struct time_unit *u = - bch2_pick_time_units(quantiles->entries[i].m); - u64 last_q = 0; - - prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(j, NR_QUANTILES) { - bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1; - - u64 q = max(quantiles->entries[j].m, last_q); - prt_printf(out, "%llu ", div64_u64(q, u->nsecs)); - if (is_last) - prt_newline(out); - last_q = q; - } - } -} - -/* ratelimit: */ - -/** - * bch2_ratelimit_delay() - return how long to delay until the next time to do - * some work - * @d: the struct bch_ratelimit to update - * Returns: the amount of time to delay by, in jiffies - */ -u64 bch2_ratelimit_delay(struct bch_ratelimit *d) -{ - u64 now = local_clock(); - - return time_after64(d->next, now) - ? nsecs_to_jiffies(d->next - now) - : 0; -} - -/** - * bch2_ratelimit_increment() - increment @d by the amount of work done - * @d: the struct bch_ratelimit to update - * @done: the amount of work done, in arbitrary units - */ -void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) -{ - u64 now = local_clock(); - - d->next += div_u64(done * NSEC_PER_SEC, d->rate); - - if (time_before64(now + NSEC_PER_SEC, d->next)) - d->next = now + NSEC_PER_SEC; - - if (time_after64(now - NSEC_PER_SEC * 2, d->next)) - d->next = now - NSEC_PER_SEC * 2; -} - -/* pd controller: */ - -/* - * Updates pd_controller. Attempts to scale inputed values to units per second. - * @target: desired value - * @actual: current value - * - * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing - * it makes actual go down. 
- */ -void bch2_pd_controller_update(struct bch_pd_controller *pd, - s64 target, s64 actual, int sign) -{ - s64 proportional, derivative, change; - - unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; - - if (seconds_since_update == 0) - return; - - pd->last_update = jiffies; - - proportional = actual - target; - proportional *= seconds_since_update; - proportional = div_s64(proportional, pd->p_term_inverse); - - derivative = actual - pd->last_actual; - derivative = div_s64(derivative, seconds_since_update); - derivative = ewma_add(pd->smoothed_derivative, derivative, - (pd->d_term / seconds_since_update) ?: 1); - derivative = derivative * pd->d_term; - derivative = div_s64(derivative, pd->p_term_inverse); - - change = proportional + derivative; - - /* Don't increase rate if not keeping up */ - if (change > 0 && - pd->backpressure && - time_after64(local_clock(), - pd->rate.next + NSEC_PER_MSEC)) - change = 0; - - change *= (sign * -1); - - pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, - 1, UINT_MAX); - - pd->last_actual = actual; - pd->last_derivative = derivative; - pd->last_proportional = proportional; - pd->last_change = change; - pd->last_target = target; -} - -void bch2_pd_controller_init(struct bch_pd_controller *pd) -{ - pd->rate.rate = 1024; - pd->last_update = jiffies; - pd->p_term_inverse = 6000; - pd->d_term = 30; - pd->d_smooth = pd->d_term; - pd->backpressure = 1; -} - -void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 20); - - prt_printf(out, "rate:\t"); - prt_human_readable_s64(out, pd->rate.rate); - prt_newline(out); - - prt_printf(out, "target:\t"); - prt_human_readable_u64(out, pd->last_target); - prt_newline(out); - - prt_printf(out, "actual:\t"); - prt_human_readable_u64(out, pd->last_actual); - prt_newline(out); - - prt_printf(out, "proportional:\t"); - prt_human_readable_s64(out, pd->last_proportional); - prt_newline(out); - - prt_printf(out, "derivative:\t"); - prt_human_readable_s64(out, pd->last_derivative); - prt_newline(out); - - prt_printf(out, "change:\t"); - prt_human_readable_s64(out, pd->last_change); - prt_newline(out); - - prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); -} - -/* misc: */ - -void bch2_bio_map(struct bio *bio, void *base, size_t size) -{ - while (size) { - struct page *page = is_vmalloc_addr(base) - ? 
vmalloc_to_page(base) - : virt_to_page(base); - unsigned offset = offset_in_page(base); - unsigned len = min_t(size_t, PAGE_SIZE - offset, size); - - BUG_ON(!bio_add_page(bio, page, len, offset)); - size -= len; - base += len; - } -} - -int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) -{ - while (size) { - struct page *page = alloc_pages(gfp_mask, 0); - unsigned len = min_t(size_t, PAGE_SIZE, size); - - if (!page) - return -ENOMEM; - - if (unlikely(!bio_add_page(bio, page, len, 0))) { - __free_page(page); - break; - } - - size -= len; - } - - return 0; -} - -u64 bch2_get_random_u64_below(u64 ceil) -{ - if (ceil <= U32_MAX) - return __get_random_u32_below(ceil); - - /* this is the same (clever) algorithm as in __get_random_u32_below() */ - u64 rand = get_random_u64(); - u64 mult = ceil * rand; - - if (unlikely(mult < ceil)) { - u64 bound; - div64_u64_rem(-ceil, ceil, &bound); - while (unlikely(mult < bound)) { - rand = get_random_u64(); - mult = ceil * rand; - } - } - - return mul_u64_u64_shr(ceil, rand, 64); -} - -void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) -{ - struct bio_vec bv; - struct bvec_iter iter; - - __bio_for_each_segment(bv, dst, iter, dst_iter) { - void *dstp = kmap_local_page(bv.bv_page); - - memcpy(dstp + bv.bv_offset, src, bv.bv_len); - kunmap_local(dstp); - - src += bv.bv_len; - } -} - -void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) -{ - struct bio_vec bv; - struct bvec_iter iter; - - __bio_for_each_segment(bv, src, iter, src_iter) { - void *srcp = kmap_local_page(bv.bv_page); - - memcpy(dst, srcp + bv.bv_offset, bv.bv_len); - kunmap_local(srcp); - - dst += bv.bv_len; - } -} - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_corrupt_bio(struct bio *bio) -{ - struct bvec_iter iter; - struct bio_vec bv; - unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64)); - - bio_for_each_segment(bv, bio, iter) { - unsigned u64s = bv.bv_len / sizeof(u64); - - if (offset < u64s) { - u64 *segment = bvec_kmap_local(&bv); - segment[offset] = get_random_u64(); - kunmap_local(segment); - return; - } - offset -= u64s; - } -} -#endif - -void bch2_bio_to_text(struct printbuf *out, struct bio *bio) -{ - prt_printf(out, "bi_remaining:\t%u\n", - atomic_read(&bio->__bi_remaining)); - prt_printf(out, "bi_end_io:\t%ps\n", - bio->bi_end_io); - prt_printf(out, "bi_status:\t%u\n", - bio->bi_status); -} - -#if 0 -void eytzinger1_test(void) -{ - unsigned inorder, size; - - pr_info("1 based eytzinger test:\n"); - - for (size = 2; - size < 65536; - size++) { - unsigned extra = eytzinger1_extra(size); - - if (!(size % 4096)) - pr_info("tree size %u\n", size); - - inorder = 1; - eytzinger1_for_each(eytz, size) { - BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); - BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); - BUG_ON(eytz != eytzinger1_last(size) && - eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); - - inorder++; - } - BUG_ON(inorder - 1 != size); - } -} - -void eytzinger0_test(void) -{ - - unsigned inorder, size; - - pr_info("0 based eytzinger test:\n"); - - for (size = 1; - size < 65536; - size++) { - unsigned extra = eytzinger0_extra(size); - - if (!(size % 4096)) - pr_info("tree size %u\n", size); - - inorder = 0; - eytzinger0_for_each(eytz, size) { - BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); - BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); - BUG_ON(eytz != eytzinger0_last(size) && - eytzinger0_prev(eytzinger0_next(eytz, size), 
size) != eytz); - - inorder++; - } - BUG_ON(inorder != size); - - inorder = size - 1; - eytzinger0_for_each_prev(eytz, size) { - BUG_ON(eytz != eytzinger0_first(size) && - eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz); - - inorder--; - } - BUG_ON(inorder != -1); - } -} - -static inline int cmp_u16(const void *_l, const void *_r) -{ - const u16 *l = _l, *r = _r; - - return (*l > *r) - (*r > *l); -} - -static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search) -{ - int r, s; - bool bad; - - r = eytzinger0_find_le(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - if (r >= 0) { - if (test_array[r] > search) { - bad = true; - } else { - s = eytzinger0_next(r, nr); - bad = s >= 0 && test_array[s] <= search; - } - } else { - s = eytzinger0_last(nr); - bad = s >= 0 && test_array[s] <= search; - } - - if (bad) { - s = -1; - eytzinger0_for_each_prev(j, nr) { - if (test_array[j] <= search) { - s = j; - break; - } - } - - eytzinger0_for_each(j, nr) - pr_info("[%3u] = %12u\n", j, test_array[j]); - pr_info("find_le(%12u) = %3i should be %3i\n", - search, r, s); - BUG(); - } -} - -static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search) -{ - int r, s; - bool bad; - - r = eytzinger0_find_gt(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - if (r >= 0) { - if (test_array[r] <= search) { - bad = true; - } else { - s = eytzinger0_prev(r, nr); - bad = s >= 0 && test_array[s] > search; - } - } else { - s = eytzinger0_first(nr); - bad = s >= 0 && test_array[s] > search; - } - - if (bad) { - s = -1; - eytzinger0_for_each(j, nr) { - if (test_array[j] > search) { - s = j; - break; - } - } - - eytzinger0_for_each(j, nr) - pr_info("[%3u] = %12u\n", j, test_array[j]); - pr_info("find_gt(%12u) = %3i should be %3i\n", - search, r, s); - BUG(); - } -} - -static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search) -{ - int r, s; - bool bad; - - r = eytzinger0_find_ge(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - if (r >= 0) { - if (test_array[r] < search) { - bad = true; - } else { - s = eytzinger0_prev(r, nr); - bad = s >= 0 && test_array[s] >= search; - } - } else { - s = eytzinger0_first(nr); - bad = s >= 0 && test_array[s] >= search; - } - - if (bad) { - s = -1; - eytzinger0_for_each(j, nr) { - if (test_array[j] >= search) { - s = j; - break; - } - } - - eytzinger0_for_each(j, nr) - pr_info("[%3u] = %12u\n", j, test_array[j]); - pr_info("find_ge(%12u) = %3i should be %3i\n", - search, r, s); - BUG(); - } -} - -static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search) -{ - unsigned r; - int s; - bool bad; - - r = eytzinger0_find(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - - if (r < nr) { - bad = test_array[r] != search; - } else { - s = eytzinger0_find_le(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - bad = s >= 0 && test_array[s] == search; - } - - if (bad) { - eytzinger0_for_each(j, nr) - pr_info("[%3u] = %12u\n", j, test_array[j]); - pr_info("find(%12u) = %3i is incorrect\n", - search, r); - BUG(); - } -} - -static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) -{ - eytzinger0_find_test_le(test_array, nr, search); - eytzinger0_find_test_gt(test_array, nr, search); - eytzinger0_find_test_ge(test_array, nr, search); - eytzinger0_find_test_eq(test_array, nr, search); -} - -void eytzinger0_find_test(void) -{ - unsigned i, nr, allocated = 1 << 12; - u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), 
GFP_KERNEL); - - for (nr = 1; nr < allocated; nr++) { - u16 prev = 0; - - pr_info("testing %u elems\n", nr); - - get_random_bytes(test_array, nr * sizeof(test_array[0])); - eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); - - /* verify array is sorted correctly: */ - eytzinger0_for_each(j, nr) { - BUG_ON(test_array[j] < prev); - prev = test_array[j]; - } - - for (i = 0; i < U16_MAX; i += 1 << 12) - eytzinger0_find_test_val(test_array, nr, i); - - for (i = 0; i < nr; i++) { - eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); - eytzinger0_find_test_val(test_array, nr, test_array[i]); - eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); - } - } - - kfree(test_array); -} -#endif - -/* - * Accumulate percpu counters onto one cpu's copy - only valid when access - * against any percpu counter is guarded against - */ -u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) -{ - u64 *ret; - int cpu; - - /* access to pcpu vars has to be blocked by other locking */ - preempt_disable(); - ret = this_cpu_ptr(p); - preempt_enable(); - - for_each_possible_cpu(cpu) { - u64 *i = per_cpu_ptr(p, cpu); - - if (i != ret) { - acc_u64s(ret, i, nr); - memset(i, 0, nr * sizeof(u64)); - } - } - - return ret; -} - -void bch2_darray_str_exit(darray_const_str *d) -{ - darray_for_each(*d, i) - kfree(*i); - darray_exit(d); -} - -int bch2_split_devs(const char *_dev_name, darray_const_str *ret) -{ - darray_init(ret); - - char *dev_name, *s, *orig; - - dev_name = orig = kstrdup(_dev_name, GFP_KERNEL); - if (!dev_name) - return -ENOMEM; - - while ((s = strsep(&dev_name, ":"))) { - char *p = kstrdup(s, GFP_KERNEL); - if (!p) - goto err; - - if (darray_push(ret, p)) { - kfree(p); - goto err; - } - } - - kfree(orig); - return 0; -err: - bch2_darray_str_exit(ret); - kfree(orig); - return -ENOMEM; -} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h deleted file mode 100644 index 6488f098d140..000000000000 --- a/fs/bcachefs/util.h +++ /dev/null @@ -1,782 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_UTIL_H -#define _BCACHEFS_UTIL_H - -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/closure.h> -#include <linux/errno.h> -#include <linux/freezer.h> -#include <linux/kernel.h> -#include <linux/min_heap.h> -#include <linux/sched/clock.h> -#include <linux/llist.h> -#include <linux/log2.h> -#include <linux/percpu.h> -#include <linux/preempt.h> -#include <linux/random.h> -#include <linux/ratelimit.h> -#include <linux/slab.h> -#include <linux/sort.h> -#include <linux/vmalloc.h> -#include <linux/workqueue.h> - -#include "mean_and_variance.h" - -#include "darray.h" -#include "time_stats.h" - -struct closure; - -#ifdef CONFIG_BCACHEFS_DEBUG -#define EBUG_ON(cond) BUG_ON(cond) -#else -#define EBUG_ON(cond) -#endif - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define CPU_BIG_ENDIAN 0 -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -#define CPU_BIG_ENDIAN 1 -#endif - -/* type hackery */ - -#define type_is_exact(_val, _type) \ - __builtin_types_compatible_p(typeof(_val), _type) - -#define type_is(_val, _type) \ - (__builtin_types_compatible_p(typeof(_val), _type) || \ - __builtin_types_compatible_p(typeof(_val), const _type)) - -/* Userspace doesn't align allocations as nicely as the kernel allocators: */ -static inline size_t buf_pages(void *p, size_t len) -{ - return DIV_ROUND_UP(len + - ((unsigned long) p & (PAGE_SIZE - 1)), - PAGE_SIZE); -} - -static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags) -{ - void *p = unlikely(n >= INT_MAX) - ? 
vmalloc_noprof(n) - : kvmalloc_noprof(n, flags & ~__GFP_ZERO); - if (p && (flags & __GFP_ZERO)) - memset(p, 0, n); - return p; -} -#define bch2_kvmalloc(...) alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__)) - -#define init_heap(heap, _size, gfp) \ -({ \ - (heap)->nr = 0; \ - (heap)->size = (_size); \ - (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\ - (gfp)); \ -}) - -#define free_heap(heap) \ -do { \ - kvfree((heap)->data); \ - (heap)->data = NULL; \ -} while (0) - -#define ANYSINT_MAX(t) \ - ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) - -#include "printbuf.h" - -#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__) -#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__) -#define printbuf_str(_buf) bch2_printbuf_str(_buf) -#define printbuf_exit(_buf) bch2_printbuf_exit(_buf) - -#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf) -#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf) -#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n) - -#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n) -#define printbuf_indent_add_nextline(_out, _n) bch2_printbuf_indent_add_nextline(_out, _n) -#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n) - -#define prt_newline(_out) bch2_prt_newline(_out) -#define prt_tab(_out) bch2_prt_tab(_out) -#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out) - -#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__) -#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v)) -#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__) -#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__) -#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__) -#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__) -#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__) -#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__) -#define prt_bitflags_vector(...) bch2_prt_bitflags_vector(__VA_ARGS__) - -void bch2_pr_time_units(struct printbuf *, u64); -void bch2_prt_datetime(struct printbuf *, time64_t); - -#ifdef __KERNEL__ -static inline void uuid_unparse_lower(u8 *uuid, char *out) -{ - sprintf(out, "%pUb", uuid); -} -#else -#include <uuid/uuid.h> -#endif - -static inline void pr_uuid(struct printbuf *out, u8 *uuid) -{ - char uuid_str[40]; - - uuid_unparse_lower(uuid, uuid_str); - prt_printf(out, "%s", uuid_str); -} - -int bch2_strtoint_h(const char *, int *); -int bch2_strtouint_h(const char *, unsigned int *); -int bch2_strtoll_h(const char *, long long *); -int bch2_strtoull_h(const char *, unsigned long long *); -int bch2_strtou64_h(const char *, u64 *); - -static inline int bch2_strtol_h(const char *cp, long *res) -{ -#if BITS_PER_LONG == 32 - return bch2_strtoint_h(cp, (int *) res); -#else - return bch2_strtoll_h(cp, (long long *) res); -#endif -} - -static inline int bch2_strtoul_h(const char *cp, long *res) -{ -#if BITS_PER_LONG == 32 - return bch2_strtouint_h(cp, (unsigned int *) res); -#else - return bch2_strtoull_h(cp, (unsigned long long *) res); -#endif -} - -#define strtoi_h(cp, res) \ - ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ - : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ - : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ - : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ - : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ - : type_is(*res, unsigned long long) ? 
bch2_strtoull_h(cp, (void *) res)\ - : -EINVAL) - -#define strtoul_safe(cp, var) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (!_r) \ - var = _v; \ - _r; \ -}) - -#define strtoul_safe_clamp(cp, var, min, max) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (!_r) \ - var = clamp_t(typeof(var), _v, min, max); \ - _r; \ -}) - -#define strtoul_safe_restrict(cp, var, min, max) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (!_r && _v >= min && _v <= max) \ - var = _v; \ - else \ - _r = -EINVAL; \ - _r; \ -}) - -#define snprint(out, var) \ - prt_printf(out, \ - type_is(var, int) ? "%i\n" \ - : type_is(var, unsigned) ? "%u\n" \ - : type_is(var, long) ? "%li\n" \ - : type_is(var, unsigned long) ? "%lu\n" \ - : type_is(var, s64) ? "%lli\n" \ - : type_is(var, u64) ? "%llu\n" \ - : type_is(var, char *) ? "%s\n" \ - : "%i\n", var) - -bool bch2_is_zero(const void *, size_t); - -u64 bch2_read_flag_list(const char *, const char * const[]); - -void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); -void bch2_prt_u64_base2(struct printbuf *, u64); - -void bch2_print_string_as_lines(const char *, const char *); - -typedef DARRAY(unsigned long) bch_stacktrace; -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); -void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); -int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t); - -static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) -{ -#ifdef __KERNEL__ - prt_printf(out, "%pg", bdev); -#else - prt_str(out, bdev->name); -#endif -} - -void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); - -#define ewma_add(ewma, val, weight) \ -({ \ - typeof(ewma) _ewma = (ewma); \ - typeof(weight) _weight = (weight); \ - \ - (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ -}) - -struct bch_ratelimit { - /* Next time we want to do some work, in nanoseconds */ - u64 next; - - /* - * Rate at which we want to do work, in units per nanosecond - * The units here correspond to the units passed to - * bch2_ratelimit_increment() - */ - unsigned rate; -}; - -static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) -{ - d->next = local_clock(); -} - -u64 bch2_ratelimit_delay(struct bch_ratelimit *); -void bch2_ratelimit_increment(struct bch_ratelimit *, u64); - -struct bch_pd_controller { - struct bch_ratelimit rate; - unsigned long last_update; - - s64 last_actual; - s64 smoothed_derivative; - - unsigned p_term_inverse; - unsigned d_smooth; - unsigned d_term; - - /* for exporting to sysfs (no effect on behavior) */ - s64 last_derivative; - s64 last_proportional; - s64 last_change; - s64 last_target; - - /* - * If true, the rate will not increase if bch2_ratelimit_delay() - * is not being called often enough. 
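 *
 * [Editor's sketch] A hypothetical rate-limited worker built on the
 * bch_ratelimit declared above; rate is in work units per second,
 * matching the done * NSEC_PER_SEC / rate arithmetic in
 * bch2_ratelimit_increment():
 *
 *	static void throttled_step(struct bch_ratelimit *r, u64 units)
 *	{
 *		u64 delay = bch2_ratelimit_delay(r);	// jiffies
 *
 *		if (delay)
 *			schedule_timeout_interruptible(delay);
 *		do_work(units);			// hypothetical
 *		bch2_ratelimit_increment(r, units);
 *	}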
- */ - bool backpressure; -}; - -void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); -void bch2_pd_controller_init(struct bch_pd_controller *); -void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); - -#define sysfs_pd_controller_attribute(name) \ - rw_attribute(name##_rate); \ - rw_attribute(name##_rate_bytes); \ - rw_attribute(name##_rate_d_term); \ - rw_attribute(name##_rate_p_term_inverse); \ - read_attribute(name##_rate_debug) - -#define sysfs_pd_controller_files(name) \ - &sysfs_##name##_rate, \ - &sysfs_##name##_rate_bytes, \ - &sysfs_##name##_rate_d_term, \ - &sysfs_##name##_rate_p_term_inverse, \ - &sysfs_##name##_rate_debug - -#define sysfs_pd_controller_show(name, var) \ -do { \ - sysfs_hprint(name##_rate, (var)->rate.rate); \ - sysfs_print(name##_rate_bytes, (var)->rate.rate); \ - sysfs_print(name##_rate_d_term, (var)->d_term); \ - sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ - \ - if (attr == &sysfs_##name##_rate_debug) \ - bch2_pd_controller_debug_to_text(out, var); \ -} while (0) - -#define sysfs_pd_controller_store(name, var) \ -do { \ - sysfs_strtoul_clamp(name##_rate, \ - (var)->rate.rate, 1, UINT_MAX); \ - sysfs_strtoul_clamp(name##_rate_bytes, \ - (var)->rate.rate, 1, UINT_MAX); \ - sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ - sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ - (var)->p_term_inverse, 1, INT_MAX); \ -} while (0) - -#define container_of_or_null(ptr, type, member) \ -({ \ - typeof(ptr) _ptr = ptr; \ - _ptr ? container_of(_ptr, type, member) : NULL; \ -}) - -static inline struct list_head *list_pop(struct list_head *head) -{ - if (list_empty(head)) - return NULL; - - struct list_head *ret = head->next; - list_del_init(ret); - return ret; -} - -#define list_pop_entry(head, type, member) \ - container_of_or_null(list_pop(head), type, member) - -/* Does linear interpolation between powers of two */ -static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) -{ - unsigned fract = x & ~(~0 << fract_bits); - - x >>= fract_bits; - x = 1 << x; - x += (x * fract) >> fract_bits; - - return x; -} - -void bch2_bio_map(struct bio *bio, void *base, size_t); -int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); - -#define closure_bio_submit(bio, cl) \ -do { \ - closure_get(cl); \ - submit_bio(bio); \ -} while (0) - -#define kthread_wait(cond) \ -({ \ - int _ret = 0; \ - \ - while (1) { \ - set_current_state(TASK_INTERRUPTIBLE); \ - if (kthread_should_stop()) { \ - _ret = -1; \ - break; \ - } \ - \ - if (cond) \ - break; \ - \ - schedule(); \ - } \ - set_current_state(TASK_RUNNING); \ - _ret; \ -}) - -#define kthread_wait_freezable(cond) \ -({ \ - int _ret = 0; \ - while (1) { \ - set_current_state(TASK_INTERRUPTIBLE); \ - if (kthread_should_stop()) { \ - _ret = -1; \ - break; \ - } \ - \ - if (cond) \ - break; \ - \ - schedule(); \ - try_to_freeze(); \ - } \ - set_current_state(TASK_RUNNING); \ - _ret; \ -}) - -u64 bch2_get_random_u64_below(u64); - -void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); -void memcpy_from_bio(void *, struct bio *, struct bvec_iter); - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_corrupt_bio(struct bio *); - -static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio) -{ - if (ratio && !get_random_u32_below(ratio)) - bch2_corrupt_bio(bio); -} -#else -#define bch2_maybe_corrupt_bio(...) 
do {} while (0) -#endif - -void bch2_bio_to_text(struct printbuf *, struct bio *); - -static inline void memcpy_u64s_small(void *dst, const void *src, - unsigned u64s) -{ - u64 *d = dst; - const u64 *s = src; - - while (u64s--) - *d++ = *s++; -} - -static inline void __memcpy_u64s(void *dst, const void *src, - unsigned u64s) -{ -#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) - long d0, d1, d2; - - asm volatile("rep ; movsq" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (u64s), "1" (dst), "2" (src) - : "memory"); -#else - u64 *d = dst; - const u64 *s = src; - - while (u64s--) - *d++ = *s++; -#endif -} - -static inline void memcpy_u64s(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(!(dst >= src + u64s * sizeof(u64) || - dst + u64s * sizeof(u64) <= src)); - - __memcpy_u64s(dst, src, u64s); -} - -static inline void __memmove_u64s_down(void *dst, const void *src, - unsigned u64s) -{ - __memcpy_u64s(dst, src, u64s); -} - -static inline void memmove_u64s_down(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst > src); - - __memmove_u64s_down(dst, src, u64s); -} - -static inline void __memmove_u64s_down_small(void *dst, const void *src, - unsigned u64s) -{ - memcpy_u64s_small(dst, src, u64s); -} - -static inline void memmove_u64s_down_small(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst > src); - - __memmove_u64s_down_small(dst, src, u64s); -} - -static inline void __memmove_u64s_up_small(void *_dst, const void *_src, - unsigned u64s) -{ - u64 *dst = (u64 *) _dst + u64s; - u64 *src = (u64 *) _src + u64s; - - while (u64s--) - *--dst = *--src; -} - -static inline void memmove_u64s_up_small(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst < src); - - __memmove_u64s_up_small(dst, src, u64s); -} - -static inline void __memmove_u64s_up(void *_dst, const void *_src, - unsigned u64s) -{ - u64 *dst = (u64 *) _dst + u64s - 1; - u64 *src = (u64 *) _src + u64s - 1; - -#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) - long d0, d1, d2; - - asm volatile("std ;\n" - "rep ; movsq\n" - "cld ;\n" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (u64s), "1" (dst), "2" (src) - : "memory"); -#else - while (u64s--) - *dst-- = *src--; -#endif -} - -static inline void memmove_u64s_up(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst < src); - - __memmove_u64s_up(dst, src, u64s); -} - -static inline void memmove_u64s(void *dst, const void *src, - unsigned u64s) -{ - if (dst < src) - __memmove_u64s_down(dst, src, u64s); - else - __memmove_u64s_up(dst, src, u64s); -} - -/* Set the last few bytes up to a u64 boundary given an offset into a buffer. 
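 * [Editor's worked example] memset_u64s_tail(s, 0, 11) computes
 * rem = round_up(11, 8) - 11 = 5 and zeroes s[11..15], padding the
 * buffer out to the next u64 boundary.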
*/ -static inline void memset_u64s_tail(void *s, int c, unsigned bytes) -{ - unsigned rem = round_up(bytes, sizeof(u64)) - bytes; - - memset(s + bytes, c, rem); -} - -/* just the memmove, doesn't update @_nr */ -#define __array_insert_item(_array, _nr, _pos) \ - memmove(&(_array)[(_pos) + 1], \ - &(_array)[(_pos)], \ - sizeof((_array)[0]) * ((_nr) - (_pos))) - -#define array_insert_item(_array, _nr, _pos, _new_item) \ -do { \ - __array_insert_item(_array, _nr, _pos); \ - (_nr)++; \ - (_array)[(_pos)] = (_new_item); \ -} while (0) - -#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ -do { \ - (_nr) -= (_nr_to_remove); \ - memmove(&(_array)[(_pos)], \ - &(_array)[(_pos) + (_nr_to_remove)], \ - sizeof((_array)[0]) * ((_nr) - (_pos))); \ -} while (0) - -#define array_remove_item(_array, _nr, _pos) \ - array_remove_items(_array, _nr, _pos, 1) - -static inline void __move_gap(void *array, size_t element_size, - size_t nr, size_t size, - size_t old_gap, size_t new_gap) -{ - size_t gap_end = old_gap + size - nr; - - if (new_gap < old_gap) { - size_t move = old_gap - new_gap; - - memmove(array + element_size * (gap_end - move), - array + element_size * (old_gap - move), - element_size * move); - } else if (new_gap > old_gap) { - size_t move = new_gap - old_gap; - - memmove(array + element_size * old_gap, - array + element_size * gap_end, - element_size * move); - } -} - -/* Move the gap in a gap buffer: */ -#define move_gap(_d, _new_gap) \ -do { \ - BUG_ON(_new_gap > (_d)->nr); \ - BUG_ON((_d)->gap > (_d)->nr); \ - \ - __move_gap((_d)->data, sizeof((_d)->data[0]), \ - (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \ - (_d)->gap = _new_gap; \ -} while (0) - -#define bubble_sort(_base, _nr, _cmp) \ -do { \ - ssize_t _i, _last; \ - bool _swapped = true; \ - \ - for (_last= (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\ - _swapped = false; \ - for (_i = 0; _i < _last; _i++) \ - if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ - swap((_base)[_i], (_base)[_i + 1]); \ - _swapped = true; \ - } \ - } \ -} while (0) - -#define per_cpu_sum(_p) \ -({ \ - TYPEOF_UNQUAL(*_p) _ret = 0; \ - \ - int cpu; \ - for_each_possible_cpu(cpu) \ - _ret += *per_cpu_ptr(_p, cpu); \ - _ret; \ -}) - -static inline u64 percpu_u64_get(u64 __percpu *src) -{ - return per_cpu_sum(src); -} - -static inline void percpu_u64_set(u64 __percpu *dst, u64 src) -{ - int cpu; - - for_each_possible_cpu(cpu) - *per_cpu_ptr(dst, cpu) = 0; - this_cpu_write(*dst, src); -} - -static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) -{ - for (unsigned i = 0; i < nr; i++) - acc[i] += src[i]; -} - -static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, - unsigned nr) -{ - int cpu; - - for_each_possible_cpu(cpu) - acc_u64s(acc, per_cpu_ptr(src, cpu), nr); -} - -static inline void percpu_memset(void __percpu *p, int c, size_t bytes) -{ - int cpu; - - for_each_possible_cpu(cpu) - memset(per_cpu_ptr(p, cpu), c, bytes); -} - -u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); - -static inline int u8_cmp(u8 l, u8 r) -{ - return cmp_int(l, r); -} - -static inline int cmp_le32(__le32 l, __le32 r) -{ - return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); -} - -#include <linux/uuid.h> - -static inline bool qstr_eq(const struct qstr l, const struct qstr r) -{ - return l.len == r.len && !memcmp(l.name, r.name, l.len); -} - -void bch2_darray_str_exit(darray_const_str *); -int bch2_split_devs(const char *, darray_const_str *); - -#ifdef __KERNEL__ - -__must_check -static inline int copy_to_user_errcode(void __user *to, const 
void *from, unsigned long n) -{ - return copy_to_user(to, from, n) ? -EFAULT : 0; -} - -__must_check -static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n) -{ - return copy_from_user(to, from, n) ? -EFAULT : 0; -} - -#endif - -static inline void mod_bit(long nr, volatile unsigned long *addr, bool v) -{ - if (v) - set_bit(nr, addr); - else - clear_bit(nr, addr); -} - -static inline void __set_bit_le64(size_t bit, __le64 *addr) -{ - addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); -} - -static inline void __clear_bit_le64(size_t bit, __le64 *addr) -{ - addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64)); -} - -static inline bool test_bit_le64(size_t bit, __le64 *addr) -{ - return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; -} - -static inline void memcpy_swab(void *_dst, void *_src, size_t len) -{ - u8 *dst = _dst + len; - u8 *src = _src; - - while (len--) - *--dst = *src++; -} - -#define set_flags(_map, _in, _out) \ -do { \ - unsigned _i; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & (1 << _i)) \ - (_out) |= _map[_i]; \ - else \ - (_out) &= ~_map[_i]; \ -} while (0) - -#define map_flags(_map, _in) \ -({ \ - unsigned _out = 0; \ - \ - set_flags(_map, _in, _out); \ - _out; \ -}) - -#define map_flags_rev(_map, _in) \ -({ \ - unsigned _i, _out = 0; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & _map[_i]) { \ - (_out) |= 1 << _i; \ - (_in) &= ~_map[_i]; \ - } \ - (_out); \ -}) - -#define map_defined(_map) \ -({ \ - unsigned _in = ~0; \ - \ - map_flags_rev(_map, _in); \ -}) - -#endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c deleted file mode 100644 index 6620ecae26af..000000000000 --- a/fs/bcachefs/varint.c +++ /dev/null @@ -1,130 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/bitops.h> -#include <linux/math.h> -#include <linux/string.h> -#include <linux/unaligned.h> - -#ifdef CONFIG_VALGRIND -#include <valgrind/memcheck.h> -#endif - -#include "errcode.h" -#include "varint.h" - -/** - * bch2_varint_encode - encode a variable length integer - * @out: destination to encode to - * @v: unsigned integer to encode - * Returns: size in bytes of the encoded integer - at most 9 bytes - */ -int bch2_varint_encode(u8 *out, u64 v) -{ - unsigned bits = fls64(v|1); - unsigned bytes = DIV_ROUND_UP(bits, 7); - __le64 v_le; - - if (likely(bytes < 9)) { - v <<= bytes; - v |= ~(~0 << (bytes - 1)); - v_le = cpu_to_le64(v); - memcpy(out, &v_le, bytes); - } else { - *out++ = 255; - bytes = 9; - put_unaligned_le64(v, out); - } - - return bytes; -} - -/** - * bch2_varint_decode - decode a variable length integer - * @in: varint to decode - * @end: end of buffer to decode from - * @out: on success, decoded integer - * Returns: size in bytes of the decoded integer - or -1 on failure (would - * have read past the end of the buffer) - */ -int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) -{ - unsigned bytes = likely(in < end) - ?
ffz(*in & 255) + 1 - : 1; - u64 v; - - if (unlikely(in + bytes > end)) - return -BCH_ERR_varint_decode_error; - - if (likely(bytes < 9)) { - __le64 v_le = 0; - - memcpy(&v_le, in, bytes); - v = le64_to_cpu(v_le); - v >>= bytes; - } else { - v = get_unaligned_le64(++in); - } - - *out = v; - return bytes; -} - -/** - * bch2_varint_encode_fast - fast version of bch2_varint_encode - * @out: destination to encode to - * @v: unsigned integer to encode - * Returns: size in bytes of the encoded integer - at most 9 bytes - * - * This version assumes it's always safe to write 8 bytes to @out, even if the - * encoded integer would be smaller. - */ -int bch2_varint_encode_fast(u8 *out, u64 v) -{ - unsigned bits = fls64(v|1); - unsigned bytes = DIV_ROUND_UP(bits, 7); - - if (likely(bytes < 9)) { - v <<= bytes; - v |= ~(~0U << (bytes - 1)); - } else { - *out++ = 255; - bytes = 9; - } - - put_unaligned_le64(v, out); - return bytes; -} - -/** - * bch2_varint_decode_fast - fast version of bch2_varint_decode - * @in: varint to decode - * @end: end of buffer to decode from - * @out: on success, decoded integer - * Returns: size in bytes of the decoded integer - or -1 on failure (would - * have read past the end of the buffer) - * - * This version assumes that it is safe to read at most 8 bytes past the end of - * @end (we still return an error if the varint extends past @end). - */ -int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) -{ -#ifdef CONFIG_VALGRIND - VALGRIND_MAKE_MEM_DEFINED(in, 8); -#endif - u64 v = get_unaligned_le64(in); - unsigned bytes = ffz(*in) + 1; - - if (unlikely(in + bytes > end)) - return -BCH_ERR_varint_decode_error; - - if (likely(bytes < 9)) { - v >>= bytes; - v &= ~(~0ULL << (7 * bytes)); - } else { - v = get_unaligned_le64(++in); - } - - *out = v; - return bytes; -} diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h deleted file mode 100644 index 92a182fb3d7a..000000000000 --- a/fs/bcachefs/varint.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_VARINT_H -#define _BCACHEFS_VARINT_H - -int bch2_varint_encode(u8 *, u64); -int bch2_varint_decode(const u8 *, const u8 *, u64 *); - -int bch2_varint_encode_fast(u8 *, u64); -int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *); - -#endif /* _BCACHEFS_VARINT_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h deleted file mode 100644 index 2ad338e282da..000000000000 --- a/fs/bcachefs/vstructs.h +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _VSTRUCTS_H -#define _VSTRUCTS_H - -#include "util.h" - -/* - * NOTE: we can't differentiate between __le64 and u64 with type_is - this - * assumes u64 is little endian: - */ -#define __vstruct_u64s(_s) \ -({ \ - ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ - : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ - : type_is((_s)->u64s, u16) ? 
le16_to_cpu((__force __le16) (_s)->u64s) \ - : ((__force u8) ((_s)->u64s))); \ -}) - -#define __vstruct_bytes(_type, _u64s) \ -({ \ - BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ - \ - (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ -}) - -#define vstruct_bytes(_s) \ - __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) - -#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ - (round_up(__vstruct_bytes(_type, _u64s), \ - 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) - -#define vstruct_blocks(_s, _sector_block_bits) \ - __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) - -#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ - __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ - __vstruct_u64s(_s) + (_u64s)) - -#define vstruct_sectors(_s, _sector_block_bits) \ - (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) - -#define vstruct_next(_s) \ - ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) -#define vstruct_last(_s) \ - ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) -#define vstruct_end(_s) \ - ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) - -#define vstruct_for_each(_s, _i) \ - for (typeof(&(_s)->start[0]) _i = (_s)->start; \ - _i < vstruct_last(_s); \ - _i = vstruct_next(_i)) - -#define vstruct_for_each_safe(_s, _i) \ - for (typeof(&(_s)->start[0]) _next, _i = (_s)->start; \ - _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \ - _i = _next) - -#define vstruct_idx(_s, _idx) \ - ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) - -#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c deleted file mode 100644 index 627f153798c6..000000000000 --- a/fs/bcachefs/xattr.c +++ /dev/null @@ -1,642 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "acl.h" -#include "bkey_methods.h" -#include "btree_update.h" -#include "extents.h" -#include "fs.h" -#include "rebalance.h" -#include "str_hash.h" -#include "xattr.h" - -#include <linux/dcache.h> -#include <linux/posix_acl_xattr.h> -#include <linux/xattr.h> - -static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); - -static u64 bch2_xattr_hash(const struct bch_hash_info *info, - const struct xattr_search_key *key) -{ - struct bch_str_hash_ctx ctx; - - bch2_str_hash_init(&ctx, info); - bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); - bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); - - return bch2_str_hash_end(&ctx, info); -} - -static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) -{ - return bch2_xattr_hash(info, key); -} - -static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -{ - struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); - - return bch2_xattr_hash(info, - &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len)); -} - -static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) -{ - struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); - const struct xattr_search_key *r = _r; - - return l.v->x_type != r->type || - l.v->x_name_len != r->name.len || - memcmp(l.v->x_name_and_value, r->name.name, r->name.len); -} - -static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -{ - struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); - struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); - - return l.v->x_type != r.v->x_type || - l.v->x_name_len != r.v->x_name_len || - memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len); -} - -const struct 
bch_hash_desc bch2_xattr_hash_desc = { - .btree_id = BTREE_ID_xattrs, - .key_type = KEY_TYPE_xattr, - .hash_key = xattr_hash_key, - .hash_bkey = xattr_hash_bkey, - .cmp_key = xattr_cmp_key, - .cmp_bkey = xattr_cmp_bkey, -}; - -int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len)); - int ret = 0; - - bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, - c, xattr_val_size_too_small, - "value too small (%zu < %u)", - bkey_val_u64s(k.k), val_u64s); - - /* XXX why +4 ? */ - val_u64s = xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len) + 4); - - bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, - c, xattr_val_size_too_big, - "value too big (%zu > %u)", - bkey_val_u64s(k.k), val_u64s); - - bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), - c, xattr_invalid_type, - "invalid type (%u)", xattr.v->x_type); - - bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len), - c, xattr_name_invalid_chars, - "xattr name has invalid characters"); -fsck_err: - return ret; -} - -void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - const struct xattr_handler *handler; - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - - handler = bch2_xattr_type_to_handler(xattr.v->x_type); - if (handler && handler->prefix) - prt_printf(out, "%s", handler->prefix); - else if (handler) - prt_printf(out, "(type %u)", xattr.v->x_type); - else - prt_printf(out, "(unknown type %u)", xattr.v->x_type); - - unsigned name_len = xattr.v->x_name_len; - unsigned val_len = le16_to_cpu(xattr.v->x_val_len); - unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) - - offsetof(struct bch_xattr, x_name_and_value); - - val_len = min_t(int, val_len, max_name_val_bytes - name_len); - name_len = min(name_len, max_name_val_bytes); - - prt_printf(out, "%.*s:%.*s", - name_len, xattr.v->x_name_and_value, - val_len, (char *) xattr_val(xattr.v)); - - if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || - xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) { - prt_char(out, ' '); - bch2_acl_to_text(out, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); - } -} - -static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, - const char *name, void *buffer, size_t size, int type) -{ - struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); - struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); - struct btree_iter iter; - struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, - inode_inum(inode), &search, 0); - int ret = bkey_err(k); - if (ret) - return ret; - - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - ret = le16_to_cpu(xattr.v->x_val_len); - if (buffer) { - if (ret > size) - ret = -ERANGE; - else - memcpy(buffer, xattr_val(xattr.v), ret); - } - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode_u, - const struct bch_hash_info *hash_info, - const char *name, const void *value, size_t size, - int type, int flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter inode_iter = {}; - int ret; - - ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?: - bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent); - if (ret) - return ret; - - /* - * 
Besides the ctime update, extents, dirents and xattrs updates require - * that an inode update also happens - to ensure that if a key exists in - * one of those btrees with a given snapshot ID an inode is also present - */ - inode_u->bi_ctime = bch2_current_time(c); - - ret = bch2_inode_write(trans, &inode_iter, inode_u); - bch2_trans_iter_exit(trans, &inode_iter); - - if (ret) - return ret; - - if (value) { - struct bkey_i_xattr *xattr; - unsigned namelen = strlen(name); - unsigned u64s = BKEY_U64s + - xattr_val_u64s(namelen, size); - - if (u64s > U8_MAX) - return -ERANGE; - - xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); - if (IS_ERR(xattr)) - return PTR_ERR(xattr); - - bkey_xattr_init(&xattr->k_i); - xattr->k.u64s = u64s; - xattr->v.x_type = type; - xattr->v.x_name_len = namelen; - xattr->v.x_val_len = cpu_to_le16(size); - memcpy(xattr->v.x_name_and_value, name, namelen); - memcpy(xattr_val(&xattr->v), value, size); - - ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, - inum, &xattr->k_i, - (flags & XATTR_CREATE ? STR_HASH_must_create : 0)| - (flags & XATTR_REPLACE ? STR_HASH_must_replace : 0)); - } else { - struct xattr_search_key search = - X_SEARCH(type, name, strlen(name)); - - ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, - hash_info, inum, &search); - } - - if (bch2_err_matches(ret, ENOENT)) - ret = flags & XATTR_REPLACE ? -ENODATA : 0; - - return ret; -} - -struct xattr_buf { - char *buf; - size_t len; - size_t used; -}; - -static int __bch2_xattr_emit(const char *prefix, - const char *name, size_t name_len, - struct xattr_buf *buf) -{ - const size_t prefix_len = strlen(prefix); - const size_t total_len = prefix_len + name_len + 1; - - if (buf->buf) { - if (buf->used + total_len > buf->len) - return -ERANGE; - - memcpy(buf->buf + buf->used, prefix, prefix_len); - memcpy(buf->buf + buf->used + prefix_len, - name, name_len); - buf->buf[buf->used + prefix_len + name_len] = '\0'; - } - - buf->used += total_len; - return 0; -} - -static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry) -{ - const struct xattr_handler *handler = bch2_xattr_type_to_handler(type); - - if (!xattr_handler_can_list(handler, dentry)) - return NULL; - - return xattr_prefix(handler); -} - -static int bch2_xattr_emit(struct dentry *dentry, - const struct bch_xattr *xattr, - struct xattr_buf *buf) -{ - const char *prefix; - - prefix = bch2_xattr_prefix(xattr->x_type, dentry); - if (!prefix) - return 0; - - return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf); -} - -static int bch2_xattr_list_bcachefs(struct bch_fs *c, - struct bch_inode_unpacked *inode, - struct xattr_buf *buf, - bool all) -{ - const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; - unsigned id; - int ret = 0; - u64 v; - - for (id = 0; id < Inode_opt_nr; id++) { - v = bch2_inode_opt_get(inode, id); - if (!v) - continue; - - if (!all && - !(inode->bi_fields_set & (1 << id))) - continue; - - ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], - strlen(bch2_inode_opts[id]), buf); - if (ret) - break; - } - - return ret; -} - -ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - struct bch_fs *c = dentry->d_sb->s_fs_info; - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; - u64 offset = 0, inum = inode->ei_inode.bi_inum; - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs, - POS(inum, offset), - POS(inum, U64_MAX), - inode->ei_inum.subvol, 0, k, ({ - if (k.k->type != KEY_TYPE_xattr) - continue; - - bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); - }))) ?: - bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?: - bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); - - return ret ? bch2_err_class(ret) : buf.used; -} - -static int bch2_xattr_get_handler(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = bch2_trans_do(c, - bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); - - if (ret < 0 && bch2_err_matches(ret, ENOENT)) - ret = -ENODATA; - - return bch2_err_class(ret); -} - -static int bch2_xattr_set_handler(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *vinode, - const char *name, const void *value, - size_t size, int flags) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - struct bch_inode_unpacked inode_u; - int ret; - - ret = bch2_trans_run(c, - commit_do(trans, NULL, NULL, 0, - bch2_xattr_set(trans, inode_inum(inode), &inode_u, - &hash, name, value, size, - handler->flags, flags)) ?: - (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0)); - - return bch2_err_class(ret); -} - -static const struct xattr_handler bch_xattr_user_handler = { - .prefix = XATTR_USER_PREFIX, - .get = bch2_xattr_get_handler, - .set = bch2_xattr_set_handler, - .flags = KEY_TYPE_XATTR_INDEX_USER, -}; - -static bool bch2_xattr_trusted_list(struct dentry *dentry) -{ - return capable(CAP_SYS_ADMIN); -} - -static const struct xattr_handler bch_xattr_trusted_handler = { - .prefix = XATTR_TRUSTED_PREFIX, - .list = bch2_xattr_trusted_list, - .get = bch2_xattr_get_handler, - .set = bch2_xattr_set_handler, - .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, -}; - -static const struct xattr_handler bch_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .get = bch2_xattr_get_handler, - .set = bch2_xattr_set_handler, - .flags = KEY_TYPE_XATTR_INDEX_SECURITY, -}; - -#ifndef NO_BCACHEFS_FS - -static int opt_to_inode_opt(int id) -{ - switch (id) { -#define x(name, ...) 
\ - case Opt_##name: return Inode_opt_##name; - BCH_INODE_OPTS() -#undef x - default: - return -1; - } -} - -static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size, - bool all) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_opts opts = - bch2_inode_opts_to_opts(&inode->ei_inode); - const struct bch_option *opt; - int id, inode_opt_id; - struct printbuf out = PRINTBUF; - int ret; - u64 v; - - id = bch2_opt_lookup(name); - if (id < 0 || !bch2_opt_is_inode_opt(id)) - return -EINVAL; - - inode_opt_id = opt_to_inode_opt(id); - if (inode_opt_id < 0) - return -EINVAL; - - opt = bch2_opt_table + id; - - if (!bch2_opt_defined_by_id(&opts, id)) - return -ENODATA; - - if (!all && - !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) - return -ENODATA; - - v = bch2_opt_get_by_id(&opts, id); - bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); - - ret = out.pos; - - if (out.allocation_failure) { - ret = -ENOMEM; - } else if (buffer) { - if (out.pos > size) - ret = -ERANGE; - else - memcpy(buffer, out.buf, out.pos); - } - - printbuf_exit(&out); - return ret; -} - -static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size) -{ - return __bch2_xattr_bcachefs_get(handler, dentry, vinode, - name, buffer, size, false); -} - -struct inode_opt_set { - int id; - u64 v; - bool defined; -}; - -static int inode_opt_set_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct inode_opt_set *s = p; - - if (s->id == Inode_opt_casefold) { - int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->v); - if (ret) - return ret; - } - - if (s->defined) - bi->bi_fields_set |= 1U << s->id; - else - bi->bi_fields_set &= ~(1U << s->id); - - bch2_inode_opt_set(bi, s->id, s->v); - - return 0; -} - -static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *vinode, - const char *name, const void *value, - size_t size, int flags) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - const struct bch_option *opt; - char *buf; - struct inode_opt_set s; - int opt_id, inode_opt_id, ret; - - opt_id = bch2_opt_lookup(name); - if (opt_id < 0) - return -EINVAL; - - opt = bch2_opt_table + opt_id; - - inode_opt_id = opt_to_inode_opt(opt_id); - if (inode_opt_id < 0) - return -EINVAL; - - s.id = inode_opt_id; - - if (value) { - u64 v = 0; - - buf = kmalloc(size + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - memcpy(buf, value, size); - buf[size] = '\0'; - - ret = bch2_opt_parse(c, opt, buf, &v, NULL); - kfree(buf); - - if (ret < 0) - goto err_class_exit; - - ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v); - if (ret < 0) - goto err_class_exit; - - s.v = v + 1; - s.defined = true; - } else { - /* - * Check if this option was set on the parent - if so, switched - * back to inheriting from the parent: - * - * rename() also has to deal with keeping inherited options up - * to date - see bch2_reinherit_attrs() - */ - spin_lock(&dentry->d_lock); - if (!IS_ROOT(dentry)) { - struct bch_inode_info *dir = - to_bch_ei(d_inode(dentry->d_parent)); - - s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); - } else { - s.v = 0; - } - spin_unlock(&dentry->d_lock); - - 
s.defined = false; - } - - mutex_lock(&inode->ei_update_lock); - if (inode_opt_id == Inode_opt_project) { - /* - * inode fields accessible via the xattr interface are stored - * with a +1 bias, so that 0 means unset: - */ - ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0); - if (ret) - goto err; - } - - ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); -err: - mutex_unlock(&inode->ei_update_lock); -err_class_exit: - return bch2_err_class(ret); -} - -static const struct xattr_handler bch_xattr_bcachefs_handler = { - .prefix = "bcachefs.", - .get = bch2_xattr_bcachefs_get, - .set = bch2_xattr_bcachefs_set, -}; - -static int bch2_xattr_bcachefs_get_effective( - const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size) -{ - return __bch2_xattr_bcachefs_get(handler, dentry, vinode, - name, buffer, size, true); -} - -/* Noop - xattrs in the bcachefs_effective namespace are inherited */ -static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *vinode, - const char *name, const void *value, - size_t size, int flags) -{ - return 0; -} - -static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { - .prefix = "bcachefs_effective.", - .get = bch2_xattr_bcachefs_get_effective, - .set = bch2_xattr_bcachefs_set_effective, -}; - -#endif /* NO_BCACHEFS_FS */ - -const struct xattr_handler * const bch2_xattr_handlers[] = { - &bch_xattr_user_handler, - &bch_xattr_trusted_handler, - &bch_xattr_security_handler, -#ifndef NO_BCACHEFS_FS - &bch_xattr_bcachefs_handler, - &bch_xattr_bcachefs_effective_handler, -#endif - NULL -}; - -static const struct xattr_handler *bch_xattr_handler_map[] = { - [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, - [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = - &nop_posix_acl_access, - [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = - &nop_posix_acl_default, - [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, - [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, -}; - -static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) -{ - return type < ARRAY_SIZE(bch_xattr_handler_map) - ? 
bch_xattr_handler_map[type] - : NULL; -} diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h deleted file mode 100644 index 1139bf345f70..000000000000 --- a/fs/bcachefs/xattr.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_XATTR_H -#define _BCACHEFS_XATTR_H - -#include "str_hash.h" - -extern const struct bch_hash_desc bch2_xattr_hash_desc; - -int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_xattr ((struct bkey_ops) { \ - .key_validate = bch2_xattr_validate, \ - .val_to_text = bch2_xattr_to_text, \ - .min_val_size = 8, \ -}) - -static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) -{ - return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) + - name_len + val_len, sizeof(u64)); -} - -#define xattr_val(_xattr) \ - ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len) - -struct xattr_search_key { - u8 type; - struct qstr name; -}; - -#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ - { .type = _type, .name = QSTR_INIT(_name, _len) }) - -struct dentry; -struct xattr_handler; -struct bch_hash_info; -struct bch_inode_info; - -/* Exported for cmd_migrate.c in tools: */ -int bch2_xattr_set(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, const struct bch_hash_info *, - const char *, const void *, size_t, int, int); - -ssize_t bch2_xattr_list(struct dentry *, char *, size_t); - -extern const struct xattr_handler * const bch2_xattr_handlers[]; - -#endif /* _BCACHEFS_XATTR_H */ diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h deleted file mode 100644 index 4121b78d9a92..000000000000 --- a/fs/bcachefs/xattr_format.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_XATTR_FORMAT_H -#define _BCACHEFS_XATTR_FORMAT_H - -#define KEY_TYPE_XATTR_INDEX_USER 0 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 -#define KEY_TYPE_XATTR_INDEX_SECURITY 4 - -struct bch_xattr { - struct bch_val v; - __u8 x_type; - __u8 x_name_len; - __le16 x_val_len; - /* - * x_name contains the name and value counted by - * x_name_len + x_val_len. The introduction of - * __counted_by(x_name_len) previously caused a false positive - * detection of an out of bounds write. - */ - __u8 x_name_and_value[]; -} __packed __aligned(8); - -#endif /* _BCACHEFS_XATTR_FORMAT_H */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 264fba0d44bd..e4653bb99946 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -103,6 +103,21 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) +static inline void elf_coredump_set_mm_eflags(struct mm_struct *mm, u32 flags) +{ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + mm->saved_e_flags = flags; +#endif +} + +static inline u32 elf_coredump_get_mm_eflags(struct mm_struct *mm, u32 flags) +{ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + flags = mm->saved_e_flags; +#endif + return flags; +} + /* * We need to explicitly zero any trailing portion of the page that follows * p_filesz when it ends before the page ends (e.g. 
bss), otherwise this @@ -1290,6 +1305,8 @@ out_free_interp: mm->end_data = end_data; mm->start_stack = bprm->p; + elf_coredump_set_mm_eflags(mm, elf_ex->e_flags); + /** * DOC: "brk" handling * @@ -1804,6 +1821,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; struct core_thread *ct; + u16 machine; + u32 flags; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) @@ -1831,30 +1850,37 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 0; } - /* - * Initialize the ELF file header. - */ - fill_elf_header(elf, phdrs, - view->e_machine, view->e_flags); + machine = view->e_machine; + flags = view->e_flags; #else view = NULL; info->thread_notes = 2; - fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); + machine = ELF_ARCH; + flags = ELF_CORE_EFLAGS; #endif /* + * Override ELF e_flags with value taken from process, + * if arch needs that. + */ + flags = elf_coredump_get_mm_eflags(dump_task->mm, flags); + + /* + * Initialize the ELF file header. + */ + fill_elf_header(elf, phdrs, machine, flags); + + /* * Allocate a structure for each thread. */ - info->thread = kzalloc(offsetof(struct elf_thread_core_info, - notes[info->thread_notes]), - GFP_KERNEL); + info->thread = kzalloc(struct_size(info->thread, notes, info->thread_notes), + GFP_KERNEL); if (unlikely(!info->thread)) return 0; info->thread->task = dump_task; for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) { - t = kzalloc(offsetof(struct elf_thread_core_info, - notes[info->thread_notes]), + t = kzalloc(struct_size(t, notes, info->thread_notes), GFP_KERNEL); if (unlikely(!t)) return 0; diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index ea95c90c8474..4438637c8900 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -62,6 +62,7 @@ config BTRFS_FS_RUN_SANITY_TESTS config BTRFS_DEBUG bool "Btrfs debugging support" depends on BTRFS_FS + select REF_TRACKER if STACKTRACE_SUPPORT help Enable run-time debugging support for the btrfs filesystem. @@ -117,14 +118,3 @@ config BTRFS_EXPERIMENTAL - large folio support If unsure, say N. - -config BTRFS_FS_REF_VERIFY - bool "Btrfs with the ref verify tool compiled in" - depends on BTRFS_FS - default n - help - Enable run-time extent reference verification instrumentation. This - is meant to be used by btrfs developers for tracking down extent - reference problems or verifying they didn't break something. - - If unsure, say N. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 2d5f0482678b..743d7677b175 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -36,7 +36,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o -btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o +btrfs-$(CONFIG_BTRFS_DEBUG) += ref-verify.o btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o btrfs-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 861c7d92c437..1248aa2535d3 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -44,7 +44,7 @@ static __always_inline void memcpy_split_src(char *dest, const char *src1, * gives us all the type checking. * * The extent buffer pages stored in the array folios may not form a contiguous - * phyusical range, but the API functions assume the linear offset to the range + * physical range, but the API functions assume the linear offset to the range * from 0 to metadata node size. 
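 *
 * A worked sketch (illustration only, assuming 4K pages, order-0
 * folios and a 16K nodesize): a byte at linear offset 5000 within the
 * node maps to
 *
 *   folios[5000 >> PAGE_SHIFT]   == folios[1]
 *   offset_in_page(5000)         == 904
 *
 * even though folios[0] and folios[1] need not be physically
 * contiguous.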
*/ diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 6a450be293b1..2ab550a1e715 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -859,7 +859,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, free_pref(ref); return PTR_ERR(eb); } - if (!extent_buffer_uptodate(eb)) { + if (unlikely(!extent_buffer_uptodate(eb))) { free_pref(ref); free_extent_buffer(eb); return -EIO; @@ -1062,7 +1062,7 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); - if (type == BTRFS_REF_TYPE_INVALID) + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) return -EUCLEAN; offset = btrfs_extent_inline_ref_offset(leaf, iref); @@ -1422,7 +1422,7 @@ again: ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. @@ -1614,7 +1614,7 @@ again: ret = PTR_ERR(eb); goto out; } - if (!extent_buffer_uptodate(eb)) { + if (unlikely(!extent_buffer_uptodate(eb))) { free_extent_buffer(eb); ret = -EIO; goto out; @@ -1652,7 +1652,7 @@ again: * case. */ ASSERT(eie); - if (!eie) { + if (unlikely(!eie)) { ret = -EUCLEAN; goto out; } @@ -1690,7 +1690,7 @@ out: * @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are * added to the ulist at @ctx->refs, and that ulist is allocated by this * function. The caller should free the ulist with free_leaf_list() if - * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is + * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is * enough. * * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated. @@ -2215,7 +2215,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. @@ -2312,7 +2312,7 @@ static int get_extent_inline_ref(unsigned long *ptr, *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr); *out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref, BTRFS_REF_TYPE_ANY); - if (*out_type == BTRFS_REF_TYPE_INVALID) + if (unlikely(*out_type == BTRFS_REF_TYPE_INVALID)) return -EUCLEAN; *ptr += btrfs_extent_inline_ref_size(*out_type); @@ -2868,7 +2868,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. @@ -2876,7 +2876,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) ret = -EUCLEAN; goto release; } - if (path->slots[0] == 0) { + if (unlikely(path->slots[0] == 0)) { DEBUG_WARN(); ret = -EUCLEAN; goto release; @@ -3457,7 +3457,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, if (ret < 0) goto out; /* No extra backref? 
This means the tree block is corrupted */ - if (ret > 0) { + if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -3500,7 +3500,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, ((unsigned long)iter->cur_ptr); type = btrfs_get_extent_inline_ref_type(eb, iref, BTRFS_REF_TYPE_BLOCK); - if (type == BTRFS_REF_TYPE_INVALID) { + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) { ret = -EUCLEAN; goto out; } @@ -3612,7 +3612,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, } /* Sanity check, we shouldn't have any unchecked nodes */ - if (!upper->checked) { + if (unlikely(!upper->checked)) { DEBUG_WARN("we should not have any unchecked nodes"); return -EUCLEAN; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 34b0193a181c..25d51c246070 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -190,7 +190,7 @@ struct btrfs_backref_share_check_ctx { * It's very common to have several file extent items that point to the * same extent (bytenr) but with different offsets and lengths. This * typically happens for COW writes, partial writes into prealloc - * extents, NOCOW writes after snapshoting a root, hole punching or + * extents, NOCOW writes after snapshotting a root, hole punching or * reflinking within the same file (less common perhaps). * So keep a small cache with the lookup results for the extent pointed * by the last few file extent items. This cache is checked, with a @@ -414,7 +414,7 @@ struct btrfs_backref_cache { /* * Whether this cache is for relocation * - * Reloction backref cache require more info for reloc root compared + * Relocation backref cache require more info for reloc root compared * to generic backref cache. */ bool is_reloc; diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 50b5fc1c06d7..21df48e6c4fa 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -93,6 +93,7 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, refcount_inc(&orig_bbio->ordered->refs); bbio->ordered = orig_bbio->ordered; } + bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root; atomic_inc(&orig_bbio->pending_ios); return bbio; } @@ -166,7 +167,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, int mirror = repair_bbio->mirror_num; if (repair_bbio->bio.bi_status || - !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { + !btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) { bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); repair_bbio->bio.bi_iter = repair_bbio->saved_iter; @@ -203,18 +204,21 @@ done: */ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, u32 bio_offset, - struct bio_vec *bv, + phys_addr_t paddr, struct btrfs_failed_bio *fbio) { struct btrfs_inode *inode = failed_bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct folio *folio = page_folio(phys_to_page(paddr)); const u32 sectorsize = fs_info->sectorsize; + const u32 foff = offset_in_folio(folio, paddr); const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); struct btrfs_bio *repair_bbio; struct bio *repair_bio; int num_copies; int mirror; + ASSERT(foff + sectorsize <= folio_size(folio)); btrfs_debug(fs_info, "repair read error: read error at %llu", failed_bbio->file_offset + bio_offset); @@ -237,7 +241,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &btrfs_repair_bioset); repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; - 
__bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + bio_add_folio_nofail(repair_bio, folio, sectorsize, foff); repair_bbio = btrfs_bio(repair_bio); btrfs_bio_init(repair_bbio, fs_info, NULL, fbio); @@ -258,6 +262,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de struct bvec_iter *iter = &bbio->saved_iter; blk_status_t status = bbio->bio.bi_status; struct btrfs_failed_bio *fbio = NULL; + phys_addr_t paddr; u32 offset = 0; /* Read-repair requires the inode field to be set by the submitter. */ @@ -275,17 +280,11 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de /* Clear the I/O error. A failed repair will reset it. */ bbio->bio.bi_status = BLK_STS_OK; - while (iter->bi_size) { - struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); - - bv.bv_len = min(bv.bv_len, sectorsize); - if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) - fbio = repair_one_sector(bbio, offset, &bv, fbio); - - bio_advance_iter_single(&bbio->bio, iter, sectorsize); + btrfs_bio_for_each_block(paddr, &bbio->bio, iter, fs_info->sectorsize) { + if (status || !btrfs_data_csum_ok(bbio, dev, offset, paddr)) + fbio = repair_one_sector(bbio, offset, paddr, fbio); offset += sectorsize; } - if (bbio->csum != bbio->csum_inline) kfree(bbio->csum); @@ -780,11 +779,38 @@ end_bbio: return true; } +static void assert_bbio_alignment(struct btrfs_bio *bbio) +{ +#ifdef CONFIG_BTRFS_ASSERT + struct btrfs_fs_info *fs_info = bbio->fs_info; + struct bio_vec bvec; + struct bvec_iter iter; + const u32 blocksize = fs_info->sectorsize; + + /* Metadata has no extra bs > ps alignment requirement. */ + if (!is_data_bbio(bbio)) + return; + + bio_for_each_bvec(bvec, &bbio->bio, iter) + ASSERT(IS_ALIGNED(bvec.bv_offset, blocksize) && + IS_ALIGNED(bvec.bv_len, blocksize), + "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u", + btrfs_root_id(bbio->inode->root), + btrfs_ino(bbio->inode), + bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT, + bbio->bio.bi_iter.bi_size, iter.bi_idx, + bvec.bv_offset, + bvec.bv_len); +#endif +} + void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) { /* If bbio->inode is not populated, its file_offset must be 0. */ ASSERT(bbio->inode || bbio->file_offset == 0); + assert_bbio_alignment(bbio); + while (!btrfs_submit_chunk(bbio, mirror_num)) ; } @@ -823,8 +849,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (ret < 0) goto out_counter_dec; - if (!smap.dev->bdev || - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) { + if (unlikely(!smap.dev->bdev || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) { ret = -EIO; goto out_counter_dec; } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index dc2eb43b7097..00883aea55d7 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -82,6 +82,8 @@ struct btrfs_bio { /* Save the first error status of split bio. */ blk_status_t status; + /* Use the commit root to look up csums (data read bio only). */ + bool csum_search_commit_root; /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9bf282d2453c..5322ef2ae015 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1358,7 +1358,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( * data in this block group. That check should be done by relocation routine, * not this function. 
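 *
 * A hypothetical caller sketch (not from this patch): passing
 * force == true bypasses the free-space check, so a caller may first
 * try without it and only insist when it must:
 *
 *   ret = inc_block_group_ro(cache, false);
 *   if (ret == -ENOSPC)
 *           ret = inc_block_group_ro(cache, true);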
*/ -static int inc_block_group_ro(struct btrfs_block_group *cache, int force) +static int inc_block_group_ro(struct btrfs_block_group *cache, bool force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; @@ -1795,7 +1795,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, bg1 = list_entry(a, struct btrfs_block_group, bg_list); bg2 = list_entry(b, struct btrfs_block_group, bg_list); - return bg1->used > bg2->used; + /* + * Some other task may be updating the ->used field concurrently, but it + * is not serious if we get a stale value or load/store tearing issues, + * as sorting the list of block groups to reclaim is not critical and an + * occasional imperfect order is ok. So silence KCSAN and avoid the + * overhead of locking or any other synchronization. + */ + return data_race(bg1->used > bg2->used); } static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) @@ -1964,7 +1971,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) * called, which is where we will transfer a reserved extent's * size from the "reserved" counter to the "used" counter - this * happens when running delayed references. When we relocate the - * chunk below, relocation first flushes dellaloc, waits for + * chunk below, relocation first flushes delalloc, waits for * ordered extent completion (which is where we create delayed * references for data extents) and commits the current * transaction (which runs delayed references), and only after @@ -2031,7 +2038,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) btrfs_reclaim_sweep(fs_info); spin_lock(&fs_info->unused_bgs_lock); if (!list_empty(&fs_info->reclaim_bgs)) - queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); + queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work); spin_unlock(&fs_info->unused_bgs_lock); } @@ -2064,7 +2071,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key return -ENOENT; } - if (map->start != key->objectid || map->chunk_len != key->offset) { + if (unlikely(map->start != key->objectid || map->chunk_len != key->offset)) { btrfs_err(fs_info, "block group %llu len %llu mismatch with chunk %llu len %llu", key->objectid, key->offset, map->start, map->chunk_len); @@ -2077,7 +2084,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key flags = btrfs_stack_block_group_flags(&bg) & BTRFS_BLOCK_GROUP_TYPE_MASK; - if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + if (unlikely(flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) { btrfs_err(fs_info, "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", key->objectid, key->offset, flags, @@ -2238,7 +2245,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) return ret; /* Shouldn't have super stripes in sequential zones */ - if (zoned && nr) { + if (unlikely(zoned && nr)) { kfree(logical); btrfs_err(fs_info, "zoned: block group %llu must not contain super block", @@ -2329,7 +2336,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) break; bg = btrfs_lookup_block_group(fs_info, map->start); - if (!bg) { + if (unlikely(!bg)) { btrfs_err(fs_info, "chunk start=%llu len=%llu doesn't have corresponding block group", map->start, map->chunk_len); @@ -2337,9 +2344,9 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) btrfs_free_chunk_map(map); break; } - if (bg->start != map->start || bg->length != map->chunk_len || - (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != 
- (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + if (unlikely(bg->start != map->start || bg->length != map->chunk_len || + (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != + (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) { btrfs_err(fs_info, "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", map->start, map->chunk_len, @@ -2832,7 +2839,7 @@ next: * space or none at all (due to no need to COW, extent buffers * were already COWed in the current transaction and still * unwritten, tree heights lower than the maximum possible - * height, etc). For data we generally reserve the axact amount + * height, etc). For data we generally reserve the exact amount * of space we are going to allocate later, the exception is * when using compression, as we must reserve space based on the * uncompressed data size, because the compression is only done @@ -3241,7 +3248,7 @@ again: */ BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { /* * So theoretically we could recover from this, simply set the * super cache generation to 0 so we know to invalidate the @@ -3988,7 +3995,7 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans struct btrfs_space_info *sys_space_info; sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags); - if (!sys_space_info) { + if (unlikely(!sys_space_info)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -4002,17 +4009,17 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans } ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } - } else if (ret) { + } else if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index a8bb8429c966..9172104a5889 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -63,7 +63,7 @@ enum btrfs_discard_state { * CHUNK_ALLOC_FORCE means it must try to allocate one * * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from - * find_free_extent() that also activaes the zone + * find_free_extent() that also activates the zone */ enum btrfs_chunk_alloc_enum { CHUNK_ALLOC_NO_FORCE, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b99fb0273292..af373d50a901 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -248,7 +248,7 @@ struct btrfs_inode { u64 new_delalloc_bytes; /* * The offset of the last dir index key that was logged. - * This is used only for directories. + * This is used only for directories. Protected by 'log_mutex'. */ u64 last_dir_index_offset; }; @@ -338,6 +338,11 @@ struct btrfs_inode { struct list_head delayed_iput; struct rw_semaphore i_mmap_lock; + +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; +#endif + struct inode vfs_inode; }; @@ -532,9 +537,9 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) /* We only allow BITS_PER_LONG blocks for each bitmap.
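 *
 * Worked example (a sketch, assuming 64-bit longs and 4K blocks on 4K
 * pages): one bitmap tracks BITS_PER_LONG == 64 blocks, i.e. at most
 * 64 * 4K == 256K per folio, capping the folio order at
 * ilog2(256K / 4K) == 6.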
*/ #ifdef CONFIG_BTRFS_EXPERIMENTAL - mapping_set_folio_order_range(inode->vfs_inode.i_mapping, 0, - ilog2(((BITS_PER_LONG << inode->root->fs_info->sectorsize_bits) - >> PAGE_SHIFT))); + mapping_set_folio_order_range(inode->vfs_inode.i_mapping, + inode->root->fs_info->block_min_order, + inode->root->fs_info->block_max_order); #endif } @@ -542,10 +547,12 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, - const u8 * const csum_expected); +void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, + u8 *dest); +int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, + const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, - u32 bio_offset, struct bio_vec *bv); + u32 bio_offset, phys_addr_t paddr); noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait); @@ -558,7 +565,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, const struct fscrypt_str *name); int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct fscrypt_str *name, int add_backref, u64 index); + const struct fscrypt_str *name, bool add_backref, u64 index); int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d09d622016ef..bacad18357b3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -90,19 +90,19 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len) } static int compression_compress_pages(int type, struct list_head *ws, - struct address_space *mapping, u64 start, + struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { switch (type) { case BTRFS_COMPRESS_ZLIB: - return zlib_compress_folios(ws, mapping, start, folios, + return zlib_compress_folios(ws, inode, start, folios, out_folios, total_in, total_out); case BTRFS_COMPRESS_LZO: - return lzo_compress_folios(ws, mapping, start, folios, + return lzo_compress_folios(ws, inode, start, folios, out_folios, total_in, total_out); case BTRFS_COMPRESS_ZSTD: - return zstd_compress_folios(ws, mapping, start, folios, + return zstd_compress_folios(ws, inode, start, folios, out_folios, total_in, total_out); case BTRFS_COMPRESS_NONE: default: @@ -223,10 +223,14 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_co /* * Common wrappers for page allocation from compression wrappers */ -struct folio *btrfs_alloc_compr_folio(void) +struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info) { struct folio *folio = NULL; + /* For bs > ps cases, no cached folio pool for now. 
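+ *
+ * Sketch of the sizing (assuming 4K pages): a 16K block size gives
+ * block_min_order == 2, so the allocation below becomes
+ * folio_alloc(GFP_NOFS, 2), i.e. one physically contiguous 16K folio
+ * rather than a pooled order-0 page.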
*/ + if (fs_info->block_min_order) + goto alloc; + spin_lock(&compr_pool.lock); if (compr_pool.count > 0) { folio = list_first_entry(&compr_pool.list, struct folio, lru); @@ -238,13 +242,18 @@ struct folio *btrfs_alloc_compr_folio(void) if (folio) return folio; - return folio_alloc(GFP_NOFS, 0); +alloc: + return folio_alloc(GFP_NOFS, fs_info->block_min_order); } void btrfs_free_compr_folio(struct folio *folio) { bool do_free = false; + /* The folio is from bs > ps fs, no cached pool for now. */ + if (folio_order(folio)) + goto free; + spin_lock(&compr_pool.lock); if (compr_pool.count > compr_pool.thresh) { do_free = true; @@ -257,6 +266,7 @@ void btrfs_free_compr_folio(struct folio *folio) if (!do_free) return; +free: ASSERT(folio_ref_count(folio) == 1); folio_put(folio); } @@ -344,16 +354,19 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio) static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb->bbio.fs_info; struct bio *bio = &cb->bbio.bio; u32 offset = 0; while (offset < cb->compressed_len) { + struct folio *folio; int ret; - u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE); + u32 len = min_t(u32, cb->compressed_len - offset, + btrfs_min_folio_size(fs_info)); + folio = cb->compressed_folios[offset >> (PAGE_SHIFT + fs_info->block_min_order)]; /* Maximum compressed extent is smaller than bio size limit. */ - ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT], - len, 0); + ret = bio_add_folio(bio, folio, len, 0); ASSERT(ret); offset += len; } @@ -443,6 +456,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (fs_info->sectorsize < PAGE_SIZE) return 0; + /* For bs > ps cases, we don't support readahead for compressed folios for now. */ + if (fs_info->block_min_order) + return 0; + end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; while (cur < compressed_end) { @@ -602,17 +619,19 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) cb->compressed_len = compressed_len; cb->compress_type = btrfs_extent_map_compression(em); cb->orig_bbio = bbio; + cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root; btrfs_free_extent_map(em); - cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info)); cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); if (!cb->compressed_folios) { status = BLK_STS_RESOURCE; goto out_free_bio; } - ret = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios); + ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order, + cb->compressed_folios); if (ret) { status = BLK_STS_RESOURCE; goto out_free_compressed_pages; @@ -687,8 +706,6 @@ struct heuristic_ws { struct list_head list; }; -static struct workspace_manager heuristic_wsm; - static void free_heuristic_ws(struct list_head *ws) { struct heuristic_ws *workspace; @@ -701,7 +718,7 @@ static void free_heuristic_ws(struct list_head *ws) kfree(workspace); } -static struct list_head *alloc_heuristic_ws(void) +static struct list_head *alloc_heuristic_ws(struct btrfs_fs_info *fs_info) { struct heuristic_ws *ws; @@ -728,11 +745,9 @@ fail: return ERR_PTR(-ENOMEM); } -const struct btrfs_compress_op btrfs_heuristic_compress = { - .workspace_manager = &heuristic_wsm, -}; +const struct btrfs_compress_levels btrfs_heuristic_compress = { 0 }; -static const struct btrfs_compress_op * const btrfs_compress_op[] = { +static const struct btrfs_compress_levels * const 
btrfs_compress_levels[] = { /* The heuristic is represented as compression type 0 */ &btrfs_heuristic_compress, &btrfs_zlib_compress, @@ -740,13 +755,13 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zstd_compress, }; -static struct list_head *alloc_workspace(int type, int level) +static struct list_head *alloc_workspace(struct btrfs_fs_info *fs_info, int type, int level) { switch (type) { - case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); - case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); - case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(); - case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); + case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(fs_info); + case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(fs_info, level); + case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(fs_info); + case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(fs_info, level); default: /* * This can't happen, the type is validated several times @@ -772,44 +787,58 @@ static void free_workspace(int type, struct list_head *ws) } } -static void btrfs_init_workspace_manager(int type) +static int alloc_workspace_manager(struct btrfs_fs_info *fs_info, + enum btrfs_compression_type type) { - struct workspace_manager *wsm; + struct workspace_manager *gwsm; struct list_head *workspace; - wsm = btrfs_compress_op[type]->workspace_manager; - INIT_LIST_HEAD(&wsm->idle_ws); - spin_lock_init(&wsm->ws_lock); - atomic_set(&wsm->total_ws, 0); - init_waitqueue_head(&wsm->ws_wait); + ASSERT(fs_info->compr_wsm[type] == NULL); + gwsm = kzalloc(sizeof(*gwsm), GFP_KERNEL); + if (!gwsm) + return -ENOMEM; + + INIT_LIST_HEAD(&gwsm->idle_ws); + spin_lock_init(&gwsm->ws_lock); + atomic_set(&gwsm->total_ws, 0); + init_waitqueue_head(&gwsm->ws_wait); + fs_info->compr_wsm[type] = gwsm; /* * Preallocate one workspace for each compression type so we can * guarantee forward progress in the worst case */ - workspace = alloc_workspace(type, 0); + workspace = alloc_workspace(fs_info, type, 0); if (IS_ERR(workspace)) { - btrfs_warn(NULL, - "cannot preallocate compression workspace, will try later"); + btrfs_warn(fs_info, + "cannot preallocate compression workspace for %s, will try later", + btrfs_compress_type2str(type)); } else { - atomic_set(&wsm->total_ws, 1); - wsm->free_ws = 1; - list_add(workspace, &wsm->idle_ws); + atomic_set(&gwsm->total_ws, 1); + gwsm->free_ws = 1; + list_add(workspace, &gwsm->idle_ws); } + return 0; } -static void btrfs_cleanup_workspace_manager(int type) +static void free_workspace_manager(struct btrfs_fs_info *fs_info, + enum btrfs_compression_type type) { - struct workspace_manager *wsman; struct list_head *ws; + struct workspace_manager *gwsm = fs_info->compr_wsm[type]; - wsman = btrfs_compress_op[type]->workspace_manager; - while (!list_empty(&wsman->idle_ws)) { - ws = wsman->idle_ws.next; + /* ZSTD uses its own workspace manager, should not enter here. */ + ASSERT(type != BTRFS_COMPRESS_ZSTD && type < BTRFS_NR_COMPRESS_TYPES); + if (!gwsm) + return; + fs_info->compr_wsm[type] = NULL; + while (!list_empty(&gwsm->idle_ws)) { + ws = gwsm->idle_ws.next; list_del(ws); free_workspace(type, ws); - atomic_dec(&wsman->total_ws); + atomic_dec(&gwsm->total_ws); } + kfree(gwsm); } /* @@ -818,9 +847,9 @@ static void btrfs_cleanup_workspace_manager(int type) * Preallocation makes a forward progress guarantee and we do not return * errors.
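 *
 * Hypothetical usage sketch (names follow the fs_info-aware
 * signatures introduced by this patch):
 *
 *   ws = btrfs_get_workspace(fs_info, BTRFS_COMPRESS_ZLIB, 3);
 *   (... compress or decompress using ws ...)
 *   btrfs_put_workspace(fs_info, BTRFS_COMPRESS_ZLIB, ws);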
*/ -struct list_head *btrfs_get_workspace(int type, int level) +struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level) { - struct workspace_manager *wsm; + struct workspace_manager *wsm = fs_info->compr_wsm[type]; struct list_head *workspace; int cpus = num_online_cpus(); unsigned nofs_flag; @@ -830,7 +859,7 @@ struct list_head *btrfs_get_workspace(int type, int level) wait_queue_head_t *ws_wait; int *free_ws; - wsm = btrfs_compress_op[type]->workspace_manager; + ASSERT(wsm); idle_ws = &wsm->idle_ws; ws_lock = &wsm->ws_lock; total_ws = &wsm->total_ws; @@ -866,7 +895,7 @@ again: * context of btrfs_compress_bio/btrfs_compress_pages */ nofs_flag = memalloc_nofs_save(); - workspace = alloc_workspace(type, level); + workspace = alloc_workspace(fs_info, type, level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(workspace)) { @@ -889,7 +918,7 @@ again: /* no burst */ 1); if (__ratelimit(&_rs)) - btrfs_warn(NULL, + btrfs_warn(fs_info, "no compression workspaces, low memory, retrying"); } goto again; @@ -897,13 +926,13 @@ again: return workspace; } -static struct list_head *get_workspace(int type, int level) +static struct list_head *get_workspace(struct btrfs_fs_info *fs_info, int type, int level) { switch (type) { - case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level); - case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level); - case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level); - case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level); + case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(fs_info, type, level); + case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(fs_info, level); + case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(fs_info, type, level); + case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(fs_info, level); default: /* * This can't happen, the type is validated several times @@ -917,21 +946,21 @@ static struct list_head *get_workspace(int type, int level) * put a workspace struct back on the list or free it if we have enough * idle ones sitting around */ -void btrfs_put_workspace(int type, struct list_head *ws) +void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws) { - struct workspace_manager *wsm; + struct workspace_manager *gwsm = fs_info->compr_wsm[type]; struct list_head *idle_ws; spinlock_t *ws_lock; atomic_t *total_ws; wait_queue_head_t *ws_wait; int *free_ws; - wsm = btrfs_compress_op[type]->workspace_manager; - idle_ws = &wsm->idle_ws; - ws_lock = &wsm->ws_lock; - total_ws = &wsm->total_ws; - ws_wait = &wsm->ws_wait; - free_ws = &wsm->free_ws; + ASSERT(gwsm); + idle_ws = &gwsm->idle_ws; + ws_lock = &gwsm->ws_lock; + total_ws = &gwsm->total_ws; + ws_wait = &gwsm->ws_wait; + free_ws = &gwsm->free_ws; spin_lock(ws_lock); if (*free_ws <= num_online_cpus()) { @@ -948,13 +977,13 @@ wake: cond_wake_up(ws_wait); } -static void put_workspace(int type, struct list_head *ws) +static void put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws) { switch (type) { - case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws); - case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws); - case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws); - case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws); + case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(fs_info, type, ws); + case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(fs_info, type, ws); + case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(fs_info, type, ws); + case 
BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(fs_info, ws); default: /* * This can't happen, the type is validated several times @@ -970,12 +999,12 @@ static void put_workspace(int type, struct list_head *ws) */ static int btrfs_compress_set_level(unsigned int type, int level) { - const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + const struct btrfs_compress_levels *levels = btrfs_compress_levels[type]; if (level == 0) - level = ops->default_level; + level = levels->default_level; else - level = clamp(level, ops->min_level, ops->max_level); + level = clamp(level, levels->min_level, levels->max_level); return level; } @@ -985,9 +1014,9 @@ static int btrfs_compress_set_level(unsigned int type, int level) */ bool btrfs_compress_level_valid(unsigned int type, int level) { - const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + const struct btrfs_compress_levels *levels = btrfs_compress_levels[type]; - return ops->min_level <= level && level <= ops->max_level; + return levels->min_level <= level && level <= levels->max_level; } /* Wrapper around find_get_page(), with extra error message. */ @@ -1022,44 +1051,46 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, * - compression algo are 0-3 * - the level are bits 4-7 * - * @out_pages is an in/out parameter, holds maximum number of pages to allocate - * and returns number of actually allocated pages + * @out_folios is an in/out parameter, holds maximum number of folios to allocate + * and returns number of actually allocated folios * * @total_in is used to return the number of bytes actually read. It * may be smaller than the input length if we had to exit early because we - * ran out of room in the pages array or because we cross the + * ran out of room in the folios array or because we cross the * max_out threshold. * * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes */ -int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, +int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; level = btrfs_compress_set_level(type, level); - workspace = get_workspace(type, level); - ret = compression_compress_pages(type, workspace, mapping, start, folios, + workspace = get_workspace(fs_info, type, level); + ret = compression_compress_pages(type, workspace, inode, start, folios, out_folios, total_in, total_out); /* The total read-in bytes should be no larger than the input. */ ASSERT(*total_in <= orig_len); - put_workspace(type, workspace); + put_workspace(fs_info, type, workspace); return ret; } static int btrfs_decompress_bio(struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct list_head *workspace; int ret; int type = cb->compress_type; - workspace = get_workspace(type, 0); + workspace = get_workspace(fs_info, type, 0); ret = compression_decompress_bio(workspace, cb); - put_workspace(type, workspace); + put_workspace(fs_info, type, workspace); if (!ret) zero_fill_bio(&cb->orig_bbio->bio); @@ -1080,20 +1111,50 @@ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, int ret; /* - * The full destination page range should not exceed the page size. 
+ * The full destination folio range should not exceed the folio size. * And the @destlen should not exceed sectorsize, as this is only called for * inline file extents, which should not exceed sectorsize. */ - ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); + ASSERT(dest_pgoff + destlen <= folio_size(dest_folio) && destlen <= sectorsize); - workspace = get_workspace(type, 0); + workspace = get_workspace(fs_info, type, 0); ret = compression_decompress(type, workspace, data_in, dest_folio, dest_pgoff, srclen, destlen); - put_workspace(type, workspace); + put_workspace(fs_info, type, workspace); return ret; } +int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info) +{ + int ret; + + ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_NONE); + if (ret < 0) + goto error; + ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB); + if (ret < 0) + goto error; + ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_LZO); + if (ret < 0) + goto error; + ret = zstd_alloc_workspace_manager(fs_info); + if (ret < 0) + goto error; + return 0; +error: + btrfs_free_compress_wsm(fs_info); + return ret; +} + +void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info) +{ + free_workspace_manager(fs_info, BTRFS_COMPRESS_NONE); + free_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB); + free_workspace_manager(fs_info, BTRFS_COMPRESS_LZO); + zstd_free_workspace_manager(fs_info); +} + int __init btrfs_init_compress(void) { if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE, @@ -1105,11 +1166,6 @@ int __init btrfs_init_compress(void) if (!compr_pool.shrinker) return -ENOMEM; - btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); - btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); - btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); - zstd_init_workspace_manager(); - spin_lock_init(&compr_pool.lock); INIT_LIST_HEAD(&compr_pool.list); compr_pool.count = 0; @@ -1130,10 +1186,6 @@ void __cold btrfs_exit_compress(void) btrfs_compr_pool_scan(NULL, NULL); shrinker_free(compr_pool.shrinker); - btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE); - btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); - btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); - zstd_cleanup_workspace_manager(); bioset_exit(&btrfs_compressed_bioset); } @@ -1256,7 +1308,7 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, #define ENTROPY_LVL_HIGH (80) /* - * For increasead precision in shannon_entropy calculation, + * For increased precision in shannon_entropy calculation, * let's do pow(n, M) to save more digits after comma: * * - maximum int bit length is 64 @@ -1542,7 +1594,8 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, */ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end) { - struct list_head *ws_list = get_workspace(0, 0); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct list_head *ws_list = get_workspace(fs_info, 0, 0); struct heuristic_ws *ws; u32 i; u8 byte; @@ -1611,30 +1664,34 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end) } out: - put_workspace(0, ws_list); + put_workspace(fs_info, 0, ws_list); return ret; } /* - * Convert the compression suffix (eg. after "zlib" starting with ":") to - * level, unrecognized string will set the default level. Negative level - * numbers are allowed. + * Convert the compression suffix (eg. after "zlib" starting with ":") to level. + * + * If the resulting level exceeds the algo's supported levels, it will be clamped. 
+ * + * Return <0 if no valid string can be found. + * Return 0 if everything is fine. */ -int btrfs_compress_str2level(unsigned int type, const char *str) +int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret) { int level = 0; int ret; - if (!type) + if (!type) { + *level_ret = btrfs_compress_set_level(type, level); return 0; + } if (str[0] == ':') { ret = kstrtoint(str + 1, 10, &level); if (ret) - level = 0; + return ret; } - level = btrfs_compress_set_level(type, level); - - return level; + *level_ret = btrfs_compress_set_level(type, level); + return 0; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 1b38e707bbd9..eba188a9e3bb 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -75,6 +75,11 @@ struct compressed_bio { struct btrfs_bio bbio; }; +static inline struct btrfs_fs_info *cb_to_fs_info(const struct compressed_bio *cb) +{ + return cb->bbio.fs_info; +} + /* @range_end must be exclusive. */ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur) { @@ -84,11 +89,14 @@ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u6 return min(range_end, folio_end(folio)) - cur; } +int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info); +void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info); + int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); bool btrfs_compress_level_valid(unsigned int type, int level); -int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, +int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, @@ -102,21 +110,11 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); -int btrfs_compress_str2level(unsigned int type, const char *str); +int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret); -struct folio *btrfs_alloc_compr_folio(void); +struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info); void btrfs_free_compr_folio(struct folio *folio); -enum btrfs_compression_type { - BTRFS_COMPRESS_NONE = 0, - BTRFS_COMPRESS_ZLIB = 1, - BTRFS_COMPRESS_LZO = 2, - BTRFS_COMPRESS_ZSTD = 3, - BTRFS_NR_COMPRESS_TYPES = 4, - - BTRFS_DEFRAG_DONT_COMPRESS, -}; - struct workspace_manager { struct list_head idle_ws; spinlock_t ws_lock; @@ -128,11 +126,10 @@ struct workspace_manager { wait_queue_head_t ws_wait; }; -struct list_head *btrfs_get_workspace(int type, int level); -void btrfs_put_workspace(int type, struct list_head *ws); +struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level); +void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws); -struct btrfs_compress_op { - struct workspace_manager *workspace_manager; +struct btrfs_compress_levels { /* Maximum level supported by the compression algorithm */ int min_level; int max_level; @@ -142,10 +139,10 @@ struct btrfs_compress_op { /* The heuristic workspaces are managed via the 0th workspace manager */ #define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES -extern const struct btrfs_compress_op btrfs_heuristic_compress; -extern const struct btrfs_compress_op btrfs_zlib_compress; -extern const struct btrfs_compress_op btrfs_lzo_compress; 
-extern const struct btrfs_compress_op btrfs_zstd_compress; +extern const struct btrfs_compress_levels btrfs_heuristic_compress; +extern const struct btrfs_compress_levels btrfs_zlib_compress; +extern const struct btrfs_compress_levels btrfs_lzo_compress; +extern const struct btrfs_compress_levels btrfs_zstd_compress; const char* btrfs_compress_type2str(enum btrfs_compression_type type); bool btrfs_compress_is_valid_type(const char *str, size_t len); @@ -155,39 +152,39 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end); int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, struct folio **in_folio_ret); -int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, +int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -struct list_head *zlib_alloc_workspace(unsigned int level); +struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level); void zlib_free_workspace(struct list_head *ws); -struct list_head *zlib_get_workspace(unsigned int level); +struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level); -int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, +int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -struct list_head *lzo_alloc_workspace(void); +struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info); void lzo_free_workspace(struct list_head *ws); -int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, +int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -void zstd_init_workspace_manager(void); -void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(int level); +int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info); +void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info); +struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level); void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(int level); -void zstd_put_workspace(struct list_head *ws); +struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level); +void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 74e6d7f3d266..561658aca018 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -30,10 +30,10 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path 
*path, int level); static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *ins_key, struct btrfs_path *path, - int data_size, int extend); + int data_size, bool extend); static int push_node_left(struct btrfs_trans_handle *trans, struct extent_buffer *dst, - struct extent_buffer *src, int empty); + struct extent_buffer *src, bool empty); static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); @@ -293,11 +293,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_inc_ref(trans, root, cow, 1); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); } else { ret = btrfs_inc_ref(trans, root, cow, 0); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); } if (ret) { @@ -536,14 +536,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -556,7 +556,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, parent_start = buf->start; ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -567,7 +567,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, parent_start, last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -575,7 +575,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, WARN_ON(trans->transid != btrfs_header_generation(parent)); ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -586,14 +586,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(trans, parent); if (last_ref) { ret = btrfs_tree_mod_log_free_eb(buf); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } } ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, parent_start, last_ref); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -613,15 +613,12 @@ error_unlock_cow: return ret; } -static inline int should_cow_block(const struct btrfs_trans_handle *trans, - const struct btrfs_root *root, - const struct extent_buffer *buf) +static inline bool should_cow_block(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf) { if (btrfs_is_testing(root->fs_info)) - return 0; - - /* Ensure we can see the FORCE_COW bit */ - smp_mb__before_atomic(); + return false; /* * We do not need to cow a block if @@ -634,13 +631,25 @@ static inline int should_cow_block(const struct btrfs_trans_handle *trans, * after we've finished copying src root, we must COW the shared * block to ensure the metadata consistency. 
*/ - if (btrfs_header_generation(buf) == trans->transid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && - !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && - !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) - return 0; - return 1; + + if (btrfs_header_generation(buf) != trans->transid) + return true; + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) + return true; + + /* Ensure we can see the FORCE_COW bit. */ + smp_mb__before_atomic(); + if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) + return true; + + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + return false; + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) + return true; + + return false; } /* @@ -844,7 +853,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, &check); if (IS_ERR(eb)) return eb; - if (!extent_buffer_uptodate(eb)) { + if (unlikely(!extent_buffer_uptodate(eb))) { free_extent_buffer(eb); return ERR_PTR(-EIO); } @@ -913,7 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } ret = btrfs_tree_mod_log_insert_root(root->node, child, true); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_tree_unlock(child); free_extent_buffer(child); btrfs_abort_transaction(trans, ret); @@ -935,7 +944,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1010,7 +1019,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right, 0, 1); free_extent_buffer_stale(right); right = NULL; - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1019,7 +1028,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(right, &right_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1071,7 +1080,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1081,7 +1090,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &mid_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1186,7 +1195,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_tree_unlock(left); free_extent_buffer(left); btrfs_abort_transaction(trans, ret); @@ -1246,7 +1255,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_tree_unlock(right); free_extent_buffer(right); btrfs_abort_transaction(trans, ret); @@ -1484,13 +1493,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, 
reada_for_search(fs_info, p, parent_level, slot, key->objectid); /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) { + if (btrfs_buffer_uptodate(tmp, check.transid, true) > 0) { /* * Do extra check for first_key, eb can be stale due to * being cached, read from scrub, or have multiple * parents (shared tree blocks). */ - if (btrfs_verify_level_key(tmp, &check)) { + if (unlikely(btrfs_verify_level_key(tmp, &check))) { ret = -EUCLEAN; goto out; } @@ -1571,7 +1580,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!extent_buffer_uptodate(tmp)) { + if (unlikely(!extent_buffer_uptodate(tmp))) { ret = -EIO; goto out; } @@ -1752,7 +1761,7 @@ out: * The root may have failed to write out at some point, and thus is no * longer valid, return an error in this case. */ - if (!extent_buffer_uptodate(b)) { + if (unlikely(!extent_buffer_uptodate(b))) { if (root_lock) btrfs_tree_unlock_rw(b, root_lock); free_extent_buffer(b); @@ -2260,7 +2269,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, again: b = btrfs_get_old_root(root, time_seq); - if (!b) { + if (unlikely(!b)) { ret = -EIO; goto done; } @@ -2686,7 +2695,7 @@ static bool check_sibling_keys(const struct extent_buffer *left, */ static int push_node_left(struct btrfs_trans_handle *trans, struct extent_buffer *dst, - struct extent_buffer *src, int empty) + struct extent_buffer *src, bool empty) { struct btrfs_fs_info *fs_info = trans->fs_info; int push_items = 0; @@ -2722,13 +2731,13 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items = min(src_nritems - 8, push_items); /* dst is the left eb, src is the middle eb */ - if (check_sibling_keys(dst, src)) { + if (unlikely(check_sibling_keys(dst, src))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2796,7 +2805,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, push_items = max_push; /* dst is the right eb, src is the middle eb */ - if (check_sibling_keys(src, dst)) { + if (unlikely(check_sibling_keys(src, dst))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); return ret; @@ -2813,7 +2822,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, push_items); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2883,7 +2892,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_clear_buffer_dirty(trans, c); ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); - if (ret2 < 0) + if (unlikely(ret2 < 0)) btrfs_abort_transaction(trans, ret2); btrfs_tree_unlock(c); free_extent_buffer(c); @@ -2928,7 +2937,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, slot, nritems - slot); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2941,7 +2950,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot, BTRFS_MOD_LOG_KEY_ADD); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -3017,7 +3026,7 @@ static noinline int 
split_node(struct btrfs_trans_handle *trans, ASSERT(btrfs_header_level(c) == level); ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); - if (ret) { + if (unlikely(ret)) { btrfs_tree_unlock(split); free_extent_buffer(split); btrfs_abort_transaction(trans, ret); @@ -3086,7 +3095,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf) int ret; ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_crit(fs_info, "leaf free space ret %d, leaf data size %lu, used %d nritems %d", ret, @@ -3102,7 +3111,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf) */ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_path *path, - int data_size, int empty, + int data_size, bool empty, struct extent_buffer *right, int free_space, u32 left_nritems, u32 min_slot) @@ -3239,7 +3248,7 @@ out_unlock: static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int min_data_size, int data_size, - int empty, u32 min_slot) + bool empty, u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *right; @@ -3278,7 +3287,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; - if (check_sibling_keys(left, right)) { + if (unlikely(check_sibling_keys(left, right))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); btrfs_tree_unlock(right); @@ -3316,7 +3325,7 @@ out_unlock: */ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_path *path, int data_size, - int empty, struct extent_buffer *left, + bool empty, struct extent_buffer *left, int free_space, u32 right_nritems, u32 max_slot) { @@ -3494,7 +3503,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - if (check_sibling_keys(left, right)) { + if (unlikely(check_sibling_keys(left, right))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); goto out; @@ -3642,7 +3651,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *ins_key, struct btrfs_path *path, int data_size, - int extend) + bool extend) { struct btrfs_disk_key disk_key; struct extent_buffer *l; @@ -4075,7 +4084,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, btrfs_set_item_size(leaf, slot, new_size); btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(leaf) < 0) { + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { btrfs_print_leaf(leaf); BUG(); } @@ -4108,7 +4117,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, old_data = btrfs_item_data_end(leaf, slot); BUG_ON(slot < 0); - if (slot >= nritems) { + if (unlikely(slot >= nritems)) { btrfs_print_leaf(leaf); btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", slot, nritems); @@ -4135,7 +4144,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, btrfs_set_item_size(leaf, slot, old_size + data_size); btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(leaf) < 0) { + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { btrfs_print_leaf(leaf); BUG(); } @@ -4183,7 +4192,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, data_end = leaf_data_end(leaf); total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); - if (btrfs_leaf_free_space(leaf) < total_size) { + if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) { 
btrfs_print_leaf(leaf); btrfs_crit(fs_info, "not enough freespace need %u have %d", total_size, btrfs_leaf_free_space(leaf)); @@ -4193,7 +4202,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, if (slot != nritems) { unsigned int old_data = btrfs_item_data_end(leaf, slot); - if (old_data < data_end) { + if (unlikely(old_data < data_end)) { btrfs_print_leaf(leaf); btrfs_crit(fs_info, "item at slot %d with data offset %u beyond data end of leaf %u", @@ -4232,7 +4241,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(leaf, nritems + batch->nr); btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(leaf) < 0) { + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { btrfs_print_leaf(leaf); BUG(); } @@ -4374,7 +4383,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (level) { ret = btrfs_tree_mod_log_insert_move(parent, slot, slot + 1, nritems - slot - 1); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -4387,7 +4396,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot, BTRFS_MOD_LOG_KEY_REMOVE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 738179a5e170..7b277934f66f 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -153,7 +153,7 @@ void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh) } /* - * Pick the defragable inode that we want, if it doesn't exist, we will get the + * Pick the defraggable inode that we want, if it doesn't exist, we will get the * next one. */ static struct inode_defrag *btrfs_pick_defrag_inode( @@ -924,7 +924,7 @@ again: folio_put(folio); goto again; } - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-EIO); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0f8d8e275143..41e37f7f67cc 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -57,6 +57,7 @@ static inline void btrfs_init_delayed_node( delayed_node->root = root; delayed_node->inode_id = inode_id; refcount_set(&delayed_node->refs, 0); + btrfs_delayed_node_ref_tracker_dir_init(delayed_node); delayed_node->ins_root = RB_ROOT_CACHED; delayed_node->del_root = RB_ROOT_CACHED; mutex_init(&delayed_node->mutex); @@ -65,7 +66,8 @@ static inline void btrfs_init_delayed_node( } static struct btrfs_delayed_node *btrfs_get_delayed_node( - struct btrfs_inode *btrfs_inode) + struct btrfs_inode *btrfs_inode, + struct btrfs_ref_tracker *tracker) { struct btrfs_root *root = btrfs_inode->root; u64 ino = btrfs_ino(btrfs_inode); @@ -74,6 +76,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = READ_ONCE(btrfs_inode->delayed_node); if (node) { refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_NOFS); return node; } @@ -83,6 +86,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( if (node) { if (btrfs_inode->delayed_node) { refcount_inc(&node->refs); /* can be accessed */ + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); BUG_ON(btrfs_inode->delayed_node != node); xa_unlock(&root->delayed_nodes); return node; @@ -106,6 +110,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( */ if (refcount_inc_not_zero(&node->refs)) { 
refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); + btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, + GFP_ATOMIC); btrfs_inode->delayed_node = node; } else { node = NULL; @@ -126,7 +133,8 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * Return the delayed node, or error pointer on failure. */ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( - struct btrfs_inode *btrfs_inode) + struct btrfs_inode *btrfs_inode, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; struct btrfs_root *root = btrfs_inode->root; @@ -135,7 +143,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( void *ptr; again: - node = btrfs_get_delayed_node(btrfs_inode); + node = btrfs_get_delayed_node(btrfs_inode, tracker); if (node) return node; @@ -144,12 +152,10 @@ again: return ERR_PTR(-ENOMEM); btrfs_init_delayed_node(node, root, ino); - /* Cached in the inode and can be accessed. */ - refcount_set(&node->refs, 2); - /* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */ ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS); if (ret == -ENOMEM) { + btrfs_delayed_node_ref_tracker_dir_exit(node); kmem_cache_free(delayed_node_cache, node); return ERR_PTR(-ENOMEM); } @@ -158,6 +164,7 @@ again: if (ptr) { /* Somebody inserted it, go back and read it. */ xa_unlock(&root->delayed_nodes); + btrfs_delayed_node_ref_tracker_dir_exit(node); kmem_cache_free(delayed_node_cache, node); node = NULL; goto again; @@ -166,6 +173,12 @@ again: ASSERT(xa_err(ptr) != -EINVAL); ASSERT(xa_err(ptr) != -ENOMEM); ASSERT(ptr == NULL); + + /* Cached in the inode and can be accessed. */ + refcount_set(&node->refs, 2); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); + btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, GFP_ATOMIC); + btrfs_inode->delayed_node = node; xa_unlock(&root->delayed_nodes); @@ -191,6 +204,8 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, list_add_tail(&node->n_list, &root->node_list); list_add_tail(&node->p_list, &root->prepare_list); refcount_inc(&node->refs); /* inserted into list */ + btrfs_delayed_node_ref_tracker_alloc(node, &node->node_list_tracker, + GFP_ATOMIC); root->nodes++; set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } @@ -204,6 +219,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, spin_lock(&root->lock); if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { root->nodes--; + btrfs_delayed_node_ref_tracker_free(node, &node->node_list_tracker); refcount_dec(&node->refs); /* not in the list */ list_del_init(&node->n_list); if (!list_empty(&node->p_list)) @@ -214,22 +230,26 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, } static struct btrfs_delayed_node *btrfs_first_delayed_node( - struct btrfs_delayed_root *delayed_root) + struct btrfs_delayed_root *delayed_root, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; spin_lock(&delayed_root->lock); node = list_first_entry_or_null(&delayed_root->node_list, struct btrfs_delayed_node, n_list); - if (node) + if (node) { refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); + } spin_unlock(&delayed_root->lock); return node; } static struct btrfs_delayed_node *btrfs_next_delayed_node( - struct btrfs_delayed_node *node) + struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_root 
*delayed_root; struct list_head *p; @@ -249,6 +269,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( next = list_entry(p, struct btrfs_delayed_node, n_list); refcount_inc(&next->refs); + btrfs_delayed_node_ref_tracker_alloc(next, tracker, GFP_ATOMIC); out: spin_unlock(&delayed_root->lock); @@ -257,7 +278,7 @@ out: static void __btrfs_release_delayed_node( struct btrfs_delayed_node *delayed_node, - int mod) + int mod, struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_root *delayed_root; @@ -273,6 +294,7 @@ static void __btrfs_release_delayed_node( btrfs_dequeue_delayed_node(delayed_root, delayed_node); mutex_unlock(&delayed_node->mutex); + btrfs_delayed_node_ref_tracker_free(delayed_node, tracker); if (refcount_dec_and_test(&delayed_node->refs)) { struct btrfs_root *root = delayed_node->root; @@ -282,17 +304,20 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. */ ASSERT(refcount_read(&delayed_node->refs) == 0); + btrfs_delayed_node_ref_tracker_dir_exit(delayed_node); kmem_cache_free(delayed_node_cache, delayed_node); } } -static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) +static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) { - __btrfs_release_delayed_node(node, 0); + __btrfs_release_delayed_node(node, 0, tracker); } static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( - struct btrfs_delayed_root *delayed_root) + struct btrfs_delayed_root *delayed_root, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; @@ -302,6 +327,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( if (node) { list_del_init(&node->p_list); refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); } spin_unlock(&delayed_root->lock); @@ -309,9 +335,10 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( } static inline void btrfs_release_prepared_delayed_node( - struct btrfs_delayed_node *node) + struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) { - __btrfs_release_delayed_node(node, 1); + __btrfs_release_delayed_node(node, 1, tracker); } static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, @@ -711,8 +738,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, u32 *ins_sizes; int i = 0; - ins_data = kmalloc(batch.nr * sizeof(u32) + - batch.nr * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc_array(batch.nr, + sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) { ret = -ENOMEM; goto out; @@ -1011,7 +1038,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, * transaction, because we could leave the inode with the * improper counts behind. 
*/ - if (ret != -ENOENT) + if (unlikely(ret != -ENOENT)) btrfs_abort_transaction(trans, ret); goto out; } @@ -1039,7 +1066,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto err_out; } @@ -1126,6 +1153,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; + struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret = 0; @@ -1143,17 +1171,18 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) delayed_root = fs_info->delayed_root; - curr_node = btrfs_first_delayed_node(delayed_root); + curr_node = btrfs_first_delayed_node(delayed_root, &curr_delayed_node_tracker); while (curr_node && (!count || nr--)) { ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } prev_node = curr_node; - curr_node = btrfs_next_delayed_node(curr_node); + prev_delayed_node_tracker = curr_delayed_node_tracker; + curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker); /* * See the comment below about releasing path before releasing * node. If the commit of delayed items was successful the path @@ -1161,7 +1190,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) * point to locked extent buffers (a leaf at the very least). */ ASSERT(path->nodes[0] == NULL); - btrfs_release_delayed_node(prev_node); + btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker); } /* @@ -1174,7 +1203,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) btrfs_free_path(path); if (curr_node) - btrfs_release_delayed_node(curr_node); + btrfs_release_delayed_node(curr_node, &curr_delayed_node_tracker); trans->block_rsv = block_rsv; return ret; @@ -1193,7 +1222,9 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr) int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *delayed_node = + btrfs_get_delayed_node(inode, &delayed_node_tracker); BTRFS_PATH_AUTO_FREE(path); struct btrfs_block_rsv *block_rsv; int ret; @@ -1204,14 +1235,14 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); if (!delayed_node->count) { mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } mutex_unlock(&delayed_node->mutex); path = btrfs_alloc_path(); if (!path) { - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return -ENOMEM; } @@ -1220,7 +1251,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); trans->block_rsv = block_rsv; return ret; @@ -1230,18 +1261,20 @@ int 
btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_trans_handle *trans; - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *delayed_node; struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret; + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return 0; mutex_lock(&delayed_node->mutex); if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } mutex_unlock(&delayed_node->mutex); @@ -1275,7 +1308,7 @@ trans_out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return ret; } @@ -1289,7 +1322,8 @@ void btrfs_remove_delayed_node(struct btrfs_inode *inode) return; inode->delayed_node = NULL; - btrfs_release_delayed_node(delayed_node); + + btrfs_release_delayed_node(delayed_node, &delayed_node->inode_cache_tracker); } struct btrfs_async_delayed_work { @@ -1305,6 +1339,7 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) struct btrfs_trans_handle *trans; struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; + struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; int total_done = 0; @@ -1321,7 +1356,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) BTRFS_DELAYED_BACKGROUND / 2) break; - delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + delayed_node = btrfs_first_prepared_delayed_node(delayed_root, + &delayed_node_tracker); if (!delayed_node) break; @@ -1330,7 +1366,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_release_path(path); - btrfs_release_prepared_delayed_node(delayed_node); + btrfs_release_prepared_delayed_node(delayed_node, + &delayed_node_tracker); total_done++; continue; } @@ -1345,7 +1382,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) btrfs_btree_balance_dirty_nodelay(root->fs_info); btrfs_release_path(path); - btrfs_release_prepared_delayed_node(delayed_node); + btrfs_release_prepared_delayed_node(delayed_node, + &delayed_node_tracker); total_done++; } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) @@ -1377,10 +1415,15 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) { - struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *node; - if (WARN_ON(node)) + node = btrfs_first_delayed_node(fs_info->delayed_root, &delayed_node_tracker); + if (WARN_ON(node)) { + btrfs_delayed_node_ref_tracker_free(node, + &delayed_node_tracker); refcount_dec(&node->refs); + } } static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) @@ -1454,13 +1497,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_delayed_node *delayed_node; + struct
btrfs_ref_tracker delayed_node_tracker; struct btrfs_delayed_item *delayed_item; struct btrfs_dir_item *dir_item; bool reserve_leaf_space; u32 data_len; int ret; - delayed_node = btrfs_get_or_create_delayed_node(dir); + delayed_node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); @@ -1536,7 +1580,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_unlock(&delayed_node->mutex); release_node: - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return ret; } @@ -1591,10 +1635,11 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, u64 index) { struct btrfs_delayed_node *node; + struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_delayed_item *item; int ret; - node = btrfs_get_or_create_delayed_node(dir); + node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker); if (IS_ERR(node)) return PTR_ERR(node); @@ -1635,14 +1680,16 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, } mutex_unlock(&node->mutex); end: - btrfs_release_delayed_node(node); + btrfs_release_delayed_node(node, &delayed_node_tracker); return ret; } int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) { - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *delayed_node; + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return -ENOENT; @@ -1652,12 +1699,12 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) * is updated now. So we needn't lock the delayed node. */ if (!delayed_node->index_cnt) { - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return -EINVAL; } inode->index_cnt = delayed_node->index_cnt; - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } @@ -1668,8 +1715,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, { struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *item; + struct btrfs_ref_tracker delayed_node_tracker; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return false; @@ -1704,6 +1752,7 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, * insert/delete delayed items in this period. So we also needn't * requeue or dequeue this delayed node. 
*/ + btrfs_delayed_node_ref_tracker_free(delayed_node, &delayed_node_tracker); refcount_dec(&delayed_node->refs); return true; @@ -1843,19 +1892,19 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_inode_item *inode_item; struct inode *vfs_inode = &inode->vfs_inode; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return -ENOENT; mutex_lock(&delayed_node->mutex); if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return -ENOENT; } @@ -1864,8 +1913,6 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item)); i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item)); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); - btrfs_inode_set_file_extent_range(inode, 0, - round_up(i_size_read(vfs_inode), fs_info->sectorsize)); vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item); set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item)); @@ -1895,7 +1942,7 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) inode->index_cnt = (u64)-1; mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } @@ -1904,9 +1951,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; int ret = 0; - delayed_node = btrfs_get_or_create_delayed_node(inode); + delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); @@ -1926,7 +1974,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, atomic_inc(&root->fs_info->delayed_root->items); release_node: mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return ret; } @@ -1934,6 +1982,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; /* * we don't do delayed inode updates during log recovery because it @@ -1943,7 +1992,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) return -EAGAIN; - delayed_node = btrfs_get_or_create_delayed_node(inode); + delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); @@ -1970,7 +2019,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) atomic_inc(&fs_info->delayed_root->items); release_node: mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } @@ -2014,19 +2063,21 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) void 
btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return; __btrfs_kill_delayed_node(delayed_node); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); } void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { unsigned long index = 0; struct btrfs_delayed_node *delayed_nodes[8]; + struct btrfs_ref_tracker delayed_node_trackers[8]; while (1) { struct btrfs_delayed_node *node; @@ -2045,6 +2096,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) * about to be removed from the tree in the loop below */ if (refcount_inc_not_zero(&node->refs)) { + btrfs_delayed_node_ref_tracker_alloc(node, + &delayed_node_trackers[count], + GFP_ATOMIC); delayed_nodes[count] = node; count++; } @@ -2056,7 +2110,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) for (int i = 0; i < count; i++) { __btrfs_kill_delayed_node(delayed_nodes[i]); - btrfs_release_delayed_node(delayed_nodes[i]); + btrfs_release_delayed_node(delayed_nodes[i], + &delayed_node_trackers[i]); + btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]); } } } @@ -2064,14 +2120,17 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info) { struct btrfs_delayed_node *curr_node, *prev_node; + struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; - curr_node = btrfs_first_delayed_node(fs_info->delayed_root); + curr_node = btrfs_first_delayed_node(fs_info->delayed_root, + &curr_delayed_node_tracker); while (curr_node) { __btrfs_kill_delayed_node(curr_node); prev_node = curr_node; - curr_node = btrfs_next_delayed_node(curr_node); - btrfs_release_delayed_node(prev_node); + prev_delayed_node_tracker = curr_delayed_node_tracker; + curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker); + btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker); } } @@ -2081,8 +2140,9 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode, { struct btrfs_delayed_node *node; struct btrfs_delayed_item *item; + struct btrfs_ref_tracker delayed_node_tracker; - node = btrfs_get_delayed_node(inode); + node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!node) return; @@ -2140,6 +2200,7 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode, * delete delayed items. */ ASSERT(refcount_read(&node->refs) > 1); + btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker); refcount_dec(&node->refs); } @@ -2150,8 +2211,9 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode, struct btrfs_delayed_node *node; struct btrfs_delayed_item *item; struct btrfs_delayed_item *next; + struct btrfs_ref_tracker delayed_node_tracker; - node = btrfs_get_delayed_node(inode); + node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!node) return; @@ -2183,5 +2245,6 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode, * delete delayed items. 
*/ ASSERT(refcount_read(&node->refs) > 1); + btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker); refcount_dec(&node->refs); } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index e6e763ad2d42..0d949edc0caf 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -16,6 +16,7 @@ #include <linux/fs.h> #include <linux/atomic.h> #include <linux/refcount.h> +#include <linux/ref_tracker.h> #include "ctree.h" struct btrfs_disk_key; @@ -44,6 +45,22 @@ struct btrfs_delayed_root { wait_queue_head_t wait; }; +struct btrfs_ref_tracker_dir { +#ifdef CONFIG_BTRFS_DEBUG + struct ref_tracker_dir dir; +#else + struct {} tracker; +#endif +}; + +struct btrfs_ref_tracker { +#ifdef CONFIG_BTRFS_DEBUG + struct ref_tracker *tracker; +#else + struct {} tracker; +#endif +}; + #define BTRFS_DELAYED_NODE_IN_LIST 0 #define BTRFS_DELAYED_NODE_INODE_DIRTY 1 #define BTRFS_DELAYED_NODE_DEL_IREF 2 @@ -78,6 +95,12 @@ struct btrfs_delayed_node { * actual number of leaves we end up using. Protected by @mutex. */ u32 index_item_leaves; + /* Track all references to this delayed node. */ + struct btrfs_ref_tracker_dir ref_dir; + /* Track delayed node reference stored in node list. */ + struct btrfs_ref_tracker node_list_tracker; + /* Track delayed node reference stored in inode cache. */ + struct btrfs_ref_tracker inode_cache_tracker; }; struct btrfs_delayed_item { @@ -169,4 +192,74 @@ void __cold btrfs_delayed_inode_exit(void); /* for debugging */ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info); +#define BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT 16 +#define BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT 16 + +#ifdef CONFIG_BTRFS_DEBUG +static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return; + + ref_tracker_dir_init(&node->ref_dir.dir, + BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT, + "delayed_node"); +} + +static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return; + + ref_tracker_dir_exit(&node->ref_dir.dir); +} + +static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return; + + ref_tracker_dir_print(&node->ref_dir.dir, + BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT); +} + +static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker, + gfp_t gfp) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return 0; + + return ref_tracker_alloc(&node->ref_dir.dir, &tracker->tracker, gfp); +} + +static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return 0; + + return ref_tracker_free(&node->ref_dir.dir, &tracker->tracker); +} +#else +static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node) { } + +static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) { } + +static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node) { } + +static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker, + gfp_t gfp) +{ + return 0; +} + +static inline int btrfs_delayed_node_ref_tracker_free(struct 
btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) +{ + return 0; +} +#endif + #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ca382c5b186f..481802efaa14 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -895,7 +895,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } /* - * Initialize the structure which represents a modification to a an extent. + * Initialize the structure which represents a modification to an extent. * * @fs_info: Internal to the mounted filesystem mount structure. * @@ -952,7 +952,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, bool skip_qgroup) { -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif @@ -969,7 +969,7 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, u64 mod_root, bool skip_qgroup) { -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif @@ -1251,7 +1251,6 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) { struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; - bool testing = btrfs_is_testing(fs_info); spin_lock(&delayed_refs->lock); while (true) { @@ -1281,7 +1280,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); - if (!testing && pin_bytes) { + if (!btrfs_is_testing(fs_info) && pin_bytes) { struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, head->bytenr); @@ -1312,14 +1311,14 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) btrfs_error_unpin_extent_range(fs_info, head->bytenr, head->bytenr + head->num_bytes - 1); } - if (!testing) + if (!btrfs_is_testing(fs_info)) btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } - if (!testing) + if (!btrfs_is_testing(fs_info)) btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 552ec4fa645d..5ce940532144 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -276,10 +276,6 @@ struct btrfs_ref { */ bool skip_qgroup; -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - /* Through which root is this modification. */ - u64 real_root; -#endif u64 bytenr; u64 num_bytes; u64 owning_root; @@ -296,6 +292,11 @@ struct btrfs_ref { struct btrfs_data_ref data_ref; struct btrfs_tree_ref tree_ref; }; + +#ifdef CONFIG_BTRFS_DEBUG + /* Through which root is this modification. */ + u64 real_root; +#endif }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 4675bcd5f92e..a4eaef60549e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -98,7 +98,7 @@ no_valid_dev_replace_entry_found: * We don't have a replace item or it's corrupted. If there is * a replace target, fail the mount. 
*/ - if (btrfs_find_device(fs_info->fs_devices, &args)) { + if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) { btrfs_err(fs_info, "found replace target device without a valid replace item"); return -EUCLEAN; @@ -158,7 +158,7 @@ no_valid_dev_replace_entry_found: * We don't have an active replace item but if there is a * replace target, fail the mount. */ - if (btrfs_find_device(fs_info->fs_devices, &args)) { + if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) { btrfs_err(fs_info, "replace without active item, run 'device scan --forget' on the target device"); ret = -EUCLEAN; @@ -177,8 +177,7 @@ no_valid_dev_replace_entry_found: * allow 'btrfs dev replace_cancel' if src/tgt device is * missing */ - if (!dev_replace->srcdev && - !btrfs_test_opt(fs_info, DEGRADED)) { + if (unlikely(!dev_replace->srcdev && !btrfs_test_opt(fs_info, DEGRADED))) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -186,8 +185,7 @@ no_valid_dev_replace_entry_found: "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", src_devid); } - if (!dev_replace->tgtdev && - !btrfs_test_opt(fs_info, DEGRADED)) { + if (unlikely(!dev_replace->tgtdev && !btrfs_test_opt(fs_info, DEGRADED))) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -637,7 +635,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - DEBUG_WARN("unexpected STARTED ot SUSPENDED dev-replace state"); + DEBUG_WARN("unexpected STARTED or SUSPENDED dev-replace state"); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; up_write(&dev_replace->rwsem); goto leave; diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index fe9a4bd7e6e6..802d4dbe5b38 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -786,6 +786,18 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, if (iov_iter_alignment(iter) & blocksize_mask) return -EINVAL; + /* + * For bs > ps support, we heavily rely on large folios to make sure no + * block will cross large folio boundaries. + * + * But memory provided by direct IO is only virtually contiguous, not + * physically contiguous, and will break btrfs' large folio requirement. + * + * So for bs > ps support, all direct IOs should fall back to buffered ones. + */ + if (fs_info->sectorsize > PAGE_SIZE) + return -EINVAL; + return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 70fc4e7cc5a0..9247a58894de 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -116,7 +116,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) * detect blocks that either didn't get written at all or got written * in the wrong place.
*/ -int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic) +int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, bool atomic) { if (!extent_buffer_uptodate(eb)) return 0; @@ -370,21 +370,21 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, ASSERT(check); found_start = btrfs_header_bytenr(eb); - if (found_start != eb->start) { + if (unlikely(found_start != eb->start)) { btrfs_err_rl(fs_info, "bad tree block start, mirror %u want %llu have %llu", eb->read_mirror, eb->start, found_start); ret = -EIO; goto out; } - if (check_tree_block_fsid(eb)) { + if (unlikely(check_tree_block_fsid(eb))) { btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", eb->start, eb->read_mirror); ret = -EIO; goto out; } found_level = btrfs_header_level(eb); - if (found_level >= BTRFS_MAX_LEVEL) { + if (unlikely(found_level >= BTRFS_MAX_LEVEL)) { btrfs_err(fs_info, "bad tree block level, mirror %u level %d on logical %llu", eb->read_mirror, btrfs_header_level(eb), eb->start); @@ -404,13 +404,13 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb), ignore_csum ? ", ignored" : ""); - if (!ignore_csum) { + if (unlikely(!ignore_csum)) { ret = -EUCLEAN; goto out; } } - if (found_level != check->level) { + if (unlikely(found_level != check->level)) { btrfs_err(fs_info, "level verify failed on logical %llu mirror %u wanted %u found %u", eb->start, eb->read_mirror, check->level, found_level); @@ -639,7 +639,6 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, u64 objectid, gfp_t flags) { struct btrfs_root *root; - bool dummy = btrfs_is_testing(fs_info); root = kzalloc(sizeof(*root), flags); if (!root) @@ -696,7 +695,7 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, root->log_transid_committed = -1; btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; - if (!dummy) { + if (!btrfs_is_testing(fs_info)) { btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages, IO_TREE_ROOT_DIRTY_LOG_PAGES); btrfs_extent_io_tree_init(fs_info, &root->log_csum_range, @@ -1047,7 +1046,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, root->node = NULL; goto fail; } - if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + if (unlikely(!btrfs_buffer_uptodate(root->node, generation, false))) { ret = -EIO; goto fail; } @@ -1056,10 +1055,10 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, * For real fs, and not log/reloc trees, root owner must * match its root node owner */ - if (!btrfs_is_testing(fs_info) && - btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && - btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && - btrfs_root_id(root) != btrfs_header_owner(root->node)) { + if (unlikely(!btrfs_is_testing(fs_info) && + btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && + btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && + btrfs_root_id(root) != btrfs_header_owner(root->node))) { btrfs_crit(fs_info, "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", btrfs_root_id(root), root->node->start, @@ -1248,6 +1247,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) if (fs_info->fs_devices) btrfs_close_devices(fs_info->fs_devices); + btrfs_free_compress_wsm(fs_info); percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); @@ -1958,7 +1958,7 @@ static int 
btrfs_init_workqueues(struct btrfs_fs_info *fs_info) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; - unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE; + unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU; fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); @@ -2058,7 +2058,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, u64 bytenr = btrfs_super_log_root(disk_super); int level = btrfs_super_log_root_level(disk_super); - if (fs_devices->rw_devices == 0) { + if (unlikely(fs_devices->rw_devices == 0)) { btrfs_warn(fs_info, "log replay required on RO media"); return -EIO; } @@ -2079,7 +2079,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, btrfs_put_root(log_tree_root); return ret; } - if (!extent_buffer_uptodate(log_tree_root->node)) { + if (unlikely(!extent_buffer_uptodate(log_tree_root->node))) { btrfs_err(fs_info, "failed to read log tree"); btrfs_put_root(log_tree_root); return -EIO; @@ -2087,10 +2087,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); + btrfs_put_root(log_tree_root); if (ret) { btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); - btrfs_put_root(log_tree_root); return ret; } @@ -2324,7 +2324,7 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, const u32 sectorsize = btrfs_super_sectorsize(sb); u32 sys_array_size = btrfs_super_sys_array_size(sb); - if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + if (unlikely(sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)) { btrfs_err(fs_info, "system chunk array too big %u > %u", sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); return -EUCLEAN; @@ -2342,12 +2342,12 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); len = sizeof(*disk_key); - if (cur + len > sys_array_size) + if (unlikely(cur + len > sys_array_size)) goto short_read; cur += len; btrfs_disk_key_to_cpu(&key, disk_key); - if (key.type != BTRFS_CHUNK_ITEM_KEY) { + if (unlikely(key.type != BTRFS_CHUNK_ITEM_KEY)) { btrfs_err(fs_info, "unexpected item type %u in sys_array at offset %u", key.type, cur); @@ -2355,10 +2355,10 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, } chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); num_stripes = btrfs_stack_chunk_num_stripes(chunk); - if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size) + if (unlikely(cur + btrfs_chunk_item_size(num_stripes) > sys_array_size)) goto short_read; type = btrfs_stack_chunk_type(chunk); - if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + if (unlikely(!(type & BTRFS_BLOCK_GROUP_SYSTEM))) { btrfs_err(fs_info, "invalid chunk type %llu in sys_array at offset %u", type, cur); @@ -2438,21 +2438,7 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } - /* - * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE. - * - * For 4K page sized systems with non-debug builds, all 3 matches (4K). - * For 4K page sized systems with debug builds, there are two block sizes - * supported. (4K and 2K) - * - * We can support 16K sectorsize with 64K page size without problem, - * but such sectorsize/pagesize combination doesn't make much sense. - * 4K will be our future standard, PAGE_SIZE is supported from the very - * beginning. 
- */ - if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && - sectorsize != PAGE_SIZE && - sectorsize != BTRFS_MIN_BLOCKSIZE)) { + if (!btrfs_supported_blocksize(sectorsize)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -2619,13 +2605,13 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, ret = btrfs_validate_super(fs_info, sb, -1); if (ret < 0) goto out; - if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { + if (unlikely(!btrfs_supported_super_csum(btrfs_super_csum_type(sb)))) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid csum type, has %u want %u", btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); goto out; } - if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) { + if (unlikely(btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP)) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid incompat flags, has 0x%llx valid mask 0x%llx", @@ -2655,7 +2641,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev root->node = NULL; return ret; } - if (!extent_buffer_uptodate(root->node)) { + if (unlikely(!extent_buffer_uptodate(root->node))) { free_extent_buffer(root->node); root->node = NULL; return -EIO; @@ -3256,18 +3242,24 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) } /* - * Subpage runtime limitation on v1 cache. + * Subpage/bs > ps runtime limitation on v1 cache. * - * V1 space cache still has some hard codeed PAGE_SIZE usage, while + * V1 space cache still has some hard coded PAGE_SIZE usage, while * we're already defaulting to v2 cache, no need to bother v1 as it's * going to be deprecated anyway. */ - if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { btrfs_warn(fs_info, "v1 space cache is not supported for page size %lu with sectorsize %u", PAGE_SIZE, fs_info->sectorsize); return -EINVAL; } + if (fs_info->sectorsize > PAGE_SIZE && btrfs_fs_incompat(fs_info, RAID56)) { + btrfs_err(fs_info, + "RAID56 is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + return -EINVAL; + } /* This can be called by remount, we need to protect the super block. */ spin_lock(&fs_info->super_lock); @@ -3396,10 +3388,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize_bits = ilog2(nodesize); fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->block_min_order = ilog2(round_up(sectorsize, PAGE_SIZE) >> PAGE_SHIFT); + fs_info->block_max_order = ilog2((BITS_PER_LONG << fs_info->sectorsize_bits) >> PAGE_SHIFT); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; fs_info->fs_devices->fs_info = fs_info; + if (fs_info->sectorsize > PAGE_SIZE) + btrfs_warn(fs_info, + "support for block size %u with page size %zu is experimental, some features may be missing", + fs_info->sectorsize, PAGE_SIZE); /* * Handle the space caching options appropriately now that we have the * super block loaded and validated. 
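The open_ctree() hunk above derives the folio orders from the block size: block_min_order is the smallest folio order whose folio holds one full block, and block_max_order, by the look of the expression, is the largest order at which a folio still has no more than BITS_PER_LONG blocks, so per-block state fits in one unsigned long. A minimal standalone sketch of that arithmetic follows, assuming 4K pages on a 64-bit build; ilog2() and round_up() are re-implemented locally and nothing below is kernel API.

#include <stdio.h>

#define PAGE_SHIFT 12UL
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define BITS_PER_LONG 64UL

/* Local stand-in for the kernel's ilog2(). */
static unsigned long ilog2(unsigned long v)
{
	unsigned long r = 0;

	while (v >>= 1)
		r++;
	return r;
}

/* Local stand-in for round_up(); align must be a power of two. */
static unsigned long round_up(unsigned long v, unsigned long align)
{
	return (v + align - 1) & ~(align - 1);
}

int main(void)
{
	const unsigned long sectorsizes[] = { 2048, 4096, 16384, 65536 };

	for (int i = 0; i < 4; i++) {
		unsigned long bs = sectorsizes[i];
		unsigned long sectorsize_bits = ilog2(bs);
		/* Smallest folio order whose folio holds one full block. */
		unsigned long min_order = ilog2(round_up(bs, PAGE_SIZE) >> PAGE_SHIFT);
		/* Largest order one long of per-block bits can still cover. */
		unsigned long max_order = ilog2((BITS_PER_LONG << sectorsize_bits) >> PAGE_SHIFT);

		printf("bs %6lu ps %lu -> block_min_order %lu block_max_order %lu\n",
		       bs, PAGE_SIZE, min_order, max_order);
	}
	return 0;
}

For a 16K block size on 4K pages this yields a minimum order of 2 (a 16K folio), which is consistent with the experimental bs > ps mode relying on large folios so that no block crosses a folio boundary.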
@@ -3421,6 +3419,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); + ret = btrfs_alloc_compress_wsm(fs_info); + if (ret) + goto fail_sb_buffer; ret = btrfs_init_workqueues(fs_info); if (ret) goto fail_sb_buffer; @@ -3468,7 +3469,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * below in btrfs_init_dev_replace(). */ btrfs_free_extra_devids(fs_devices); - if (!fs_devices->latest_dev->bdev) { + if (unlikely(!fs_devices->latest_dev->bdev)) { btrfs_err(fs_info, "failed to read devices"); ret = -EIO; goto fail_tree_roots; @@ -3962,7 +3963,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) * Checks last_flush_error of disks in order to determine the device * state. */ - if (errors_wait && !btrfs_check_rw_degradable(info, NULL)) + if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL))) return -EIO; return 0; @@ -4064,7 +4065,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); ret = btrfs_validate_write_super(fs_info, sb); - if (ret < 0) { + if (unlikely(ret < 0)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_handle_fs_error(fs_info, -EUCLEAN, "unexpected superblock corruption detected"); @@ -4075,7 +4076,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) if (ret) total_errors++; } - if (total_errors > max_errors) { + if (unlikely(total_errors > max_errors)) { btrfs_err(fs_info, "%d errors while writing supers", total_errors); mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -4100,7 +4101,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) total_errors++; } mutex_unlock(&fs_info->fs_devices->device_list_mutex); - if (total_errors > max_errors) { + if (unlikely(total_errors > max_errors)) { btrfs_handle_fs_error(fs_info, -EIO, "%d errors while writing supers", total_errors); @@ -4880,7 +4881,7 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 864a55a96226..57920f2c6fe4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -106,8 +106,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *buf); -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, - int atomic); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, bool atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, const struct btrfs_tree_parent_check *check); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 7fc8a3200b40..d062ac521051 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -174,7 +174,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto fail; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset of -1 found, there would have to exist an * inode with such number or a root with such id. 
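The "Key with offset of -1 found" checks that keep gaining unlikely() here and in extent-tree.c below all lean on the btrfs_search_slot() return convention: negative means error, zero means the exact key exists, and positive means the path now points at the slot where the key would be inserted. Since (u64)-1 is outside any valid offset range, an exact match can only mean corruption. Below is a toy model of that convention over a sorted array; search_slot() and the "leaf" are invented for the sketch and are not kernel API.

#include <stdint.h>
#include <stdio.h>

/*
 * Toy leaf: a sorted key array. Returns <0 on error, 0 on exact match,
 * >0 with *slot set to the insertion position, mirroring the kernel's
 * btrfs_search_slot() convention.
 */
static int search_slot(const uint64_t *keys, int nr, uint64_t key, int *slot)
{
	int i;

	if (nr < 0)
		return -5; /* -EIO-style failure for a corrupt "leaf" */

	for (i = 0; i < nr; i++) {
		if (keys[i] == key) {
			*slot = i;
			return 0;
		}
		if (keys[i] > key)
			break;
	}
	*slot = i;
	return 1;
}

int main(void)
{
	const uint64_t leaf[] = { 10, 20, 30 };
	int slot;
	int ret = search_slot(leaf, 3, UINT64_MAX, &slot);

	if (ret == 0) /* the case the unlikely(ret == 0) checks treat as corruption */
		fprintf(stderr, "impossible: an item with offset (u64)-1 exists\n");
	else if (ret > 0)
		printf("not found; would insert at slot %d\n", slot);
	return 0;
}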
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 66361325f6dc..bb2ca1c9c7b0 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1237,7 +1237,7 @@ hit_next: state = next_search_state(inserted_state, end); /* * If there's a next state, whether contiguous or not, we don't - * need to unlock and start search agian. If it's not contiguous + * need to unlock and start search again. If it's not contiguous * we will end up here and try to allocate a prealloc state and insert. */ if (state) @@ -1664,7 +1664,7 @@ out: */ u64 btrfs_count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig, + u32 bits, bool contig, struct extent_state **cached_state) { struct extent_state *state = NULL; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 36facca37973..6f07b965e8da 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -163,7 +163,7 @@ void __cold btrfs_extent_state_free_cachep(void); u64 btrfs_count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, u32 bits, int contig, + u64 max_bytes, u32 bits, bool contig, struct extent_state **cached_state); void btrfs_free_extent_state(struct extent_state *state); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 97d517cdf2df..dc4ca98c3780 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -325,7 +325,7 @@ search_again: /* * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, - * is_data == BTRFS_REF_TYPE_DATA, data type is requiried, + * is_data == BTRFS_REF_TYPE_DATA, data type is required, * is_data == BTRFS_REF_TYPE_ANY, either type is OK. */ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -879,7 +879,7 @@ again: ptr += btrfs_extent_inline_ref_size(type); continue; } - if (type == BTRFS_REF_TYPE_INVALID) { + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) { ret = -EUCLEAN; goto out; } @@ -1210,7 +1210,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, * We're adding refs to a tree block we already own, this * should not happen at all. */ - if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (unlikely(owner < BTRFS_FIRST_FREE_OBJECTID)) { btrfs_print_leaf(path->nodes[0]); btrfs_crit(trans->fs_info, "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", @@ -2157,7 +2157,7 @@ again: delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif ret = __btrfs_run_delayed_refs(trans, min_bytes); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2355,7 +2355,7 @@ static noinline int check_committed_ref(struct btrfs_inode *inode, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range.
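A large share of the churn across disk-io.c, extent-tree.c and the other files in this diff is wrapping error-path conditions in unlikely(). As a rough illustration of what the annotation does, here is a minimal userspace sketch; the macro bodies mirror the common form of the kernel's definitions in include/linux/compiler.h, and validate_block() is a made-up stand-in for checks like the ones in btrfs_validate_extent_buffer().

#include <errno.h>
#include <stdio.h>

#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

/* Hypothetical check in the style of the validators in this series. */
static int validate_block(unsigned long want, unsigned long found)
{
	/*
	 * The hint nudges the compiler to lay the error path out of
	 * line, keeping the fall-through hot path dense in the
	 * instruction cache; the semantics of the branch are unchanged.
	 */
	if (unlikely(found != want)) {
		fprintf(stderr, "bad tree block start, want %lu have %lu\n",
			want, found);
		return -EIO;
	}
	return 0;
}

int main(void)
{
	return validate_block(4096, 4096) ? 1 : 0;
}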
@@ -2457,7 +2457,7 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc) + bool full_backref, bool inc) { struct btrfs_fs_info *fs_info = root->fs_info; u64 parent; @@ -2543,15 +2543,15 @@ fail: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, bool full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1); + return __btrfs_mod_ref(trans, root, buf, full_backref, true); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, bool full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0); + return __btrfs_mod_ref(trans, root, buf, full_backref, false); } static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) @@ -2760,7 +2760,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, btrfs_put_block_group(cache); total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); - if (cache == NULL) { + if (unlikely(cache == NULL)) { /* Logic error, something removed the block group. */ ret = -EUCLEAN; goto out; @@ -2982,26 +2982,26 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, csum_root = btrfs_csum_root(trans->fs_info, bytenr); ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } } ret = btrfs_record_squota_delta(trans->fs_info, delta); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -3115,7 +3115,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - if (!is_data && refs_to_drop != 1) { + if (unlikely(!is_data && refs_to_drop != 1)) { btrfs_crit(info, "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", node->bytenr, refs_to_drop); @@ -3162,7 +3162,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (!found_extent) { - if (iref) { + if (unlikely(iref)) { abort_and_dump(trans, path, "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", path->slots[0]); @@ -3172,7 +3172,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3221,7 +3221,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "umm, got %d back from search, was looking for %llu, slot %d", ret, bytenr, path->slots[0]); } - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3254,7 +3254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; - if (item_size < sizeof(*ei) + sizeof(*bi)) { + if (unlikely(item_size < sizeof(*ei) + sizeof(*bi))) { abort_and_dump(trans, path, "invalid extent item 
size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", key.objectid, key.type, key.offset, @@ -3268,7 +3268,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } refs = btrfs_extent_refs(leaf, ei); - if (refs < refs_to_drop) { + if (unlikely(refs < refs_to_drop)) { abort_and_dump(trans, path, "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", refs_to_drop, refs, bytenr, path->slots[0]); @@ -3285,7 +3285,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * be updated by remove_extent_backref */ if (iref) { - if (!found_extent) { + if (unlikely(!found_extent)) { abort_and_dump(trans, path, "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", path->slots[0]); @@ -3298,7 +3298,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, is_data); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3314,8 +3314,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, /* In this branch refs == 1 */ if (found_extent) { - if (is_data && refs_to_drop != - extent_data_ref_count(path, iref)) { + if (unlikely(is_data && refs_to_drop != + extent_data_ref_count(path, iref))) { abort_and_dump(trans, path, "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", extent_data_ref_count(path, iref), @@ -3324,7 +3324,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } if (iref) { - if (path->slots[0] != extent_slot) { + if (unlikely(path->slots[0] != extent_slot)) { abort_and_dump(trans, path, "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", key.objectid, key.type, @@ -3339,7 +3339,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * | extent_slot ||extent_slot + 1| * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] */ - if (path->slots[0] != extent_slot + 1) { + if (unlikely(path->slots[0] != extent_slot + 1)) { abort_and_dump(trans, path, "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", path->slots[0]); @@ -3363,7 +3363,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -4297,7 +4297,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, } static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, - struct find_free_extent_ctl *ffe_ctl) + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info) { if (ffe_ctl->for_treelog) { spin_lock(&fs_info->treelog_bg_lock); @@ -4315,12 +4316,13 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { /* - * No lock is OK here because avail is monotinically + * No lock is OK here because avail is monotonically * decreasing, and this is just a hint. 
*/ u64 avail = block_group->zone_capacity - block_group->alloc_offset; if (block_group_bits(block_group, ffe_ctl->flags) && + block_group->space_info == space_info && avail >= ffe_ctl->num_bytes) { ffe_ctl->hint_byte = block_group->start; break; @@ -4342,7 +4344,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - return prepare_allocation_zoned(fs_info, ffe_ctl); + return prepare_allocation_zoned(fs_info, ffe_ctl, space_info); default: BUG(); } @@ -5061,7 +5063,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (IS_ERR(buf)) return buf; - if (check_eb_lock_owner(buf)) { + if (unlikely(check_eb_lock_owner(buf))) { free_extent_buffer(buf); return ERR_PTR(-EUCLEAN); } @@ -5470,17 +5472,17 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, if (!(wc->flags[level] & flag)) { ASSERT(path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_dec_ref(trans, root, eb, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_set_disk_extent_flags(trans, eb, flag); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -5582,7 +5584,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans, generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); - if (btrfs_buffer_uptodate(next, generation, 0)) + if (btrfs_buffer_uptodate(next, generation, false)) return 0; check.level = level - 1; @@ -5611,7 +5613,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans, * If we are UPDATE_BACKREF then we will not, we need to update our backrefs. * * If we are DROP_REFERENCE this will figure out if we need to drop our current - * reference, skipping it if we dropped it from a previous incompleted drop, or + * reference, skipping it if we dropped it from a previous uncompleted drop, or * dropping it if we still have a reference to it. */ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -5636,7 +5638,7 @@ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_r ref.parent = path->nodes[level]->start; } else { ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level])); - if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) { + if (unlikely(btrfs_root_id(root) != btrfs_header_owner(path->nodes[level]))) { btrfs_err(root->fs_info, "mismatched block owner"); return -EIO; } @@ -5758,7 +5760,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, /* * We have to walk down into this node, and if we're currently at the - * DROP_REFERNCE stage and this block is shared then we need to switch + * DROP_REFERENCE stage and this block is shared then we need to switch * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF. 
*/ if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) { @@ -5772,7 +5774,7 @@ level--; ASSERT(level == btrfs_header_level(next)); - if (level != btrfs_header_level(next)) { + if (unlikely(level != btrfs_header_level(next))) { btrfs_err(root->fs_info, "mismatched level"); ret = -EIO; goto out_unlock; @@ -5883,7 +5885,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } } else { ret = btrfs_dec_ref(trans, root, eb, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -5908,13 +5910,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; - else if (btrfs_root_id(root) != btrfs_header_owner(eb)) + else if (unlikely(btrfs_root_id(root) != btrfs_header_owner(eb))) goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; - else if (btrfs_root_id(root) != - btrfs_header_owner(path->nodes[level + 1])) + else if (unlikely(btrfs_root_id(root) != + btrfs_header_owner(path->nodes[level + 1]))) goto owner_mismatch; } @@ -6049,9 +6051,9 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. * - * If called with for_reloc == 0, may exit early with -EAGAIN + * If called with for_reloc not set, may exit early with -EAGAIN */ -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) +int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc) { const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; @@ -6178,13 +6180,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) while (1) { ret = walk_down_tree(trans, root, path, wc); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); break; } ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); break; } @@ -6211,7 +6213,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -6247,7 +6249,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) goto out_end_trans; ret = btrfs_del_root(trans, &root->root_key); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -6255,7 +6257,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) if (!is_reloc_root) { ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } else if (ret > 0) { diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 82d3a82dc712..e970ac42a871 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -140,9 +140,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct
extent_buffer *buf, int full_backref); + struct extent_buffer *buf, bool full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, bool full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); @@ -155,8 +155,7 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, - int for_reloc); +int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c953297aa89a..c123a3ef154a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -101,6 +101,26 @@ struct btrfs_bio_ctrl { enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; blk_opf_t opf; + /* + * For data read bios, we attempt to optimize csum lookups if the extent + * generation is older than the current one. To make this possible, we + * need to track the maximum generation of an extent in a bio_ctrl to + * make the decision when submitting the bio. + * + * The pattern between do_readpage(), submit_one_bio() and + * submit_extent_folio() is quite subtle, so tracking this is tricky. + * + * As we process extent E, we might submit a bio with existing built up + * extents before adding E to a new bio, or we might just add E to the + * bio. As a result, E's generation could apply to the current bio or + * to the next one, so we need to be careful to update the bio_ctrl's + * generation with E's only when we are sure E is added to bio_ctrl->bbio + * in submit_extent_folio(). + * + * See the comment in btrfs_lookup_bio_sums() for more detail on the + * need for this optimization. + */ + u64 generation; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; @@ -111,8 +131,46 @@ struct btrfs_bio_ctrl { */ unsigned long submit_bitmap; struct readahead_control *ractl; + + /* + * The start offset of the last used extent map by a read operation. + * + * This is for proper compressed read merge. + * U64_MAX means we are starting the read and have made no progress yet. + * + * The current btrfs_bio_is_contig() only uses disk_bytenr as + * the condition to check if the read can be merged with previous + * bio, which is not correct. E.g. two file extents pointing to the + * same extent but with different offset. + * + * So here we need to do extra checks to only merge reads that are + * covered by the same extent map. + * Just extent_map::start will be enough, as they are unique + * inside the same inode. + */ + u64 last_em_start; }; +/* + * Helper to set the csum search commit root option for a bio_ctrl's bbio + * before submitting the bio. + * + * Only for use by submit_one_bio(). 
+ */ +static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl) +{ + struct btrfs_bio *bbio = bio_ctrl->bbio; + + ASSERT(bbio); + + if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode))) + return; + + bio_ctrl->bbio->csum_search_commit_root = + (bio_ctrl->generation && + bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info)); +} + static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_bio *bbio = bio_ctrl->bbio; @@ -123,6 +181,8 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* Caller should ensure the bio has at least some range added */ ASSERT(bbio->bio.bi_iter.bi_size); + bio_set_csum_search_commit_root(bio_ctrl); + if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) btrfs_submit_compressed_read(bbio); @@ -131,6 +191,12 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; + /* + * We used the generation to decide whether to look up csums in the + * commit_root or not when we called bio_set_csum_search_commit_root() + * above. Now, reset the generation for the next bio. + */ + bio_ctrl->generation = 0; } /* @@ -327,6 +393,13 @@ again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; delalloc_end = 0; + + /* + * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can + * return early without handling any dirty ranges. + */ + ASSERT(max_bytes >= fs_info->sectorsize); + found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, max_bytes, &cached_state); if (!found || delalloc_end <= *start || delalloc_start > orig_end) { @@ -352,18 +425,19 @@ again: if (delalloc_end + 1 - delalloc_start > max_bytes) delalloc_end = delalloc_start + max_bytes - 1; - /* step two, lock all the folioss after the folios that has start */ + /* step two, lock all the folios after the folio that has start */ ret = lock_delalloc_folios(inode, locked_folio, delalloc_start, delalloc_end); ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { - /* some of the folios are gone, lets avoid looping by - * shortening the size of the delalloc range we're searching + /* + * Some of the folios are gone, let's avoid looping by + * shortening the size of the delalloc range we're searching. */ btrfs_free_extent_state(cached_state); cached_state = NULL; if (!loops) { - max_bytes = PAGE_SIZE; + max_bytes = fs_info->sectorsize; loops = 1; goto again; } else { @@ -552,6 +626,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * Populate every free slot in a provided array with folios using GFP_NOFS.
* * @nr_folios: number of folios to allocate + * @order: the order of the folios to be allocated * @folio_array: the array to fill with folios; any existing non-NULL entries in * the array will be skipped * @@ -559,12 +634,13 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * -ENOMEM otherwise, the partially allocated folios would be freed and * the array slots zeroed */ -int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array) +int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order, + struct folio **folio_array) { for (int i = 0; i < nr_folios; i++) { if (folio_array[i]) continue; - folio_array[i] = folio_alloc(GFP_NOFS, 0); + folio_array[i] = folio_alloc(GFP_NOFS, order); if (!folio_array[i]) goto error; } @@ -573,6 +649,7 @@ error: for (int i = 0; i < nr_folios; i++) { if (folio_array[i]) folio_put(folio_array[i]); + folio_array[i] = NULL; } return -ENOMEM; } @@ -701,15 +778,18 @@ static void alloc_new_bio(struct btrfs_inode *inode, * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one + * @read_em_generation: generation of the extent_map we are submitting + * (only used for read) * * The will either add the page into the existing @bio_ctrl->bbio, or allocate a * new one in @bio_ctrl->bbio. - * The mirror number for this IO should already be initizlied in + * The mirror number for this IO should already be initialized in * @bio_ctrl->mirror_num. */ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, u64 disk_bytenr, struct folio *folio, - size_t size, unsigned long pg_offset) + size_t size, unsigned long pg_offset, + u64 read_em_generation) { struct btrfs_inode *inode = folio_to_inode(folio); loff_t file_offset = folio_pos(folio) + pg_offset; @@ -740,6 +820,11 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, submit_one_bio(bio_ctrl); continue; } + /* + * Now that the folio is definitely added to the bio, include its + * generation in the max generation calculation. + */ + bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation); bio_ctrl->next_file_offset += len; if (bio_ctrl->wbc) @@ -909,7 +994,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl, * return 0 on success, otherwise return error */ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) + struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); @@ -942,6 +1027,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, bool force_bio_submit = false; u64 disk_bytenr; u64 block_start; + u64 em_gen; ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { @@ -1019,13 +1105,13 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, * non-optimal behavior (submitting 2 bios for the same extent). 
*/ if (compress_type != BTRFS_COMPRESS_NONE && - prev_em_start && *prev_em_start != (u64)-1 && - *prev_em_start != em->start) + bio_ctrl->last_em_start != U64_MAX && + bio_ctrl->last_em_start != em->start) force_bio_submit = true; - if (prev_em_start) - *prev_em_start = em->start; + bio_ctrl->last_em_start = em->start; + em_gen = em->generation; btrfs_free_extent_map(em); em = NULL; @@ -1049,7 +1135,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, - pg_offset); + pg_offset, em_gen); } return 0; } @@ -1238,12 +1324,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio) const u64 start = folio_pos(folio); const u64 end = start + folio_size(folio) - 1; struct extent_state *cached_state = NULL; - struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + struct btrfs_bio_ctrl bio_ctrl = { + .opf = REQ_OP_READ, + .last_em_start = U64_MAX, + }; struct extent_map *em_cached = NULL; int ret; lock_extents_for_read(inode, start, end, &cached_state); - ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl); btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); btrfs_free_extent_map(em_cached); @@ -1580,7 +1669,7 @@ static int submit_one_sector(struct btrfs_inode *inode, ASSERT(folio_test_writeback(folio)); submit_extent_folio(bio_ctrl, disk_bytenr, folio, - sectorsize, filepos - folio_pos(folio)); + sectorsize, filepos - folio_pos(folio), 0); return 0; } @@ -1601,7 +1690,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; bool submitted_io = false; - bool error = false; + int found_error = 0; const u64 folio_start = folio_pos(folio); const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; @@ -1665,7 +1754,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, */ btrfs_mark_ordered_io_finished(inode, folio, cur, fs_info->sectorsize, false); - error = true; + if (!found_error) + found_error = ret; continue; } submitted_io = true; @@ -1682,11 +1772,11 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, * If we hit any error, the corresponding sector will have its dirty * flag cleared and writeback finished, thus no need to handle the error case. */ - if (!submitted_io && !error) { + if (!submitted_io && !found_error) { btrfs_folio_set_writeback(fs_info, folio, start, len); btrfs_folio_clear_writeback(fs_info, folio, start, len); } - return ret; + return found_error; } /* @@ -2147,7 +2237,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, * @fs_info: The fs_info for this file system. * @start: The offset of the range to start waiting on writeback. * @end: The end of the range, inclusive. This is meant to be used in - * conjuction with wait_marked_extents, so this will usually be + * conjunction with wait_marked_extents, so this will usually be * the_next_eb->start - 1. */ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, @@ -2417,7 +2507,7 @@ retry: * In above case, [32K, 96K) is asynchronously submitted * for compression, and [124K, 128K) needs to be written back. 
* - * If we didn't wait wrtiteback for page 64K, [128K, 128K) + * If we didn't wait writeback for page 64K, [128K, 128K) * won't be submitted as the page still has writeback flag * and will be skipped in the next check. * @@ -2583,7 +2673,8 @@ void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD, - .ractl = rac + .ractl = rac, + .last_em_start = U64_MAX, }; struct folio *folio; struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); @@ -2591,12 +2682,11 @@ void btrfs_readahead(struct readahead_control *rac) const u64 end = start + readahead_length(rac) - 1; struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; - u64 prev_em_start = (u64)-1; lock_extents_for_read(inode, start, end, &cached_state); while ((folio = readahead_folio(rac)) != NULL) - btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + btrfs_do_readpage(folio, &em_cached, &bio_ctrl); btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); @@ -2901,7 +2991,7 @@ static void cleanup_extent_buffer_folios(struct extent_buffer *eb) { const int num_folios = num_extent_folios(eb); - /* We canont use num_extent_folios() as loop bound as eb->folios changes. */ + /* We cannot use num_extent_folios() as loop bound as eb->folios changes. */ for (int i = 0; i < num_folios; i++) { ASSERT(eb->folios[i]); detach_extent_buffer_folio(eb, eb->folios[i]); @@ -3148,29 +3238,30 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, */ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) { - if (!IS_ALIGNED(start, fs_info->sectorsize)) { + const u32 nodesize = fs_info->nodesize; + + if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) { btrfs_err(fs_info, "bad tree block start %llu", start); return true; } - if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) { + if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) { btrfs_err(fs_info, "tree block is not nodesize aligned, start %llu nodesize %u", - start, fs_info->nodesize); + start, nodesize); return true; } - if (fs_info->nodesize >= PAGE_SIZE && - !PAGE_ALIGNED(start)) { + if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) { btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u", - start, fs_info->nodesize); + start, nodesize); return true; } - if (!IS_ALIGNED(start, fs_info->nodesize) && - !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { + if (unlikely(!IS_ALIGNED(start, nodesize) && + !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) { btrfs_warn(fs_info, "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", - start, fs_info->nodesize); + start, nodesize); } return false; } @@ -3789,7 +3880,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, return ret; wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); - if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) return -EIO; return 0; } @@ -4465,7 +4556,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, if (IS_ERR(eb)) return; - if (btrfs_buffer_uptodate(eb, gen, 1)) { + if (btrfs_buffer_uptodate(eb, gen, true)) { free_extent_buffer(eb); return; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 61130786b9a3..5fcbfe44218c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -366,7 
+366,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, bool nofail); -int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array); +int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order, + struct folio **folio_array); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 57f52585a6dd..7e38c23a0c1c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -460,7 +460,7 @@ void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) static inline void setup_extent_mapping(struct btrfs_inode *inode, struct extent_map *em, - int modified) + bool modified) { refcount_inc(&em->refs); @@ -486,7 +486,7 @@ static inline void setup_extent_mapping(struct btrfs_inode *inode, * taken, or a reference dropped if the merge attempt was successful. */ static int add_extent_mapping(struct btrfs_inode *inode, - struct extent_map *em, int modified) + struct extent_map *em, bool modified) { struct extent_map_tree *tree = &inode->extent_tree; struct btrfs_root *root = inode->root; @@ -509,7 +509,7 @@ static int add_extent_mapping(struct btrfs_inode *inode, } static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len, int strict) + u64 start, u64 len, bool strict) { struct extent_map *em; struct rb_node *rb_node; @@ -548,7 +548,7 @@ static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { - return lookup_extent_mapping(tree, start, len, 1); + return lookup_extent_mapping(tree, start, len, true); } /* @@ -566,7 +566,7 @@ struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { - return lookup_extent_mapping(tree, start, len, 0); + return lookup_extent_mapping(tree, start, len, false); } /* @@ -594,7 +594,7 @@ void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *e static void replace_extent_mapping(struct btrfs_inode *inode, struct extent_map *cur, struct extent_map *new, - int modified) + bool modified) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *tree = &inode->extent_tree; @@ -670,7 +670,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, em->len = end - start; if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) em->offset += start_diff; - return add_extent_mapping(inode, em, 0); + return add_extent_mapping(inode, em, false); } /* @@ -707,7 +707,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, if (em->disk_bytenr == EXTENT_MAP_INLINE) ASSERT(em->start == 0); - ret = add_extent_mapping(inode, em, 0); + ret = add_extent_mapping(inode, em, false); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. 
It is also possible that * an overlapping map exists in the tree @@ -1057,7 +1057,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); em = btrfs_lookup_extent_mapping(em_tree, start, len); - if (!em) { + if (unlikely(!em)) { ret = -EIO; goto out_unlock; } @@ -1082,7 +1082,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr split_pre->flags = flags; split_pre->generation = em->generation; - replace_extent_mapping(inode, em, split_pre, 1); + replace_extent_mapping(inode, em, split_pre, true); /* * Now we only have an extent_map at: @@ -1098,7 +1098,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr split_mid->ram_bytes = split_mid->len; split_mid->flags = flags; split_mid->generation = em->generation; - add_extent_mapping(inode, split_mid, 1); + add_extent_mapping(inode, split_mid, true); /* Once for us */ btrfs_free_extent_map(em); @@ -1372,7 +1372,7 @@ void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) return; - queue_work(system_unbound_wq, &fs_info->em_shrinker_work); + queue_work(system_dfl_wq, &fs_info->em_shrinker_work); } void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index 7935586a9dbd..f2eaaef8422b 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -153,7 +153,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, if (cache_end > offset) { if (offset == cache->offset) { /* - * We cached a dealloc range (found in the io tree) for + * We cached a delalloc range (found in the io tree) for * a hole or prealloc extent and we have now found a * file extent item for the same offset. What we have * now is more recent and up to date, so discard what diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c09fbc257634..a42e6d54e7cd 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -397,6 +397,36 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) path->skip_locking = 1; } + /* + * If we are searching for a csum of an extent from a past + * transaction, we can search in the commit root and reduce + * lock contention on the csum tree extent buffers. + * + * This is important because that lock is an rwsem which gets + * pretty heavy write load under memory pressure and sustained + * csum overwrites, unlike the commit_root_sem. (Memory pressure + * makes us writeback the nodes multiple times per transaction, + * which makes us cow them each time, taking the write lock.) + * + * Due to how rwsem is implemented, there is a possible + * priority inversion where the readers holding the lock don't + * get scheduled (say they're in a cgroup stuck in heavy reclaim) + * which then blocks writers, including transaction commit. By + * using a semaphore with fewer writers (only a commit switching + * the roots), we make this issue less likely. + * + * Note that we don't rely on btrfs_search_slot to lock the + * commit root csum. We call search_slot multiple times, which would + * create a potential race where a commit comes in between searches + * while we are not holding the commit_root_sem, and we get csums + * from across transactions. 
+ */ + if (bbio->csum_search_commit_root) { + path->search_commit_root = 1; + path->skip_locking = 1; + down_read(&fs_info->commit_root_sem); + } + while (bio_offset < orig_len) { int count; u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset; @@ -442,6 +472,8 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) bio_offset += count * sectorsize; } + if (bbio->csum_search_commit_root) + up_read(&fs_info->commit_root_sem); return ret; } @@ -743,12 +775,10 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio) SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct bio *bio = &bbio->bio; struct btrfs_ordered_sum *sums; - char *data; - struct bvec_iter iter; - struct bio_vec bvec; + struct bvec_iter iter = bio->bi_iter; + phys_addr_t paddr; + const u32 blocksize = fs_info->sectorsize; int index; - unsigned int blockcount; - int i; unsigned nofs_flag; nofs_flag = memalloc_nofs_save(); @@ -767,21 +797,9 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio) shash->tfm = fs_info->csum_shash; - bio_for_each_segment(bvec, bio, iter) { - blockcount = BTRFS_BYTES_TO_BLKS(fs_info, - bvec.bv_len + fs_info->sectorsize - - 1); - - for (i = 0; i < blockcount; i++) { - data = bvec_kmap_local(&bvec); - crypto_shash_digest(shash, - data + (i * fs_info->sectorsize), - fs_info->sectorsize, - sums->sums + index); - kunmap_local(data); - index += fs_info->csum_size; - } - + btrfs_bio_for_each_block(paddr, bio, &iter, blocksize) { + btrfs_calculate_block_csum(fs_info, paddr, sums->sums + index); + index += fs_info->csum_size; } bbio->sums = sums; @@ -993,7 +1011,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, * item changed size or key */ ret = btrfs_split_item(trans, root, path, &key, offset); - if (ret && ret != -EAGAIN) { + if (unlikely(ret && ret != -EAGAIN)) { btrfs_abort_transaction(trans, ret); break; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 204674934795..7efd1f8a1912 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -327,7 +327,7 @@ next_slot: args->start - extent_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -426,7 +426,7 @@ delete_extent_item: key.offset - extent_offset, 0, false); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -443,7 +443,7 @@ delete_extent_item: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -587,21 +587,20 @@ again: leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != ino || - key.type != BTRFS_EXTENT_DATA_KEY) { + if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) { + if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (key.offset > start || extent_end < end) { + if (unlikely(key.offset > start || extent_end < end)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -676,7 +675,7 @@ again: btrfs_release_path(path); goto again; } - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -704,7 +703,7 @@ again: 
ref.ref_root = btrfs_root_id(root); btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -712,7 +711,7 @@ again: if (split == start) { key.offset = start; } else { - if (start != key.offset) { + if (unlikely(start != key.offset)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -744,7 +743,7 @@ again: del_slot = path->slots[0] + 1; del_nr++; ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -762,7 +761,7 @@ again: del_slot = path->slots[0]; del_nr++; ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -783,7 +782,7 @@ again: extent_end - key.offset); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -815,7 +814,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 if (ret) return ret; folio_lock(folio); - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); return -EIO; } @@ -970,7 +969,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, * Return: * > 0 If we can nocow, and updates @write_bytes. * 0 If we can't do a nocow write. - * -EAGAIN If we can't do a nocow write because snapshoting of the inode's + * -EAGAIN If we can't do a nocow write because snapshotting of the inode's * root is in progress or because we are in a non-blocking IO * context and need to block (@nowait is true). * < 0 If an error happened. @@ -2460,9 +2459,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, * got EOPNOTSUPP via prealloc then we messed up and * need to abort. */ - if (ret && - (ret != -EOPNOTSUPP || - (extent_info && extent_info->is_new_extent))) + if (unlikely(ret && + (ret != -EOPNOTSUPP || + (extent_info && extent_info->is_new_extent)))) btrfs_abort_transaction(trans, ret); break; } @@ -2473,7 +2472,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, cur_offset < ino_size) { ret = fill_holes(trans, inode, path, cur_offset, drop_args.drop_end); - if (ret) { + if (unlikely(ret)) { /* * If we failed then we didn't insert our hole * entries for the area we dropped, so now the @@ -2493,7 +2492,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); - if (ret) { + if (unlikely(ret)) { /* * We couldn't clear our area, so we could * presumably adjust up and corrupt the fs, so @@ -2512,7 +2511,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, replace_len, drop_args.bytes_found); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -2607,7 +2606,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, cur_offset < drop_args.drop_end) { ret = fill_holes(trans, inode, path, cur_offset, drop_args.drop_end); - if (ret) { + if (unlikely(ret)) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); goto out_trans; @@ -2616,7 +2615,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, /* See the comment in the loop above for the reasoning here. 
*/ ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_trans; } @@ -2626,7 +2625,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, extent_info->data_len, drop_args.bytes_found); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_trans; } @@ -3345,7 +3344,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end * We could also use the extent map tree to find such delalloc that is * being flushed, but using the ordered extents tree is more efficient * because it's usually much smaller as ordered extents are removed from - * the tree once they complete. With the extent maps, we mau have them + * the tree once they complete. With the extent maps, we may have them * in the extent map tree for a very long time, and they were either * created by previous writes or loaded by read operations. */ diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5d8d1570a5c9..ab873bd67192 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2282,7 +2282,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want * to reserve them to larger extents, however if we have plenty - * of cache left then go ahead an dadd them, no sense in adding + * of cache left then go ahead and add them, no sense in adding * the overhead of a bitmap if we don't have to. */ if (info->bytes <= fs_info->sectorsize * 8) { @@ -3829,7 +3829,7 @@ out_unlock: /* * If we break out of trimming a bitmap prematurely, we should reset the - * trimming bit. In a rather contrieved case, it's possible to race here so + * trimming bit. In a rather contrived case, it's possible to race here so * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 
* * start = start of bitmap @@ -4142,7 +4142,7 @@ int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool act if (!active) { set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); ret = cleanup_free_space_cache_v1(fs_info, trans); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index eba7f22ae49c..dad0b492a663 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -137,12 +137,12 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { DEBUG_WARN(); return -EIO; } - if (p->slots[0] == 0) { + if (unlikely(p->slots[0] == 0)) { DEBUG_WARN("no previous slot found"); return -EIO; } @@ -218,7 +218,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (!bitmap) { + if (unlikely(!bitmap)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -233,7 +233,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -271,7 +271,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -293,7 +293,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, expected_extent_count = btrfs_free_space_extent_count(leaf, info); btrfs_release_path(path); - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, @@ -320,7 +320,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, data_size); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -361,7 +361,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (!bitmap) { + if (unlikely(!bitmap)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -376,7 +376,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -420,7 +420,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -454,7 +454,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, key.offset = (end_bit - start_bit) * fs_info->sectorsize; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -465,7 +465,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, start_bit = find_next_bit_le(bitmap, nrbits, end_bit); } 
- if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, @@ -848,14 +848,14 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, return 0; path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); - if (!block_group) { + if (unlikely(!block_group)) { DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); @@ -1030,14 +1030,14 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, return 0; path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); - if (!block_group) { + if (unlikely(!block_group)) { DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); @@ -1185,7 +1185,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) goto out_clear; } ret = btrfs_global_root_insert(free_space_root); - if (ret) { + if (unlikely(ret)) { btrfs_put_root(free_space_root); btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -1197,7 +1197,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out_clear; @@ -1290,14 +1290,14 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); ret = clear_free_space_tree(trans, free_space_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } ret = btrfs_del_root(trans, &free_space_root->root_key); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1315,7 +1315,7 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), free_space_root->node, 0, 1); btrfs_put_root(free_space_root); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1344,7 +1344,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); ret = clear_free_space_tree(trans, free_space_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1362,7 +1362,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) goto next; ret = populate_free_space_tree(trans, block_group); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1422,7 +1422,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, if (!path) { path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { btrfs_abort_transaction(trans, -ENOMEM); return -ENOMEM; } @@ -1430,7 +1430,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, } ret = add_new_free_space_info(trans, block_group, path); - 
if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1481,7 +1481,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -1496,7 +1496,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1527,7 +1527,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1611,7 +1611,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, extent_count++; } - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, @@ -1672,7 +1672,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, extent_count++; } - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index b2bb86f8d7cf..feb0a2faa837 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -55,6 +55,54 @@ size_t __attribute_const__ btrfs_get_num_csums(void) } /* + * We support the following block sizes for all systems: + * + * - 4K + * This is the most common block size. For PAGE_SIZE > 4K cases the subpage + * mode is used. + * + * - PAGE_SIZE + * The straightforward block size to support. + * + * And extra support for the following block sizes based on the kernel config: + * + * - MIN_BLOCKSIZE + * This is either 4K (regular builds) or 2K (debug builds). + * This allows testing subpage routines on x86_64. + */ +bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize) +{ + /* @blocksize should be validated first. */ + ASSERT(is_power_of_2(blocksize) && blocksize >= BTRFS_MIN_BLOCKSIZE && + blocksize <= BTRFS_MAX_BLOCKSIZE); + + if (blocksize == PAGE_SIZE || blocksize == SZ_4K || blocksize == BTRFS_MIN_BLOCKSIZE) + return true; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* + * For bs > ps support it's done by specifying a minimal folio order + * for filemap, thus implying large data folios. + * For HIGHMEM systems, we cannot always access the content of a (large) + * folio in one go, but have to go through it page by page. + * + * A lot of features don't implement a proper PAGE-sized loop for large + * folios; this includes: + * + * - compression + * - verity + * - encoded write + * + * Considering HIGHMEM is such a pain to deal with and it's going + * to be deprecated eventually, just reject HIGHMEM && bs > ps cases. + */ + if (IS_ENABLED(CONFIG_HIGHMEM) && blocksize > PAGE_SIZE) + return false; + return true; +#endif + return false; +} + +/* + * Start exclusive operation @type, return true on success.
*/ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 8cc07cc70b12..814bbc9417d2 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -59,6 +59,8 @@ struct btrfs_space_info; #define BTRFS_MIN_BLOCKSIZE (SZ_4K) #endif +#define BTRFS_MAX_BLOCKSIZE (SZ_64K) + #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL @@ -102,6 +104,8 @@ enum { BTRFS_FS_STATE_RO, /* Track if a transaction abort has been reported on this filesystem */ BTRFS_FS_STATE_TRANS_ABORTED, + /* Track if log replay has failed. */ + BTRFS_FS_STATE_LOG_REPLAY_ABORTED, /* * Bio operations should be blocked on this filesystem because a source * or target device is being destroyed as part of a device replace @@ -243,6 +247,7 @@ enum { BTRFS_MOUNT_NOSPACECACHE = (1ULL << 30), BTRFS_MOUNT_IGNOREMETACSUMS = (1ULL << 31), BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32), + BTRFS_MOUNT_REF_TRACKER = (1ULL << 33), }; /* @@ -280,7 +285,7 @@ enum { #ifdef CONFIG_BTRFS_EXPERIMENTAL /* - * Features under developmen like Extent tree v2 support is enabled + * Features under development like Extent tree v2 support is enabled * only under CONFIG_BTRFS_EXPERIMENTAL */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ @@ -303,6 +308,16 @@ enum { #define BTRFS_WARNING_COMMIT_INTERVAL (300) #define BTRFS_DEFAULT_MAX_INLINE (2048) +enum btrfs_compression_type { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_ZSTD = 3, + BTRFS_NR_COMPRESS_TYPES = 4, + + BTRFS_DEFRAG_DONT_COMPRESS, +}; + struct btrfs_dev_replace { /* See #define above */ u64 replace_state; @@ -505,6 +520,9 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long long mount_opt; + /* Compress related structures. */ + void *compr_wsm[BTRFS_NR_COMPRESS_TYPES]; + int compress_type; int compress_level; u32 commit_interval; @@ -809,6 +827,8 @@ struct btrfs_fs_info { u32 sectorsize; /* ilog2 of sectorsize, use to avoid 64bit division */ u32 sectorsize_bits; + u32 block_min_order; + u32 block_max_order; u32 csum_size; u32 csums_per_leaf; u32 stripesize; @@ -878,12 +898,10 @@ struct btrfs_fs_info { struct lockdep_map btrfs_trans_pending_ordered_map; struct lockdep_map btrfs_ordered_extent_map; -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG spinlock_t ref_verify_lock; struct rb_root block_tree; -#endif -#ifdef CONFIG_BTRFS_DEBUG struct kobject *debug_kobj; struct list_head allocated_roots; @@ -905,6 +923,12 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } +/* Return the minimal folio size of the fs. 
*/ +static inline unsigned int btrfs_min_folio_size(struct btrfs_fs_info *fs_info) +{ + return 1U << (PAGE_SHIFT + fs_info->block_min_order); +} + static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); @@ -997,6 +1021,7 @@ static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs return folio_size(folio) >> fs_info->sectorsize_bits; } +bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize); bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, @@ -1107,9 +1132,9 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) #define EXPORT_FOR_TESTS -static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info) +static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info) { - return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + return unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)); } void btrfs_test_destroy_inode(struct inode *inode); @@ -1118,9 +1143,9 @@ void btrfs_test_destroy_inode(struct inode *inode); #define EXPORT_FOR_TESTS static -static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info) +static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info) { - return 0; + return false; } #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index f06cf701ae5a..1bd73b80f9fa 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -137,7 +137,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, name); - if (!extref) { + if (unlikely(!extref)) { btrfs_abort_transaction(trans, -ENOENT); return -ENOENT; } @@ -627,7 +627,7 @@ delete: if (control->clear_extent_range) { ret = btrfs_inode_clear_file_extent_range(control->inode, clear_start, clear_len); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -666,7 +666,7 @@ delete: btrfs_init_data_ref(&ref, control->ino, extent_offset, btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -684,7 +684,7 @@ delete: ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -720,7 +720,7 @@ out: int ret2; ret2 = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - if (ret2) { + if (unlikely(ret2)) { btrfs_abort_transaction(trans, ret2); ret = ret2; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9e4aec7330cb..ced87c9e4682 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -72,6 +72,9 @@ #include "raid-stripe-tree.h" #include "fiemap.h" +#define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0) +#define COW_FILE_RANGE_NO_INLINE (1UL << 1) + struct btrfs_iget_args { u64 ino; struct btrfs_root *root; @@ -367,7 +370,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) } /* - * Unock inode i_rwsem. + * Unlock inode i_rwsem. * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. 
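The btrfs_min_folio_size() helper in the fs.h hunk above is a pure power-of-two computation: the minimum folio size is PAGE_SIZE shifted left by block_min_order, so a block size larger than the page size forces the page cache to hand out correspondingly large folios. A minimal user-space sketch of that arithmetic (not kernel code; PAGE_SHIFT is assumed to be 12 as on 4K-page systems, and min_folio_size() is a hypothetical stand-in for the real helper):

	#include <stdio.h>

	#define PAGE_SHIFT 12			/* assumed: 4K pages */

	/* Stand-in for btrfs_min_folio_size(): 1U << (PAGE_SHIFT + block_min_order). */
	static unsigned int min_folio_size(unsigned int block_min_order)
	{
		return 1U << (PAGE_SHIFT + block_min_order);
	}

	int main(void)
	{
		/*
		 * Order 0 keeps plain 4K folios; a 16K block size on this
		 * system would need block_min_order = 2, i.e. 16K minimum
		 * folios, which is why bs > ps support implies large data
		 * folios, as the btrfs_supported_blocksize() comment notes.
		 */
		for (unsigned int order = 0; order <= 4; order++)
			printf("block_min_order=%u -> min folio size %u bytes\n",
			       order, min_folio_size(order));
		return 0;
	}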
@@ -631,7 +634,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, drop_args.replace_extent = true; drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -639,7 +642,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, compressed_folio, update_i_size); - if (ret && ret != -ENOSPC) { + if (unlikely(ret && ret != -ENOSPC)) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { @@ -649,7 +652,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); ret = btrfs_update_inode(trans, inode); - if (ret && ret != -ENOSPC) { + if (unlikely(ret && ret != -ENOSPC)) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { @@ -851,6 +854,8 @@ static void compress_file_range(struct btrfs_work *work) struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; @@ -861,7 +866,7 @@ static void compress_file_range(struct btrfs_work *work) unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; - unsigned int poff; + unsigned int loff; int i; int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level; @@ -899,8 +904,8 @@ static void compress_file_range(struct btrfs_work *work) actual_end = min_t(u64, i_size, end + 1); again: folios = NULL; - nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES); + nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1; + nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift); /* * we don't want to send crud past the end of i_size through @@ -956,18 +961,18 @@ again: /* Compression level is applied here. */ ret = btrfs_compress_folios(compress_type, compress_level, - mapping, start, folios, &nr_folios, &total_in, + inode, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) goto mark_incompressible; /* - * Zero the tail end of the last page, as we might be sending it down + * Zero the tail end of the last folio, as we might be sending it down * to disk. */ - poff = offset_in_page(total_compressed); - if (poff) - folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff); + loff = (total_compressed & (min_folio_size - 1)); + if (loff) + folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff); /* * Try to create an inline extent. @@ -1245,18 +1250,18 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * locked_folio is the folio that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * When this function fails, it unlocks all pages except @locked_folio. + * When this function fails, it unlocks all folios except @locked_folio. 
* * When this function successfully creates an inline extent, it returns 1 and - * unlocks all pages including locked_folio and starts I/O on them. - * (In reality inline extents are limited to a single page, so locked_folio is - * the only page handled anyway). + * unlocks all folios including locked_folio and starts I/O on them. + * (In reality inline extents are limited to a single block, so locked_folio is + * the only folio handled anyway). * - * When this function succeed and creates a normal extent, the page locking + * When this function succeeds and creates a normal extent, the folio locking * status depends on the passed in flags: * - * - If @keep_locked is set, all pages are kept locked. - * - Else all pages except for @locked_folio are unlocked. + * - If the COW_FILE_RANGE_KEEP_LOCKED flag is set, all folios are kept locked. + * - Else all folios except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are cleaned up. @@ -1264,7 +1269,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, u64 *done_offset, - bool keep_locked, bool no_inline) + unsigned long flags) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1292,7 +1297,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, inode_should_defrag(inode, start, end, num_bytes, SZ_64K); - if (!no_inline) { + if (!(flags & COW_FILE_RANGE_NO_INLINE)) { /* lets try to make an inline extent */ ret = cow_file_range_inline(inode, locked_folio, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); @@ -1320,7 +1325,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * Do set the Ordered (Private2) bit so we know this page was properly * setup for writepage. */ - page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; /* @@ -1531,10 +1536,11 @@ out_unlock: btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, end - start - cur_alloc_size + 1, NULL); } - btrfs_err_rl(fs_info, - "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", - __func__, btrfs_root_id(inode->root), - btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); + btrfs_err(fs_info, +"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), orig_start, end + 1 - orig_start, + start, cur_alloc_size, ret); return ret; } @@ -1687,7 +1693,7 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, while (start <= end) { ret = cow_file_range(inode, locked_folio, start, end, - &done_offset, true, false); + &done_offset, COW_FILE_RANGE_KEEP_LOCKED); if (ret) return ret; extent_write_locked_range(&inode->vfs_inode, locked_folio, @@ -1768,9 +1774,15 @@ static int fallback_to_cow(struct btrfs_inode *inode, * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work. + * + * And here we do not unlock the folio after a successful run. + * The folios will be unlocked after everything is finished, or by error handling. + * + * This is to ensure error handling won't need to clear dirty/ordered flags without + * a locked folio, which can race with writeback.
*/ - ret = cow_file_range(inode, locked_folio, start, end, NULL, false, - true); + ret = cow_file_range(inode, locked_folio, start, end, NULL, + COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED); ASSERT(ret != 1); return ret; } @@ -1913,61 +1925,14 @@ static int can_nocow_file_extent(struct btrfs_path *path, return ret < 0 ? ret : can_nocow; } -/* - * Cleanup the dirty folios which will never be submitted due to error. - * - * When running a delalloc range, we may need to split the ranges (due to - * fragmentation or NOCOW). If we hit an error in the later part, we will error - * out and previously successfully executed range will never be submitted, thus - * we have to cleanup those folios by clearing their dirty flag, starting and - * finishing the writeback. - */ -static void cleanup_dirty_folios(struct btrfs_inode *inode, - struct folio *locked_folio, - u64 start, u64 end, int error) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct address_space *mapping = inode->vfs_inode.i_mapping; - pgoff_t start_index = start >> PAGE_SHIFT; - pgoff_t end_index = end >> PAGE_SHIFT; - u32 len; - - ASSERT(end + 1 - start < U32_MAX); - ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && - IS_ALIGNED(end + 1, fs_info->sectorsize)); - len = end + 1 - start; - - /* - * Handle the locked folio first. - * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. - */ - btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); - - for (pgoff_t index = start_index; index <= end_index; index++) { - struct folio *folio; - - /* Already handled at the beginning. */ - if (index == locked_folio->index) - continue; - folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); - /* Cache already dropped, no need to do any cleanup. 
*/ - if (IS_ERR(folio)) - continue; - btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); - folio_unlock(folio); - folio_put(folio); - } - mapping_set_error(mapping, error); -} - static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, struct extent_state **cached, struct can_nocow_file_extent_args *nocow_args, u64 file_pos, bool is_prealloc) { struct btrfs_ordered_extent *ordered; - u64 len = nocow_args->file_extent.num_bytes; - u64 end = file_pos + len - 1; + const u64 len = nocow_args->file_extent.num_bytes; + const u64 end = file_pos + len - 1; int ret = 0; btrfs_lock_extent(&inode->io_tree, file_pos, end, cached); @@ -1978,8 +1943,8 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { - btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); - return PTR_ERR(em); + ret = PTR_ERR(em); + goto error; } btrfs_free_extent_map(em); } @@ -1991,8 +1956,8 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio if (IS_ERR(ordered)) { if (is_prealloc) btrfs_drop_extent_map_range(inode, file_pos, end, false); - btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); - return PTR_ERR(ordered); + ret = PTR_ERR(ordered); + goto error; } if (btrfs_is_data_reloc_root(inode->root)) @@ -2004,23 +1969,30 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio ret = btrfs_reloc_clone_csums(ordered); btrfs_put_ordered_extent(ordered); + if (ret < 0) + goto error; extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); - /* - * On error, we need to cleanup the ordered extents we created. - * - * We do not clear the folio Dirty flags because they are set and - * cleaered by the caller. - */ - if (ret < 0) - btrfs_cleanup_ordered_extents(inode, file_pos, len); + PAGE_SET_ORDERED); + return ret; + +error: + btrfs_cleanup_ordered_extents(inode, file_pos, len); + extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + btrfs_err(inode->root->fs_info, + "%s failed, root=%lld inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), btrfs_ino(inode), + file_pos, len, ret); return ret; } /* - * when nowcow writeback call back. This checks for snapshots or COW copies + * When nocow writeback calls back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * * If no cow copies or snapshots exist, we write directly to the existing @@ -2037,13 +2009,23 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, /* * If not 0, represents the inclusive end of the last fallback_to_cow() * range. Only for error handling. + * + * The same for nocow_end, it's to avoid double cleaning up the range + * already cleaned by nocow_one_range(). */ u64 cow_end = 0; + u64 nocow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; u64 ino = btrfs_ino(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; + /* The range that has ordered extent(s). */ + u64 oe_cleanup_start; + u64 oe_cleanup_len = 0; + /* The range that is untouched. 
*/ + u64 untouched_start; + u64 untouched_len = 0; /* * Normally on a zoned device we're only doing COW writes, but in case @@ -2207,8 +2189,10 @@ must_cow: &nocow_args, cur_offset, extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); - if (ret < 0) + if (ret < 0) { + nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; goto error; + } cur_offset = extent_end; } btrfs_release_path(path); @@ -2225,86 +2209,105 @@ must_cow: cow_start = (u64)-1; } - btrfs_free_path(path); - return 0; - -error: /* - * There are several error cases: - * - * 1) Failed without falling back to COW - * start cur_offset end - * |/////////////| | - * - * In this case, cow_start should be (u64)-1. - * - * For range [start, cur_offset) the folios are already unlocked (except - * @locked_folio), EXTENT_DELALLOC already removed. - * Need to clear the dirty flags and finish the ordered extents. - * - * 2) Failed with error before calling fallback_to_cow() - * - * start cow_start end - * |/////////////| | - * - * In this case, only @cow_start is set, @cur_offset is between - * [cow_start, end) - * - * It's mostly the same as case 1), just replace @cur_offset with - * @cow_start. - * - * 3) Failed with error from fallback_to_cow() + * Everything is finished without an error, so we can unlock the folios now. * - * start cow_start cow_end end - * |/////////////|-----------| | - * - * In this case, both @cow_start and @cow_end is set. - * - * For range [start, cow_start) it's the same as case 1). - * But for range [cow_start, cow_end), all the cleanup is handled by - * cow_file_range(), we should not touch anything in that range. - * - * So for all above cases, if @cow_start is set, cleanup ordered extents - * for range [start, @cow_start), other wise cleanup range [start, @cur_offset). + * No need to touch the io tree range nor set folio ordered flag, as + * fallback_to_cow() and nocow_one_range() have already handled them. */ - if (cow_start != (u64)-1) - cur_offset = cow_start; + extent_clear_unlock_delalloc(inode, start, end, locked_folio, NULL, 0, PAGE_UNLOCK); - if (cur_offset > start) { - btrfs_cleanup_ordered_extents(inode, start, cur_offset - start); - cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); - } + btrfs_free_path(path); + return 0; - /* - * If an error happened while a COW region is outstanding, cur_offset - * needs to be reset to @cow_end + 1 to skip the COW range, as - * cow_file_range() will do the proper cleanup at error. - */ - if (cow_end) - cur_offset = cow_end + 1; +error: + if (cow_start == (u64)-1) { + /* + * case a) + * start cur_offset end + * | OE cleanup | Untouched | + * + * We finished a fallback_to_cow() or nocow_one_range() call, + * but failed to check the next range. + * + * or + * start cur_offset nocow_end end + * | OE cleanup | Skip | Untouched | + * + * nocow_one_range() failed; the range [cur_offset, nocow_end] is + * already cleaned up. + */ + oe_cleanup_start = start; + oe_cleanup_len = cur_offset - start; + if (nocow_end) + untouched_start = nocow_end + 1; + else + untouched_start = cur_offset; + untouched_len = end + 1 - untouched_start; + } else if (cow_start != (u64)-1 && cow_end == 0) { + /* + * case b) + * start cow_start cur_offset end + * | OE cleanup | Untouched | + * + * We got a range that needs COW, but we failed before we hit the next NOCOW range, + * thus [cow_start, cur_offset) doesn't yet have any OE.
+ */ + oe_cleanup_start = start; + oe_cleanup_len = cow_start - start; + untouched_start = cow_start; + untouched_len = end + 1 - untouched_start; + } else { + /* + * case c) + * start cow_start cow_end end + * | OE cleanup | Skip | Untouched | + * + * fallback_to_cow() failed, and fallback_to_cow() will do the + * cleanup for its range, so we shouldn't touch the range + * [cow_start, cow_end]. + */ + ASSERT(cow_start != (u64)-1 && cow_end != 0); + oe_cleanup_start = start; + oe_cleanup_len = cow_start - start; + untouched_start = cow_end + 1; + untouched_len = end + 1 - untouched_start; + } + + if (oe_cleanup_len) { + const u64 oe_cleanup_end = oe_cleanup_start + oe_cleanup_len - 1; + btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len); + extent_clear_unlock_delalloc(inode, oe_cleanup_start, oe_cleanup_end, + locked_folio, NULL, + EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + } - /* - * We need to lock the extent here because we're clearing DELALLOC and - * we're not locked at this point. - */ - if (cur_offset < end) { + if (untouched_len) { struct extent_state *cached = NULL; + const u64 untouched_end = untouched_start + untouched_len - 1; - btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached); - extent_clear_unlock_delalloc(inode, cur_offset, end, + /* + * We need to lock the extent here because we're clearing DELALLOC and + * we're not locked at this point. + */ + btrfs_lock_extent(&inode->io_tree, untouched_start, untouched_end, &cached); + extent_clear_unlock_delalloc(inode, untouched_start, untouched_end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); + btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL); } btrfs_free_path(path); - btrfs_err_rl(fs_info, - "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", - __func__, btrfs_root_id(inode->root), - btrfs_ino(inode), start, end + 1 - start, ret); + btrfs_err(fs_info, +"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu oe_cleanup=%llu oe_cleanup_len=%llu untouched_start=%llu untouched_len=%llu: %d", + __func__, btrfs_root_id(inode->root), btrfs_ino(inode), + start, end + 1 - start, cur_offset, oe_cleanup_start, oe_cleanup_len, + untouched_start, untouched_len, ret); return ret; } @@ -2349,8 +2352,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, true); else - ret = cow_file_range(inode, locked_folio, start, end, NULL, - false, false); + ret = cow_file_range(inode, locked_folio, start, end, NULL, 0); return ret; } @@ -2986,7 +2988,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * If we dropped an inline extent here, we know the range where it is * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the * number of bytes only for that range containing the inline extent. - * The remaining of the range will be processed when clearning the + * The remainder of the range will be processed when clearing the + * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
*/ if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { @@ -3102,14 +3104,15 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) if (!freespace_inode) btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); - if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { + if (unlikely(test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags))) { ret = -EIO; goto out; } - if (btrfs_is_zoned(fs_info)) - btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes); + ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); + if (ret) + goto out; if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; @@ -3147,7 +3150,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3155,7 +3158,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { /* Logic error */ ASSERT(list_empty(&ordered_extent->list)); - if (!list_empty(&ordered_extent->list)) { + if (unlikely(!list_empty(&ordered_extent->list))) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -3163,7 +3166,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); - if (ret) { + if (unlikely(ret)) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); } @@ -3190,20 +3193,20 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, ordered_extent->num_bytes, trans->transid); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } ret = add_pending_csums(trans, &ordered_extent->list); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3221,7 +3224,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); - if (ret) { /* -ENOMEM or corruption */ + if (unlikely(ret)) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } @@ -3327,21 +3330,47 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) return btrfs_finish_one_ordered(ordered); } +void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, + u8 *dest) +{ + struct folio *folio = page_folio(phys_to_page(paddr)); + const u32 blocksize = fs_info->sectorsize; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + + shash->tfm = fs_info->csum_shash; + /* The full block must be inside the folio. 
*/ + ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); + + if (folio_test_partial_kmap(folio)) { + size_t cur = paddr; + + crypto_shash_init(shash); + while (cur < paddr + blocksize) { + void *kaddr; + size_t len = min(paddr + blocksize - cur, + PAGE_SIZE - offset_in_page(cur)); + + kaddr = kmap_local_folio(folio, offset_in_folio(folio, cur)); + crypto_shash_update(shash, kaddr, len); + kunmap_local(kaddr); + cur += len; + } + crypto_shash_final(shash, dest); + } else { + crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, dest); + } +} /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. * * @kaddr must be a properly kmapped address. */ -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, - const u8 * const csum_expected) +int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, + const u8 * const csum_expected) { - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - - shash->tfm = fs_info->csum_shash; - crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - - if (memcmp(csum, csum_expected, fs_info->csum_size)) + btrfs_calculate_block_csum(fs_info, paddr, csum); + if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0)) return -EIO; return 0; } @@ -3360,17 +3389,16 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum * Return %true if the sector is ok or had no checksum to start with, else %false. */ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, - u32 bio_offset, struct bio_vec *bv) + u32 bio_offset, phys_addr_t paddr) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 blocksize = fs_info->sectorsize; + struct folio *folio; u64 file_offset = bbio->file_offset + bio_offset; - u64 end = file_offset + bv->bv_len - 1; + u64 end = file_offset + blocksize - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; - void *kaddr; - - ASSERT(bv->bv_len == fs_info->sectorsize); if (!bbio->csum) return true; @@ -3386,12 +3414,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; - kaddr = bvec_kmap_local(bv); - if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) { - kunmap_local(kaddr); + if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected)) goto zeroit; - } - kunmap_local(kaddr); return true; zeroit: @@ -3399,7 +3423,9 @@ zeroit: bbio->mirror_num); if (dev) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); - memzero_bvec(bv); + folio = page_folio(phys_to_page(paddr)); + ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); + folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize); return false; } @@ -3513,7 +3539,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, int ret; ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); - if (ret && ret != -EEXIST) { + if (unlikely(ret && ret != -EEXIST)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -3885,10 +3911,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(inode); - if (ret) - goto out; - ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; @@ -3920,8 +3942,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path 
i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(inode, 0, - round_up(i_size_read(vfs_inode), fs_info->sectorsize)); inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); @@ -3953,6 +3973,11 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path btrfs_set_inode_mapping_order(inode); cache_index: + ret = btrfs_init_file_extent_tree(inode); + if (ret) + goto out; + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any @@ -4263,7 +4288,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, } ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); - if (ret) { + if (unlikely(ret)) { btrfs_crit(fs_info, "failed to delete reference to %.*s, root %llu inode %llu parent %llu", name->len, name->name, btrfs_root_id(root), ino, dir_ino); @@ -4275,7 +4300,7 @@ skip_backref: rename_ctx->index = index; ret = btrfs_delete_delayed_dir_index(trans, dir, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -4430,7 +4455,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(leaf, di, &key); WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -4461,14 +4486,14 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_del_root_ref(trans, objectid, btrfs_root_id(root), dir_ino, &index, &fname.disk_name); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_delete_delayed_dir_index(trans, dir, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -4526,7 +4551,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. 
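The btrfs_calculate_block_csum() helper added further up hashes a block with a single crypto_shash_digest() call when the whole block is directly addressable, and otherwise falls back to an init/update/final walk over page-sized pieces (the folio_test_partial_kmap() branch). The correctness of that fallback rests on a property of incremental hashing: feeding the data in chunks yields the same digest as hashing it in one shot. A self-contained user-space sketch of that property, with a bitwise CRC32C (btrfs's default checksum) standing in for the kernel's csum_shash and an arbitrary 1K chunk standing in for PAGE_SIZE; all names here are illustrative:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Bitwise CRC32C (Castagnoli), reflected, polynomial 0x82F63B78. */
	static uint32_t crc32c_update(uint32_t crc, const uint8_t *data, size_t len)
	{
		while (len--) {
			crc ^= *data++;
			for (int k = 0; k < 8; k++)
				crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
		}
		return crc;
	}

	int main(void)
	{
		uint8_t block[4096];		/* one 4K filesystem block */
		const size_t chunk = 1024;	/* stand-in for PAGE_SIZE */
		uint32_t whole;
		uint32_t piecewise = 0xFFFFFFFF;

		for (size_t i = 0; i < sizeof(block); i++)
			block[i] = (uint8_t)(i * 31 + 7);

		/* One-shot digest, like the crypto_shash_digest() path. */
		whole = ~crc32c_update(0xFFFFFFFF, block, sizeof(block));

		/* Chunked walk, like the kmap_local_folio() loop. */
		for (size_t off = 0; off < sizeof(block); off += chunk)
			piecewise = crc32c_update(piecewise, block + off, chunk);
		piecewise = ~piecewise;

		assert(whole == piecewise);
		printf("crc32c = 0x%08x (one-shot == chunked)\n", whole);
		return 0;
	}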
@@ -4557,7 +4582,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root) inode = btrfs_find_first_inode(root, min_ino); while (inode) { - if (atomic_read(&inode->vfs_inode.i_count) > 1) + if (icount_read(&inode->vfs_inode) > 1) d_prune_aliases(&inode->vfs_inode); min_ino = btrfs_ino(inode) + 1; @@ -4640,13 +4665,13 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_record_snapshot_destroy(trans, dir); ret = btrfs_unlink_subvol(trans, dir, dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } ret = btrfs_record_root_in_trans(trans, dest); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4660,7 +4685,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, btrfs_root_id(dest)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4668,7 +4693,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest)); - if (ret && ret != -ENOENT) { + if (unlikely(ret && ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4677,7 +4702,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) dest->root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(dest)); - if (ret && ret != -ENOENT) { + if (unlikely(ret && ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4817,7 +4842,7 @@ again: folio_put(folio); goto again; } - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { ret = -EIO; goto out_unlock; } @@ -4905,7 +4930,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e goto out; /* - * Skip the truncatioin if the range in the target block is already aligned. + * Skip the truncation if the range in the target block is already aligned. * The seemingly complex check will also handle the same block case. */ if (in_head_block && !IS_ALIGNED(start, blocksize)) @@ -4961,7 +4986,7 @@ again: folio_put(folio); goto again; } - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { ret = -EIO; goto out_unlock; } @@ -5081,7 +5106,7 @@ static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len) drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -5601,8 +5626,8 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, } btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); - if (location->type != BTRFS_INODE_ITEM_KEY && - location->type != BTRFS_ROOT_ITEM_KEY) { + if (unlikely(location->type != BTRFS_INODE_ITEM_KEY && + location->type != BTRFS_ROOT_ITEM_KEY)) { ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", @@ -5696,7 +5721,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode) bool empty = false; xa_lock(&root->inodes); - entry = __xa_erase(&root->inodes, btrfs_ino(inode)); + /* + * This btrfs_inode is being freed and has already been unhashed at this + * point. 
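The conditional delete described by the comment being added here can be sketched in isolation: remove the slot's entry only if it still points at the inode being freed, so a re-inserted successor is left alone. A stand-alone analogue using C11 atomics on a single pointer slot in place of the xarray (names illustrative):

#include <stdatomic.h>
#include <stdio.h>

struct inode_stub { int ino; };

/* Erase @slot only if it still holds @old, like __xa_cmpxchg(..., old, NULL). */
static int erase_if_ours(_Atomic(struct inode_stub *) *slot, struct inode_stub *old)
{
	struct inode_stub *expected = old;

	return atomic_compare_exchange_strong(slot, &expected, NULL);
}

int main(void)
{
	struct inode_stub old_inode = { 257 }, new_inode = { 257 };
	_Atomic(struct inode_stub *) slot = &new_inode; /* already re-inserted */

	printf("stale owner: %d\n", erase_if_ours(&slot, &old_inode)); /* 0: kept */
	printf("live owner:  %d\n", erase_if_ours(&slot, &new_inode)); /* 1: gone */
	return 0;
}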
It's possible that another btrfs_inode has already been + * allocated for the same inode and inserted itself into the root, so + * don't delete it in that case. + * + * Note that this shouldn't need to allocate memory, so the gfp flags + * don't really matter. + */ + entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL, + GFP_ATOMIC); if (entry == inode) empty = xa_empty(&root->inodes); xa_unlock(&root->inodes); @@ -5883,7 +5918,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_CAST(inode); /* Do extra check against inode mode with di_type */ - if (btrfs_inode_type(inode) != di_type) { + if (unlikely(btrfs_inode_type(inode) != di_type)) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", inode->vfs_inode.i_mode, btrfs_inode_type(inode), @@ -6470,6 +6505,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (!args->subvol) btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); + btrfs_set_inode_mapping_order(BTRFS_I(inode)); if (S_ISREG(inode->i_mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; @@ -6477,7 +6513,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; btrfs_update_inode_mapping_flags(BTRFS_I(inode)); - btrfs_set_inode_mapping_order(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); @@ -6524,7 +6559,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); batch.nr = args->orphan ? 1 : 2; ret = btrfs_insert_empty_items(trans, root, path, &batch); - if (ret != 0) { + if (unlikely(ret != 0)) { btrfs_abort_transaction(trans, ret); goto discard; } @@ -6601,7 +6636,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, */ if (!args->subvol) { ret = btrfs_init_inode_security(trans, args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto discard; } @@ -6621,14 +6656,14 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (args->orphan) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto discard; } } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 0, BTRFS_I(inode)->dir_index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto discard; } @@ -6659,7 +6694,7 @@ out: */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct fscrypt_str *name, int add_backref, u64 index) + const struct fscrypt_str *name, bool add_backref, u64 index) { int ret = 0; struct btrfs_key key; @@ -6692,7 +6727,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_inode_type(inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; - else if (ret) { + else if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -6805,7 +6840,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct fscrypt_name fname; u64 index; int ret; - int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root)) @@ -6837,44 +6871,44 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, /* There are several dir indexes for this inode, clear the cache. 
*/ BTRFS_I(inode)->dir_index = 0ULL; - inc_nlink(inode); inode_inc_iversion(inode); inode_set_ctime_current(inode); - ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), &fname.disk_name, 1, index); + if (ret) + goto fail; - if (ret) { - drop_inode = 1; - } else { - struct dentry *parent = dentry->d_parent; + /* Link added now we update the inode item with the new link count. */ + inc_nlink(inode); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + goto fail; + } - ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) + if (inode->i_nlink == 1) { + /* + * If the new hard link count is 1, it's a file created with the + * open(2) O_TMPFILE flag. + */ + ret = btrfs_orphan_del(trans, BTRFS_I(inode)); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); goto fail; - if (inode->i_nlink == 1) { - /* - * If new hard link count is 1, it's a file created - * with open(2) O_TMPFILE flag. - */ - ret = btrfs_orphan_del(trans, BTRFS_I(inode)); - if (ret) - goto fail; } - d_instantiate(dentry, inode); - btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); } + /* Grab reference for the new dentry passed to d_instantiate(). */ + ihold(inode); + d_instantiate(dentry, inode); + btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent); + fail: fscrypt_free_filename(&fname); if (trans) btrfs_end_transaction(trans); - if (drop_inode) { - inode_dec_link_count(inode); - iput(inode); - } btrfs_btree_balance_dirty(fs_info); return ret; } @@ -7068,7 +7102,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ - if (!S_ISREG(inode->vfs_inode.i_mode)) { + if (unlikely(!S_ISREG(inode->vfs_inode.i_mode))) { ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", @@ -7145,7 +7179,7 @@ not_found: insert: ret = 0; btrfs_release_path(path); - if (em->start > start || btrfs_extent_map_end(em) <= start) { + if (unlikely(em->start > start || btrfs_extent_map_end(em) <= start)) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); @@ -7830,6 +7864,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; + /* new_delalloc_bytes and last_dir_index_offset are in a union. 
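The nlink == 1 branch in the btrfs_link() rework above is reached when a file created with O_TMPFILE gains its first name, which is the moment the orphan item added at creation time can be dropped. A user-space sequence that exercises exactly that transition on Linux (error handling trimmed, paths illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Anonymous file: nlink == 0, tracked by the filesystem as an orphan. */
	int fd = open("/tmp", O_TMPFILE | O_WRONLY, 0600);
	char path[64];

	if (fd < 0) {
		perror("open");
		return 1;
	}
	write(fd, "hello\n", 6);
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
	/* Giving it a name takes nlink 0 -> 1; btrfs deletes the orphan item. */
	if (linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/now-visible", AT_SYMLINK_FOLLOW))
		perror("linkat");
	close(fd);
	return 0;
}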
*/ ei->new_delalloc_bytes = 0; ei->defrag_bytes = 0; ei->disk_i_size = 0; @@ -7964,7 +7999,7 @@ int btrfs_drop_inode(struct inode *inode) if (btrfs_root_refs(&root->root_item) == 0) return 1; else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } static void init_once(void *foo) @@ -7972,6 +8007,9 @@ static void init_once(void *foo) struct btrfs_inode *ei = foo; inode_init_once(&ei->vfs_inode); +#ifdef CONFIG_FS_VERITY + ei->i_verity_info = NULL; +#endif } void __cold btrfs_destroy_cachep(void) @@ -8173,7 +8211,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { - if (need_abort) + if (unlikely(need_abort)) btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8221,7 +8259,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8229,12 +8267,12 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8243,7 +8281,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8251,12 +8289,12 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8264,14 +8302,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), new_name, 0, old_idx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), old_name, 0, new_idx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8512,7 +8550,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8520,12 +8558,12 @@ static int btrfs_rename(struct mnt_idmap *idmap, ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8536,7 +8574,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), 
new_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8545,7 +8583,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8553,7 +8591,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8562,7 +8600,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), &new_fname.disk_name, 0, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8576,7 +8614,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (flags & RENAME_WHITEOUT) { ret = btrfs_create_new_inode(trans, &whiteout_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } else { @@ -8870,7 +8908,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out; path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); discard_new_inode(inode); @@ -8882,7 +8920,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_free_path(path); discard_new_inode(inode); @@ -9095,7 +9133,7 @@ next: ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans); @@ -9263,7 +9301,7 @@ static ssize_t btrfs_encoded_read_inline( ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), extent_start, 0); if (ret) { - if (ret > 0) { + if (unlikely(ret > 0)) { /* The extent item disappeared? */ return -EIO; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7e13de2bdcbf..a454b5ba2097 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -376,13 +376,13 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (comp) { ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, strlen(comp), 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0); - if (ret && ret != -ENODATA) { + if (unlikely(ret && ret != -ENODATA)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -633,7 +633,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_clear_buffer_dirty(trans, leaf); btrfs_tree_unlock(leaf); ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1); - if (ret2 < 0) + if (unlikely(ret2 < 0)) btrfs_abort_transaction(trans, ret2); free_extent_buffer(leaf); goto out; @@ -654,14 +654,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap, /* ... and new_root is owned by new_inode_args.inode now. 
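Many hunks in this series do nothing more than wrap error checks that end in btrfs_abort_transaction() with unlikely(). The annotation is purely a branch-layout hint to the compiler, not a semantic change; a reduced illustration of what the kernel macro expands to, assuming GCC or Clang:

#include <stdio.h>

/* As in the kernel: hint that the condition is almost always false. */
#define unlikely(x) __builtin_expect(!!(x), 0)

static int do_step(int err)
{
	if (unlikely(err)) {	/* cold path, laid out away from the hot code */
		fprintf(stderr, "aborting: %d\n", err);
		return err;
	}
	return 0;		/* hot path falls straight through */
}

int main(void)
{
	return do_step(0);
}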
*/ ret = btrfs_record_root_in_trans(trans, new_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -669,7 +669,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_record_new_subvolume(trans, BTRFS_I(dir)); ret = btrfs_create_new_inode(trans, &new_inode_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -957,7 +957,7 @@ static noinline int btrfs_mksnapshot(struct dentry *parent, /* * Force new buffered writes to reserve space even when NOCOW is - * possible. This is to avoid later writeback (running dealloc) to + * possible. This is to avoid later writeback (running delalloc) to * fallback to COW mode and unexpectedly fail with ENOSPC. */ btrfs_drew_read_lock(&root->snapshot_lock); @@ -1251,7 +1251,7 @@ out: } static noinline int btrfs_ioctl_snap_create(struct file *file, - void __user *arg, int subvol) + void __user *arg, bool subvol) { struct btrfs_ioctl_vol_args *vol_args; int ret; @@ -2133,7 +2133,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) ret = btrfs_next_leaf(fs_info->tree_root, path); if (ret < 0) { goto out; - } else if (ret > 0) { + } else if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -2216,7 +2216,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, ret = btrfs_next_leaf(root, path); if (ret < 0) { goto out; - } else if (ret > 0) { + } else if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -2245,7 +2245,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, ret = btrfs_next_item(root, path); if (ret < 0) { goto out; - } else if (ret > 0) { + } else if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -4008,7 +4008,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(root)); - if (ret && ret != -ENOENT) { + if (unlikely(ret && ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; @@ -4032,7 +4032,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, ret = btrfs_uuid_tree_add(trans, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(root)); - if (ret < 0 && ret != -EEXIST) { + if (unlikely(ret < 0 && ret != -EEXIST)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; @@ -4418,6 +4418,10 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_encoded_io_args_32 args32; @@ -4509,6 +4513,7 @@ out_acct: static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) { + struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); struct btrfs_ioctl_encoded_io_args args; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -4522,6 +4527,11 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } + if (!(file->f_mode & FMODE_WRITE)) { ret = -EBADF; goto out_acct; @@ -4780,14 +4790,14 @@ out_fail: static int btrfs_uring_encoded_read(struct io_uring_cmd 
*cmd, unsigned int issue_flags) { + struct file *file = cmd->file; + struct btrfs_inode *inode = BTRFS_I(file->f_inode); + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; int ret; u64 disk_bytenr, disk_io_size; - struct file *file; - struct btrfs_inode *inode; - struct btrfs_fs_info *fs_info; - struct extent_io_tree *io_tree; loff_t pos; struct kiocb kiocb; struct extent_state *cached_state = NULL; @@ -4803,10 +4813,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue ret = -EPERM; goto out_acct; } - file = cmd->file; - inode = BTRFS_I(file->f_inode); - fs_info = inode->root->fs_info; - io_tree = &inode->io_tree; + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (issue_flags & IO_URING_F_COMPAT) { @@ -4933,9 +4944,10 @@ out_acct: static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) { + struct file *file = cmd->file; + struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); loff_t pos; struct kiocb kiocb; - struct file *file; ssize_t ret; void __user *sqe_addr; struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); @@ -4948,8 +4960,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu ret = -EPERM; goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } - file = cmd->file; sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (!(file->f_mode & FMODE_WRITE)) { @@ -5223,13 +5238,13 @@ long btrfs_ioctl(struct file *file, unsigned int case FITRIM: return btrfs_ioctl_fitrim(fs_info, argp); case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, argp, 0); + return btrfs_ioctl_snap_create(file, argp, false); case BTRFS_IOC_SNAP_CREATE_V2: - return btrfs_ioctl_snap_create_v2(file, argp, 0); + return btrfs_ioctl_snap_create_v2(file, argp, false); case BTRFS_IOC_SUBVOL_CREATE: - return btrfs_ioctl_snap_create(file, argp, 1); + return btrfs_ioctl_snap_create(file, argp, true); case BTRFS_IOC_SUBVOL_CREATE_V2: - return btrfs_ioctl_snap_create_v2(file, argp, 1); + return btrfs_ioctl_snap_create_v2(file, argp, true); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp, false); case BTRFS_IOC_SNAP_DESTROY_V2: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index a3e6d9616e60..0035851d72b0 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -361,7 +361,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) atomic_inc(&lock->readers); /* - * Ensure the pending reader count is perceieved BEFORE this reader + * Ensure the pending reader count is perceived BEFORE this reader * goes to sleep in case of active writers. This guarantees new writers * won't be allowed and that the current reader will be woken up when * the last active writer finishes its jobs. diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index af29df98ac14..a4673e7d95d7 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -74,7 +74,7 @@ enum btrfs_lock_nesting { BTRFS_NESTING_NEW_ROOT, /* - * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so + * We are limited to MAX_LOCKDEP_SUBCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over * the limit. 
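The guard this comment describes can be reproduced in miniature: let the enum count its own entries and fail the build when the count outgrows the lock-class budget. A sketch with C11 _Static_assert, the limit of 8 assumed to mirror the kernel constant:

#define MAX_LOCKDEP_SUBCLASSES 8	/* assumed, mirrors the kernel's value */

enum demo_lock_nesting {
	NESTING_NORMAL,
	NESTING_COW,
	NESTING_LEFT,
	NESTING_RIGHT,
	NESTING_NEW_ROOT,
	NESTING_MAX,	/* must stay last so it equals the subclass count */
};

_Static_assert(NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
	       "too many lock subclasses");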
As of this writing we're limited to 8, and we're * definitely using 8, hence this check to keep us from messing up in diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d403641889ca..4758f66da449 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -58,9 +58,6 @@ * 0x1000 | SegHdr N+1| Data payload N+1 ... | */ -#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) -#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) - struct workspace { void *mem; void *buf; /* where decompressed data goes */ @@ -68,7 +65,14 @@ struct workspace { struct list_head list; }; -static struct workspace_manager wsm; +static u32 workspace_buf_length(const struct btrfs_fs_info *fs_info) +{ + return lzo1x_worst_compress(fs_info->sectorsize); +} +static u32 workspace_cbuf_length(const struct btrfs_fs_info *fs_info) +{ + return lzo1x_worst_compress(fs_info->sectorsize); +} void lzo_free_workspace(struct list_head *ws) { @@ -80,7 +84,7 @@ void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *lzo_alloc_workspace(void) +struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info) { struct workspace *workspace; @@ -89,8 +93,8 @@ struct list_head *lzo_alloc_workspace(void) return ERR_PTR(-ENOMEM); workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN); - workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); - workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); + workspace->buf = kvmalloc(workspace_buf_length(fs_info), GFP_KERNEL | __GFP_NOWARN); + workspace->cbuf = kvmalloc(workspace_cbuf_length(fs_info), GFP_KERNEL | __GFP_NOWARN); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -128,19 +132,21 @@ static inline size_t read_compress_length(const char *buf) * * Will allocate new pages when needed. 
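workspace_buf_length() and workspace_cbuf_length() above now size the LZO scratch buffers from the filesystem block size rather than PAGE_SIZE. The kernel's lzo1x_worst_compress(x) bound expands to x + x/16 + 64 + 3, so the cost per block size can be checked directly:

#include <stdio.h>

/* Mirrors include/linux/lzo.h. */
#define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3)

int main(void)
{
	const unsigned int sizes[] = { 4096, 16384, 65536 };

	for (int i = 0; i < 3; i++)
		printf("block %6u -> worst case %6u bytes\n",
		       sizes[i], lzo1x_worst_compress(sizes[i]));
	return 0;
}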
*/ -static int copy_compressed_data_to_page(char *compressed_data, +static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info, + char *compressed_data, size_t compressed_size, struct folio **out_folios, unsigned long max_nr_folio, - u32 *cur_out, - const u32 sectorsize) + u32 *cur_out) { + const u32 sectorsize = fs_info->sectorsize; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; u32 sector_bytes_left; u32 orig_out; struct folio *cur_folio; char *kaddr; - if ((*cur_out / PAGE_SIZE) >= max_nr_folio) + if ((*cur_out >> min_folio_shift) >= max_nr_folio) return -E2BIG; /* @@ -149,18 +155,17 @@ static int copy_compressed_data_to_page(char *compressed_data, */ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); - cur_folio = out_folios[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out >> min_folio_shift]; /* Allocate a new page */ if (!cur_folio) { - cur_folio = btrfs_alloc_compr_folio(); + cur_folio = btrfs_alloc_compr_folio(fs_info); if (!cur_folio) return -ENOMEM; - out_folios[*cur_out / PAGE_SIZE] = cur_folio; + out_folios[*cur_out >> min_folio_shift] = cur_folio; } - kaddr = kmap_local_folio(cur_folio, 0); - write_compress_length(kaddr + offset_in_page(*cur_out), - compressed_size); + kaddr = kmap_local_folio(cur_folio, offset_in_folio(cur_folio, *cur_out)); + write_compress_length(kaddr, compressed_size); *cur_out += LZO_LEN; orig_out = *cur_out; @@ -172,20 +177,20 @@ static int copy_compressed_data_to_page(char *compressed_data, kunmap_local(kaddr); - if ((*cur_out / PAGE_SIZE) >= max_nr_folio) + if ((*cur_out >> min_folio_shift) >= max_nr_folio) return -E2BIG; - cur_folio = out_folios[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out >> min_folio_shift]; /* Allocate a new page */ if (!cur_folio) { - cur_folio = btrfs_alloc_compr_folio(); + cur_folio = btrfs_alloc_compr_folio(fs_info); if (!cur_folio) return -ENOMEM; - out_folios[*cur_out / PAGE_SIZE] = cur_folio; + out_folios[*cur_out >> min_folio_shift] = cur_folio; } kaddr = kmap_local_folio(cur_folio, 0); - memcpy(kaddr + offset_in_page(*cur_out), + memcpy(kaddr + offset_in_folio(cur_folio, *cur_out), compressed_data + *cur_out - orig_out, copy_len); *cur_out += copy_len; @@ -209,12 +214,15 @@ out: return 0; } -int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, +int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); - const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize; + const u32 sectorsize = fs_info->sectorsize; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); + struct address_space *mapping = inode->vfs_inode.i_mapping; struct folio *folio_in = NULL; char *sizes_ptr; const unsigned long max_nr_folio = *out_folios; @@ -263,9 +271,9 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, goto out; } - ret = copy_compressed_data_to_page(workspace->cbuf, out_len, + ret = copy_compressed_data_to_page(fs_info, workspace->cbuf, out_len, folios, max_nr_folio, - &cur_out, sectorsize); + &cur_out); if (ret < 0) goto out; @@ -280,8 +288,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, goto out; } - /* Check if we have reached page boundary */ - if (PAGE_ALIGNED(cur_in)) { + /* Check if we have reached folio 
boundary. */ + if (IS_ALIGNED(cur_in, min_folio_size)) { folio_put(folio_in); folio_in = NULL; } @@ -298,7 +306,7 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, out: if (folio_in) folio_put(folio_in); - *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE); + *out_folios = DIV_ROUND_UP(cur_out, min_folio_size); return ret; } @@ -310,15 +318,16 @@ out: static void copy_compressed_segment(struct compressed_bio *cb, char *dest, u32 len, u32 *cur_in) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - struct folio *cur_folio; - u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), - orig_in + len - *cur_in); + struct folio *cur_folio = cb->compressed_folios[*cur_in >> min_folio_shift]; + u32 copy_len = min_t(u32, orig_in + len - *cur_in, + folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in)); ASSERT(copy_len); - cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE]; memcpy_from_folio(dest + *cur_in - orig_in, cur_folio, offset_in_folio(cur_folio, *cur_in), copy_len); @@ -332,6 +341,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct workspace *workspace = list_entry(ws, struct workspace, list); const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; char *kaddr; int ret; /* Compressed data length, can be unaligned */ @@ -378,14 +388,14 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) */ ASSERT(cur_in / sectorsize == (cur_in + LZO_LEN - 1) / sectorsize); - cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE]; + cur_folio = cb->compressed_folios[cur_in >> min_folio_shift]; ASSERT(cur_folio); kaddr = kmap_local_folio(cur_folio, 0); - seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); + seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in)); kunmap_local(kaddr); cur_in += LZO_LEN; - if (unlikely(seg_len > WORKSPACE_CBUF_LENGTH)) { + if (unlikely(seg_len > workspace_cbuf_length(fs_info))) { struct btrfs_inode *inode = cb->bbio.inode; /* @@ -445,19 +455,19 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; - size_t max_segment_len = WORKSPACE_BUF_LENGTH; + size_t max_segment_len = workspace_buf_length(fs_info); int ret = 0; - if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) + if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)) return -EUCLEAN; in_len = read_compress_length(data_in); - if (in_len != srclen) + if (unlikely(in_len != srclen)) return -EUCLEAN; data_in += LZO_LEN; in_len = read_compress_length(data_in); - if (in_len != srclen - LZO_LEN * 2) { + if (unlikely(in_len != srclen - LZO_LEN * 2)) { ret = -EUCLEAN; goto out; } @@ -487,8 +497,7 @@ out: return ret; } -const struct btrfs_compress_op btrfs_lzo_compress = { - .workspace_manager = &wsm, +const struct btrfs_compress_levels btrfs_lzo_compress = { .max_level = 1, .default_level = 1, }; diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 363fd28c0268..a0cf8effe008 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -18,6 +18,7 @@ static const char fs_state_chars[] = { [BTRFS_FS_STATE_REMOUNTING] = 'M', [BTRFS_FS_STATE_RO] = 0, [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_LOG_REPLAY_ABORTED] = 'O', 
[BTRFS_FS_STATE_DEV_REPLACING] = 'R', [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, [BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C', diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 022ebc89af85..4416c165644f 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -4,7 +4,6 @@ #define BTRFS_MESSAGES_H #include <linux/types.h> -#include <linux/types.h> #include <linux/printk.h> #include <linux/bug.h> diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index ff5eac84d819..60f9b000d644 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -11,6 +11,7 @@ #include <linux/pagemap.h> #include <linux/math64.h> #include <linux/rbtree.h> +#include <linux/bio.h> /* * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. @@ -20,6 +21,54 @@ name = (1U << __ ## name ## _BIT), \ __ ## name ## _SEQ = __ ## name ## _BIT +static inline phys_addr_t bio_iter_phys(struct bio *bio, struct bvec_iter *iter) +{ + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + return bvec_phys(&bv); +} + +/* + * Iterate bio using btrfs block size. + * + * This will handle large folio and highmem. + * + * @paddr: Physical memory address of each iteration + * @bio: The bio to iterate + * @iter: The bvec_iter (pointer) to use. + * @blocksize: The blocksize to iterate. + * + * This requires all folios in the bio to cover at least one block. + */ +#define btrfs_bio_for_each_block(paddr, bio, iter, blocksize) \ + for (; (iter)->bi_size && \ + (paddr = bio_iter_phys((bio), (iter)), 1); \ + bio_advance_iter_single((bio), (iter), (blocksize))) + +/* Initialize a bvec_iter to the size of the specified bio. */ +static inline struct bvec_iter init_bvec_iter_for_bio(struct bio *bio) +{ + struct bio_vec *bvec; + u32 bio_size = 0; + int i; + + bio_for_each_bvec_all(bvec, bio, i) + bio_size += bvec->bv_len; + + return (struct bvec_iter) { + .bi_sector = 0, + .bi_size = bio_size, + .bi_idx = 0, + .bi_bvec_done = 0, + }; +} + +#define btrfs_bio_for_each_block_all(paddr, bio, blocksize) \ + for (struct bvec_iter iter = init_bvec_iter_for_bio(bio); \ + (iter).bi_size && \ + (paddr = bio_iter_phys((bio), &(iter)), 1); \ + bio_advance_iter_single((bio), &(iter), (blocksize))) + static inline void cond_wake_up(struct wait_queue_head *wq) { /* diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 74e38da9bd39..62b993fae54f 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -6,12 +6,19 @@ #include "messages.h" #include "ctree.h" #include "disk-io.h" +#include "file-item.h" #include "print-tree.h" #include "accessors.h" #include "tree-checker.h" #include "volumes.h" #include "raid-stripe-tree.h" +/* + * Large enough buffer size for the stringification of any key type yet short + * enough to use the stack and avoid allocations. 
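key_type_string() in the print-tree hunks below formats into a caller-supplied 32-byte stack buffer instead of allocating. A user-space analogue of the lookup-with-fallback, using snprintf in place of the kernel's scnprintf and only a small illustrative subset of the key table:

#include <stdio.h>

#define KEY_TYPE_BUF_SIZE 32

static void key_type_string(unsigned char type, char *buf, int buf_size)
{
	static const char *key_to_str[256] = {
		[1]  = "INODE_ITEM",	/* BTRFS_INODE_ITEM_KEY */
		[12] = "INODE_REF",	/* BTRFS_INODE_REF_KEY */
		[84] = "DIR_ITEM",	/* BTRFS_DIR_ITEM_KEY */
	};

	if (key_to_str[type])
		snprintf(buf, buf_size, "%s", key_to_str[type]);
	else
		snprintf(buf, buf_size, "UNKNOWN.%d", type);
}

int main(void)
{
	char buf[KEY_TYPE_BUF_SIZE];

	key_type_string(84, buf, sizeof(buf));
	puts(buf);			/* DIR_ITEM */
	key_type_string(7, buf, sizeof(buf));
	puts(buf);			/* UNKNOWN.7 */
	return 0;
}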
+ */ +#define KEY_TYPE_BUF_SIZE 32 + struct root_name_map { u64 id; const char *name; @@ -227,21 +234,209 @@ static void print_eb_refs_lock(const struct extent_buffer *eb) #endif } +static void print_timespec(const struct extent_buffer *eb, + struct btrfs_timespec *timespec, + const char *prefix, const char *suffix) +{ + const u64 secs = btrfs_timespec_sec(eb, timespec); + const u32 nsecs = btrfs_timespec_nsec(eb, timespec); + + pr_info("%s%llu.%u%s", prefix, secs, nsecs, suffix); +} + +static void print_inode_item(const struct extent_buffer *eb, int i) +{ + struct btrfs_inode_item *ii = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + + pr_info("\t\tinode generation %llu transid %llu size %llu nbytes %llu\n", + btrfs_inode_generation(eb, ii), btrfs_inode_transid(eb, ii), + btrfs_inode_size(eb, ii), btrfs_inode_nbytes(eb, ii)); + pr_info("\t\tblock group %llu mode %o links %u uid %u gid %u\n", + btrfs_inode_block_group(eb, ii), btrfs_inode_mode(eb, ii), + btrfs_inode_nlink(eb, ii), btrfs_inode_uid(eb, ii), + btrfs_inode_gid(eb, ii)); + pr_info("\t\trdev %llu sequence %llu flags 0x%llx\n", + btrfs_inode_rdev(eb, ii), btrfs_inode_sequence(eb, ii), + btrfs_inode_flags(eb, ii)); + print_timespec(eb, &ii->atime, "\t\tatime ", "\n"); + print_timespec(eb, &ii->ctime, "\t\tctime ", "\n"); + print_timespec(eb, &ii->mtime, "\t\tmtime ", "\n"); + print_timespec(eb, &ii->otime, "\t\totime ", "\n"); +} + +static void print_dir_item(const struct extent_buffer *eb, int i) +{ + const u32 size = btrfs_item_size(eb, i); + struct btrfs_dir_item *di = btrfs_item_ptr(eb, i, struct btrfs_dir_item); + u32 cur = 0; + + while (cur < size) { + const u32 name_len = btrfs_dir_name_len(eb, di); + const u32 data_len = btrfs_dir_data_len(eb, di); + const u32 len = sizeof(*di) + name_len + data_len; + struct btrfs_key location; + + btrfs_dir_item_key_to_cpu(eb, di, &location); + pr_info("\t\tlocation key (%llu %u %llu) type %d\n", + location.objectid, location.type, location.offset, + btrfs_dir_ftype(eb, di)); + pr_info("\t\ttransid %llu data_len %u name_len %u\n", + btrfs_dir_transid(eb, di), data_len, name_len); + di = (struct btrfs_dir_item *)((char *)di + len); + cur += len; + } +} + +static void print_inode_ref_item(const struct extent_buffer *eb, int i) +{ + const u32 size = btrfs_item_size(eb, i); + struct btrfs_inode_ref *ref = btrfs_item_ptr(eb, i, struct btrfs_inode_ref); + u32 cur = 0; + + while (cur < size) { + const u64 index = btrfs_inode_ref_index(eb, ref); + const u32 name_len = btrfs_inode_ref_name_len(eb, ref); + const u32 len = sizeof(*ref) + name_len; + + pr_info("\t\tindex %llu name_len %u\n", index, name_len); + ref = (struct btrfs_inode_ref *)((char *)ref + len); + cur += len; + } +} + +static void print_inode_extref_item(const struct extent_buffer *eb, int i) +{ + const u32 size = btrfs_item_size(eb, i); + struct btrfs_inode_extref *extref; + u32 cur = 0; + + extref = btrfs_item_ptr(eb, i, struct btrfs_inode_extref); + while (cur < size) { + const u64 index = btrfs_inode_extref_index(eb, extref); + const u32 name_len = btrfs_inode_extref_name_len(eb, extref); + const u64 parent = btrfs_inode_extref_parent(eb, extref); + const u32 len = sizeof(*extref) + name_len; + + pr_info("\t\tindex %llu parent %llu name_len %u\n", + index, parent, name_len); + extref = (struct btrfs_inode_extref *)((char *)extref + len); + cur += len; + } +} + +static void print_dir_log_index_item(const struct extent_buffer *eb, int i) +{ + struct btrfs_dir_log_item *dlog; + + dlog = btrfs_item_ptr(eb, i, struct 
btrfs_dir_log_item); + pr_info("\t\tdir log end %llu\n", btrfs_dir_log_end(eb, dlog)); +} + +static void print_extent_csum(const struct extent_buffer *eb, int i) +{ + const struct btrfs_fs_info *fs_info = eb->fs_info; + const u32 size = btrfs_item_size(eb, i); + const u32 csum_bytes = (size / fs_info->csum_size) * fs_info->sectorsize; + struct btrfs_key key; + + btrfs_item_key_to_cpu(eb, &key, i); + pr_info("\t\trange start %llu end %llu length %u\n", + key.offset, key.offset + csum_bytes, csum_bytes); +} + +static void print_file_extent_item(const struct extent_buffer *eb, int i) +{ + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + pr_info("\t\tgeneration %llu type %hhu\n", + btrfs_file_extent_generation(eb, fi), + btrfs_file_extent_type(eb, fi)); + + if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE) { + pr_info("\t\tinline extent data size %u ram_bytes %llu compression %hhu\n", + btrfs_file_extent_inline_item_len(eb, i), + btrfs_file_extent_ram_bytes(eb, fi), + btrfs_file_extent_compression(eb, fi)); + return; + } + + pr_info("\t\textent data disk bytenr %llu nr %llu\n", + btrfs_file_extent_disk_bytenr(eb, fi), + btrfs_file_extent_disk_num_bytes(eb, fi)); + pr_info("\t\textent data offset %llu nr %llu ram %llu\n", + btrfs_file_extent_offset(eb, fi), + btrfs_file_extent_num_bytes(eb, fi), + btrfs_file_extent_ram_bytes(eb, fi)); + pr_info("\t\textent compression %hhu\n", + btrfs_file_extent_compression(eb, fi)); +} + +static void key_type_string(const struct btrfs_key *key, char *buf, int buf_size) +{ + static const char *key_to_str[256] = { + [BTRFS_INODE_ITEM_KEY] = "INODE_ITEM", + [BTRFS_INODE_REF_KEY] = "INODE_REF", + [BTRFS_INODE_EXTREF_KEY] = "INODE_EXTREF", + [BTRFS_DIR_ITEM_KEY] = "DIR_ITEM", + [BTRFS_DIR_INDEX_KEY] = "DIR_INDEX", + [BTRFS_DIR_LOG_ITEM_KEY] = "DIR_LOG_ITEM", + [BTRFS_DIR_LOG_INDEX_KEY] = "DIR_LOG_INDEX", + [BTRFS_XATTR_ITEM_KEY] = "XATTR_ITEM", + [BTRFS_VERITY_DESC_ITEM_KEY] = "VERITY_DESC_ITEM", + [BTRFS_VERITY_MERKLE_ITEM_KEY] = "VERITY_MERKLE_ITEM", + [BTRFS_ORPHAN_ITEM_KEY] = "ORPHAN_ITEM", + [BTRFS_ROOT_ITEM_KEY] = "ROOT_ITEM", + [BTRFS_ROOT_REF_KEY] = "ROOT_REF", + [BTRFS_ROOT_BACKREF_KEY] = "ROOT_BACKREF", + [BTRFS_EXTENT_ITEM_KEY] = "EXTENT_ITEM", + [BTRFS_METADATA_ITEM_KEY] = "METADATA_ITEM", + [BTRFS_TREE_BLOCK_REF_KEY] = "TREE_BLOCK_REF", + [BTRFS_SHARED_BLOCK_REF_KEY] = "SHARED_BLOCK_REF", + [BTRFS_EXTENT_DATA_REF_KEY] = "EXTENT_DATA_REF", + [BTRFS_SHARED_DATA_REF_KEY] = "SHARED_DATA_REF", + [BTRFS_EXTENT_OWNER_REF_KEY] = "EXTENT_OWNER_REF", + [BTRFS_EXTENT_CSUM_KEY] = "EXTENT_CSUM", + [BTRFS_EXTENT_DATA_KEY] = "EXTENT_DATA", + [BTRFS_BLOCK_GROUP_ITEM_KEY] = "BLOCK_GROUP_ITEM", + [BTRFS_FREE_SPACE_INFO_KEY] = "FREE_SPACE_INFO", + [BTRFS_FREE_SPACE_EXTENT_KEY] = "FREE_SPACE_EXTENT", + [BTRFS_FREE_SPACE_BITMAP_KEY] = "FREE_SPACE_BITMAP", + [BTRFS_CHUNK_ITEM_KEY] = "CHUNK_ITEM", + [BTRFS_DEV_ITEM_KEY] = "DEV_ITEM", + [BTRFS_DEV_EXTENT_KEY] = "DEV_EXTENT", + [BTRFS_TEMPORARY_ITEM_KEY] = "TEMPORARY_ITEM", + [BTRFS_DEV_REPLACE_KEY] = "DEV_REPLACE", + [BTRFS_STRING_ITEM_KEY] = "STRING_ITEM", + [BTRFS_QGROUP_STATUS_KEY] = "QGROUP_STATUS", + [BTRFS_QGROUP_RELATION_KEY] = "QGROUP_RELATION", + [BTRFS_QGROUP_INFO_KEY] = "QGROUP_INFO", + [BTRFS_QGROUP_LIMIT_KEY] = "QGROUP_LIMIT", + [BTRFS_PERSISTENT_ITEM_KEY] = "PERSISTENT_ITEM", + [BTRFS_UUID_KEY_SUBVOL] = "UUID_KEY_SUBVOL", + [BTRFS_UUID_KEY_RECEIVED_SUBVOL] = "UUID_KEY_RECEIVED_SUBVOL", + [BTRFS_RAID_STRIPE_KEY] = "RAID_STRIPE", + }; + + if 
(key->type == 0 && key->objectid == BTRFS_FREE_SPACE_OBJECTID) + scnprintf(buf, buf_size, "UNTYPED"); + else if (key_to_str[key->type]) + scnprintf(buf, buf_size, key_to_str[key->type]); + else + scnprintf(buf, buf_size, "UNKNOWN.%d", key->type); +} + void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; int i; u32 type, nr; struct btrfs_root_item *ri; - struct btrfs_dir_item *di; - struct btrfs_inode_item *ii; struct btrfs_block_group_item *bi; - struct btrfs_file_extent_item *fi; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; struct btrfs_dev_extent *dev_extent; struct btrfs_key key; - struct btrfs_key found_key; if (!l) return; @@ -255,25 +450,35 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_leaf_free_space(l), btrfs_header_owner(l)); print_eb_refs_lock(l); for (i = 0 ; i < nr ; i++) { + char key_buf[KEY_TYPE_BUF_SIZE]; + btrfs_item_key_to_cpu(l, &key, i); type = key.type; - pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n", - i, key.objectid, type, key.offset, + key_type_string(&key, key_buf, KEY_TYPE_BUF_SIZE); + + pr_info("\titem %d key (%llu %s %llu) itemoff %d itemsize %d\n", + i, key.objectid, key_buf, key.offset, btrfs_item_offset(l, i), btrfs_item_size(l, i)); switch (type) { case BTRFS_INODE_ITEM_KEY: - ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); - pr_info("\t\tinode generation %llu size %llu mode %o\n", - btrfs_inode_generation(l, ii), - btrfs_inode_size(l, ii), - btrfs_inode_mode(l, ii)); + print_inode_item(l, i); + break; + case BTRFS_INODE_REF_KEY: + print_inode_ref_item(l, i); + break; + case BTRFS_INODE_EXTREF_KEY: + print_inode_extref_item(l, i); break; case BTRFS_DIR_ITEM_KEY: - di = btrfs_item_ptr(l, i, struct btrfs_dir_item); - btrfs_dir_item_key_to_cpu(l, di, &found_key); - pr_info("\t\tdir oid %llu flags %u\n", - found_key.objectid, - btrfs_dir_flags(l, di)); + case BTRFS_DIR_INDEX_KEY: + case BTRFS_XATTR_ITEM_KEY: + print_dir_item(l, i); + break; + case BTRFS_DIR_LOG_INDEX_KEY: + print_dir_log_index_item(l, i); + break; + case BTRFS_EXTENT_CSUM_KEY: + print_extent_csum(l, i); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); @@ -303,24 +508,7 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_shared_data_ref_count(l, sref)); break; case BTRFS_EXTENT_DATA_KEY: - fi = btrfs_item_ptr(l, i, - struct btrfs_file_extent_item); - pr_info("\t\tgeneration %llu type %hhu\n", - btrfs_file_extent_generation(l, fi), - btrfs_file_extent_type(l, fi)); - if (btrfs_file_extent_type(l, fi) == - BTRFS_FILE_EXTENT_INLINE) { - pr_info("\t\tinline extent data size %llu\n", - btrfs_file_extent_ram_bytes(l, fi)); - break; - } - pr_info("\t\textent data disk bytenr %llu nr %llu\n", - btrfs_file_extent_disk_bytenr(l, fi), - btrfs_file_extent_disk_num_bytes(l, fi)); - pr_info("\t\textent data offset %llu nr %llu ram %llu\n", - btrfs_file_extent_offset(l, fi), - btrfs_file_extent_num_bytes(l, fi), - btrfs_file_extent_ram_bytes(l, fi)); + print_file_extent_item(l, i); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ccaa9a3cf1ce..1175b8192cd7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1069,7 +1069,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, } path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out_free_root; @@ -1081,7 +1081,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, 
ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*ptr)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1111,7 +1111,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); if (ret > 0) goto out_add_root; - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1129,7 +1129,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, /* We should not have a stray @prealloc pointer. */ ASSERT(prealloc == NULL); prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); - if (!prealloc) { + if (unlikely(!prealloc)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out_free_path; @@ -1137,7 +1137,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, ret = add_qgroup_item(trans, quota_root, found_key.offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1145,13 +1145,13 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } ret = btrfs_search_slot_for_read(tree_root, &found_key, path, 1, 0); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1165,7 +1165,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, } } ret = btrfs_next_item(tree_root, path); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1176,7 +1176,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, out_add_root: btrfs_release_path(path); ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1190,7 +1190,7 @@ out_add_root: qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID); prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1376,13 +1376,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_free_qgroup_config(fs_info); ret = btrfs_clean_quota_tree(trans, quota_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_del_root(trans, "a_root->root_key); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1455,6 +1455,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *qgroup; LIST_HEAD(qgroup_list); u64 num_bytes = src->excl; + u64 num_bytes_cmpr = src->excl_cmpr; int ret = 0; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -1466,11 +1467,12 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup_list *glist; qgroup->rfer += sign * num_bytes; - qgroup->rfer_cmpr += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes_cmpr; WARN_ON(sign < 0 && qgroup->excl < num_bytes); + WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr); qgroup->excl += sign * num_bytes; - qgroup->excl_cmpr += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes_cmpr; if (sign > 0) qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); @@ -2424,9 +2426,9 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* 
trans, int i; /* Level sanity check */ - if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || - root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || - root_level < cur_level) { + if (unlikely(cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || + root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || + root_level < cur_level)) { btrfs_err_rl(fs_info, "%s: bad levels, cur_level=%d root_level=%d", __func__, cur_level, root_level); @@ -2442,7 +2444,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, * dst_path->nodes[root_level] must be initialized before * calling this function. */ - if (cur_level == root_level) { + if (unlikely(cur_level == root_level)) { btrfs_err_rl(fs_info, "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", __func__, root_level, root_level, cur_level); @@ -2528,7 +2530,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, return 0; /* Wrong parameter order */ - if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { + if (unlikely(btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb))) { btrfs_err_rl(fs_info, "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, btrfs_header_generation(src_eb), @@ -2536,7 +2538,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, return -EUCLEAN; } - if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { + if (unlikely(!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb))) { ret = -EIO; goto out; } @@ -2727,7 +2729,7 @@ static void qgroup_iterator_nested_clean(struct list_head *head) */ static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info, struct ulist *roots, struct list_head *qgroups, - u64 seq, int update_old) + u64 seq, bool update_old) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -4708,8 +4710,8 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, if (!btrfs_qgroup_full_accounting(fs_info)) return 0; - if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > - btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { + if (unlikely(btrfs_node_ptr_generation(subvol_parent, subvol_slot) > + btrfs_node_ptr_generation(reloc_parent, reloc_slot))) { btrfs_err_rl(fs_info, "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", __func__, @@ -4841,7 +4843,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, reloc_eb = NULL; goto free_out; } - if (!extent_buffer_uptodate(reloc_eb)) { + if (unlikely(!extent_buffer_uptodate(reloc_eb))) { ret = -EIO; goto free_out; } diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index cab0b291088c..cc6f6095cc9f 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -67,7 +67,7 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *stripe_root = fs_info->stripe_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *leaf; u64 found_start; @@ -260,7 +260,6 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le btrfs_release_path(path); } - btrfs_free_path(path); return ret; } @@ -269,7 +268,7 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, struct btrfs_stripe_extent *stripe_extent, const size_t item_size) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; int ret; int slot; @@ -288,7 +287,6 
@@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), item_size); - btrfs_free_path(path); return ret; } @@ -306,7 +304,7 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, int ret; stripe_extent = kzalloc(item_size, GFP_NOFS); - if (!stripe_extent) { + if (unlikely(!stripe_extent)) { btrfs_abort_transaction(trans, -ENOMEM); btrfs_end_transaction(trans); return -ENOMEM; @@ -376,7 +374,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, struct btrfs_stripe_extent *stripe_extent; struct btrfs_key stripe_key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; const u64 end = logical + *length; int num_stripes; @@ -402,7 +400,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0); if (ret < 0) - goto free_path; + return ret; if (ret) { if (path->slots[0] != 0) path->slots[0]--; @@ -459,8 +457,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, trace_btrfs_get_raid_extent_offset(fs_info, logical, *length, stripe->physical, devid); - ret = 0; - goto free_path; + return 0; } /* If we're here, we haven't found the requested devid in the stripe. */ @@ -474,8 +471,6 @@ out: logical, logical + *length, stripe->dev->devid, btrfs_bg_type_to_raid_name(map_type)); } -free_path: - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 3ff2bedfb3a4..0135dceb7baa 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1167,7 +1167,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, /* Check if we have reached tolerance early. */ found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) + if (unlikely(found_errors > rbio->bioc->max_errors)) return -EIO; return 0; } @@ -1208,17 +1208,16 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) const u32 sectorsize = rbio->bioc->fs_info->sectorsize; const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits; struct bvec_iter iter = bio->bi_iter; + phys_addr_t paddr; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; - while (iter.bi_size) { + btrfs_bio_for_each_block(paddr, bio, &iter, sectorsize) { unsigned int index = (offset >> sectorsize_bits); struct sector_ptr *sector = &rbio->bio_sectors[index]; - struct bio_vec bv = bio_iter_iovec(bio, iter); sector->has_paddr = true; - sector->paddr = bvec_phys(&bv); - bio_advance_iter_single(bio, &iter, sectorsize); + sector->paddr = paddr; offset += sectorsize; } } @@ -1511,22 +1510,17 @@ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, */ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + const u32 blocksize = rbio->bioc->fs_info->sectorsize; + phys_addr_t paddr; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - struct sector_ptr *sector; - phys_addr_t paddr = bvec_phys(bvec); + btrfs_bio_for_each_block_all(paddr, bio, blocksize) { + struct sector_ptr *sector = find_stripe_sector(rbio, paddr); - for (u32 off = 0; off < bvec->bv_len; off += sectorsize) { - sector = find_stripe_sector(rbio, paddr + off); - ASSERT(sector); - if (sector) -
sector->uptodate = 1; - } + ASSERT(sector); + if (sector) + sector->uptodate = 1; } } @@ -1573,8 +1567,7 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; int total_sector_nr = get_bio_sector_nr(rbio, bio); - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + phys_addr_t paddr; /* No data csum for the whole stripe, no need to verify. */ if (!rbio->csum_bitmap || !rbio->csum_buf) @@ -1584,27 +1577,20 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) return; - bio_for_each_segment_all(bvec, bio, iter_all) { - void *kaddr; - - kaddr = bvec_kmap_local(bvec); - for (u32 off = 0; off < bvec->bv_len; - off += fs_info->sectorsize, total_sector_nr++) { - u8 csum_buf[BTRFS_CSUM_SIZE]; - u8 *expected_csum = rbio->csum_buf + - total_sector_nr * fs_info->csum_size; - int ret; + btrfs_bio_for_each_block_all(paddr, bio, fs_info->sectorsize) { + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; + int ret; - /* No csum for this sector, skip to the next sector. */ - if (!test_bit(total_sector_nr, rbio->csum_bitmap)) - continue; + /* No csum for this sector, skip to the next sector. */ + if (!test_bit(total_sector_nr, rbio->csum_bitmap)) + continue; - ret = btrfs_check_sector_csum(fs_info, kaddr + off, - csum_buf, expected_csum); - if (ret < 0) - set_bit(total_sector_nr, rbio->error_bitmap); - } - kunmap_local(kaddr); + ret = btrfs_check_block_csum(fs_info, paddr, + csum_buf, expected_csum); + if (ret < 0) + set_bit(total_sector_nr, rbio->error_bitmap); + total_sector_nr++; } } @@ -1802,7 +1788,6 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, struct sector_ptr *sector; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; - void *kaddr; int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) @@ -1824,9 +1809,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; - kaddr = kmap_local_sector(sector); - ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected); - kunmap_local(kaddr); + ret = btrfs_check_block_csum(fs_info, sector->paddr, csum_buf, csum_expected); return ret; } @@ -1864,7 +1847,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, if (!found_errors) return 0; - if (found_errors > rbio->bioc->max_errors) + if (unlikely(found_errors > rbio->bioc->max_errors)) return -EIO; /* @@ -2416,7 +2399,7 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio) int found_errors; found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) { + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; } @@ -2705,7 +2688,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); - if (found_errors > rbio->bioc->max_errors) { + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; goto out; } @@ -2729,7 +2712,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) * data, so the capability of the repair is declined. (In the * case of RAID5, we can not repair anything.) 
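The dfail/failp checks that follow encode the scrub repair rule: data-stripe failures must stay below max_errors so the remaining stripes can rebuild the rest, and if a parity stripe failed it must be the one being rewritten anyway. A condensed restatement of that decision, with names assumed for illustration:

/* max_errors is 1 for RAID5 and 2 for RAID6. */
static int scrub_can_repair(int dfail, int have_failp, int failp,
			    int scrubp, int max_errors)
{
	/* Too many broken data stripes: nothing left to rebuild from. */
	if (dfail > max_errors - 1)
		return 0;
	/* A parity stripe other than the one being scrubbed is gone. */
	if (have_failp && failp != scrubp)
		return 0;
	return 1;
}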
- if (dfail > rbio->bioc->max_errors - 1) { + if (unlikely(dfail > rbio->bioc->max_errors - 1)) { ret = -EIO; goto out; } @@ -2746,7 +2729,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) * scrubbing parity, luckily, use the other one to repair the * data, or we can not repair the data stripe. */ - if (failp != rbio->scrubp) { + if (unlikely(failp != rbio->scrubp)) { ret = -EIO; goto out; } @@ -2837,7 +2820,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio) int found_errors; found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) { + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; } @@ -2861,19 +2844,22 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) * This is for scrub call sites where we already have correct data contents. * This allows us to avoid reading data stripes again. * - * Unfortunately here we have to do page copy, other than reusing the pages. + * Unfortunately here we have to do a folio copy rather than reusing the pages. * This is due to the fact rbio has its own page management for its cache. */ -void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, - struct page **data_pages, u64 data_logical) +void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, + struct folio **data_folios, u64 data_logical) { + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; const u64 offset_in_full_stripe = data_logical - rbio->bioc->full_stripe_logical; - const int page_index = offset_in_full_stripe >> PAGE_SHIFT; - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - const u32 sectors_per_page = PAGE_SIZE / sectorsize; + unsigned int findex = 0; + unsigned int foffset = 0; int ret; + /* We shouldn't hit RAID56 for bs > ps cases for now.
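[Editor's note: a large share of the churn in this series is wrapping error-path tests in unlikely(). This is the standard branch-prediction hint from include/linux/compiler.h; it only influences generated code layout, never semantics. Modulo the branch-profiling instrumentation variants it reduces to:

	#define likely(x)	__builtin_expect(!!(x), 1)
	#define unlikely(x)	__builtin_expect(!!(x), 0)

It pays off only on genuinely cold paths, which is why the annotations here target -EIO and transaction-abort branches.]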
*/ + ASSERT(fs_info->sectorsize <= PAGE_SIZE); + /* * If we hit ENOMEM temporarily, but later at * raid56_parity_submit_scrub_rbio() time it succeeded, we just do @@ -2890,14 +2876,25 @@ void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); - for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) { - struct page *dst = rbio->stripe_pages[page_nr + page_index]; - struct page *src = data_pages[page_nr]; + for (unsigned int cur_off = offset_in_full_stripe; + cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN; + cur_off += PAGE_SIZE) { + const unsigned int pindex = cur_off >> PAGE_SHIFT; + void *kaddr; + + kaddr = kmap_local_page(rbio->stripe_pages[pindex]); + memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE); + kunmap_local(kaddr); - memcpy_page(dst, 0, src, 0, PAGE_SIZE); - for (int sector_nr = sectors_per_page * page_index; - sector_nr < sectors_per_page * (page_index + 1); - sector_nr++) - rbio->stripe_sectors[sector_nr].uptodate = true; + foffset += PAGE_SIZE; + ASSERT(foffset <= folio_size(data_folios[findex])); + if (foffset == folio_size(data_folios[findex])) { + findex++; + foffset = 0; + } } + for (unsigned int sector_nr = offset_in_full_stripe >> fs_info->sectorsize_bits; + sector_nr < (offset_in_full_stripe + BTRFS_STRIPE_LEN) >> fs_info->sectorsize_bits; + sector_nr++) + rbio->stripe_sectors[sector_nr].uptodate = true; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 0d7b4c2fb6ae..84c4d1d29c7a 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -201,8 +201,8 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); -void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, - struct page **data_pages, u64 data_logical); +void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, + struct folio **data_folios, u64 data_logical); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 3871c3a6c743..de4cb0f3fbd0 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -971,7 +971,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *extent_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; int tree_block_level = 0; u64 bytenr = 0, num_bytes = 0; @@ -980,11 +980,18 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) if (!btrfs_test_opt(fs_info, REF_VERIFY)) return 0; + extent_root = btrfs_extent_root(fs_info, 0); + /* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS). 
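[Editor's note: BTRFS_PATH_AUTO_FREE(), adopted by btrfs_build_ref_tree() above and by most of the conversions below, is the scope-based cleanup helper from fs/btrfs/ctree.h; it is what lets the repeated "goto out; ... btrfs_free_path(path);" tails collapse into direct returns. A sketch of the definition, assuming the usual linux/cleanup.h pattern rather than quoting it verbatim:

	DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))

	#define BTRFS_PATH_AUTO_FREE(path_name)	\
		struct btrfs_path *path_name __free(btrfs_free_path) = NULL

The path is freed automatically when the variable goes out of scope, including on every early return; since btrfs_free_path() accepts NULL, returning before btrfs_alloc_path() succeeds is safe as well.]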
*/ + if (IS_ERR(extent_root)) { + btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling"); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; - extent_root = btrfs_extent_root(fs_info, 0); eb = btrfs_read_lock_root_node(extent_root); level = btrfs_header_level(eb); path->nodes[level] = eb; @@ -1014,6 +1021,5 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 559bd25a2b7a..1ce544d53cc5 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -12,7 +12,7 @@ struct btrfs_fs_info; struct btrfs_ref; -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG #include <linux/spinlock.h> @@ -53,6 +53,6 @@ static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) { } -#endif /* CONFIG_BTRFS_FS_REF_VERIFY */ +#endif /* CONFIG_BTRFS_DEBUG */ #endif diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index ce25ab7f0e99..5465a5eae9b2 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -23,7 +23,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, u64 endoff, const u64 destoff, const u64 olen, - int no_time_update) + bool no_time_update) { int ret; @@ -43,7 +43,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, } ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -268,12 +268,12 @@ copy_inline_extent: drop_args.end = aligned_end; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_insert_empty_item(trans, root, path, new_key, size); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -285,7 +285,7 @@ copy_inline_extent: btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found); btrfs_set_inode_full_sync(inode); ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); out: if (!ret && !trans) { @@ -337,10 +337,10 @@ copy_to_page: */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, - const u64 destoff, int no_time_update) + const u64 destoff, bool no_time_update) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_trans_handle *trans; char *buf = NULL; @@ -611,7 +611,6 @@ process_slot: } out: - btrfs_free_path(path); kvfree(buf); clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7256f6748c8f..8dd8de6b9fb8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -821,7 +821,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, u64 bytenr, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(reloc_inode)->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; int ret; @@ -834,11 +834,9 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, ret = btrfs_lookup_file_extent(NULL, root, path, 
btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + return ret; + if (ret > 0) + return -ENOENT; leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], @@ -849,16 +847,11 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)); - if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { - ret = -EINVAL; - goto out; - } + if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) + return -EINVAL; *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -974,7 +967,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_init_data_ref(&ref, key.objectid, key.offset, btrfs_root_id(root), false); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -988,7 +981,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_init_data_ref(&ref, key.objectid, key.offset, btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1199,7 +1192,7 @@ again: ref.ref_root = btrfs_root_id(src); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1212,7 +1205,7 @@ again: ref.ref_root = btrfs_root_id(dest); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1226,7 +1219,7 @@ again: ref.ref_root = btrfs_root_id(src); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1240,7 +1233,7 @@ again: ref.ref_root = btrfs_root_id(dest); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1490,7 +1483,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) * ->reloc_root. If it fails however we must * drop the ref ourselves. 
*/ - ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); + ret2 = btrfs_drop_snapshot(reloc_root, false, true); if (ret2 < 0) { btrfs_put_root(reloc_root); if (!ret) @@ -1500,7 +1493,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) btrfs_put_root(root); } else { /* Orphan reloc tree, just clean it up */ - ret2 = btrfs_drop_snapshot(root, 0, 1); + ret2 = btrfs_drop_snapshot(root, false, true); if (ret2 < 0) { btrfs_put_root(root); if (!ret) @@ -1791,7 +1784,7 @@ again: list_add(&reloc_root->root_list, &reloc_roots); btrfs_put_root(root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); if (!err) err = ret; @@ -1960,7 +1953,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root)); return PTR_ERR(root); } - if (root->reloc_root != reloc_root) { + if (unlikely(root->reloc_root != reloc_root)) { DEBUG_WARN("unexpected reloc root found"); btrfs_err(fs_info, "root %llu has two reloc roots associated with it", @@ -2031,7 +2024,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (!root) return ERR_PTR(-ENOENT); - if (next->new_bytenr) { + if (unlikely(next->new_bytenr)) { /* * We just created the reloc root, so we shouldn't have * ->new_bytenr set yet. If it is then we have multiple roots @@ -2090,7 +2083,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) * This can occur if we have incomplete extent refs leading all * the way up a particular path, in this case return -EUCLEAN. */ - if (!root) + if (unlikely(!root)) return ERR_PTR(-EUCLEAN); /* No other choice for non-shareable tree */ @@ -2277,7 +2270,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(upper->eb, slot); if (lowest) { - if (bytenr != node->bytenr) { + if (unlikely(bytenr != node->bytenr)) { btrfs_err(root->fs_info, "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", bytenr, node->bytenr, slot, @@ -2332,7 +2325,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!ret) ret = btrfs_drop_subtree(trans, root, eb, upper->eb); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); } next: @@ -2454,7 +2447,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, eb = read_tree_block(fs_info, block->bytenr, &check); if (IS_ERR(eb)) return PTR_ERR(eb); - if (!extent_buffer_uptodate(eb)) { + if (unlikely(!extent_buffer_uptodate(eb))) { free_extent_buffer(eb); return -EIO; } @@ -2519,7 +2512,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, * normal user in the case of corruption. 
*/ ASSERT(node->new_bytenr == 0); - if (node->new_bytenr) { + if (unlikely(node->new_bytenr)) { btrfs_err(root->fs_info, "bytenr %llu has improper references to it", node->bytenr); @@ -2839,7 +2832,7 @@ again: if (!folio_test_uptodate(folio)) { btrfs_read_folio(NULL, folio); folio_lock(folio); - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { ret = -EIO; goto release_folio; } @@ -3158,7 +3151,7 @@ static int __add_tree_block(struct reloc_control *rc, struct rb_root *blocks) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret; bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); @@ -3186,7 +3179,7 @@ again: path->skip_locking = 1; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0 && skinny) { if (path->slots[0]) { @@ -3213,14 +3206,10 @@ again: "tree block extent item (%llu) is not found in extent tree", bytenr); WARN_ON(1); - ret = -EINVAL; - goto out; + return -EINVAL; } - ret = add_tree_block(rc, &key, path, blocks); -out: - btrfs_free_path(path); - return ret; + return add_tree_block(rc, &key, path, blocks); } static int delete_block_group_cache(struct btrfs_block_group *block_group, @@ -3510,7 +3499,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) struct rb_root blocks = RB_ROOT; struct btrfs_key key; struct btrfs_trans_handle *trans = NULL; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; u64 flags; int ret; @@ -3679,14 +3668,13 @@ out_free: if (ret < 0 && !err) err = ret; btrfs_free_block_rsv(fs_info, rc->block_rsv); - btrfs_free_path(path); return err; } static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_item *item; struct extent_buffer *leaf; int ret; @@ -3697,7 +3685,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_inode(trans, root, path, objectid); if (ret) - goto out; + return ret; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -3707,15 +3695,13 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); -out: - btrfs_free_path(path); - return ret; + return 0; } static void delete_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -3738,7 +3724,6 @@ static void delete_orphan_inode(struct btrfs_trans_handle *trans, out: if (ret) btrfs_abort_transaction(trans, ret); - btrfs_free_path(path); } /* diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index e22e6b06927a..d07eab70f759 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -85,7 +85,7 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, * Key with offset -1 found, there would have to exist a root * with such id, but this is out of the valid range. 
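[Editor's note: the btrfs_find_root() change that continues below leans on the btrfs_search_slot() return convention: 0 means the exact key exists, greater than 0 means it does not and the path points at the slot where it would be inserted, negative is an error. Because the lookup key here carries offset -1, which no valid root item can have, an exact match can only indicate a corrupted tree, hence translating ret == 0 into -EUCLEAN. The same reasoning recurs in find_first_extent_item() and btrfs_del_root() further down. As a usage sketch:

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;		/* I/O or allocation error */
	if (unlikely(ret == 0))
		return -EUCLEAN;	/* impossible exact match: corruption */
	/* ret > 0: path->slots[0] is where the key would have been */
]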
*/ - if (ret == 0) { + if (unlikely(ret == 0)) { ret = -EUCLEAN; goto out; } @@ -130,7 +130,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *item) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *l; int ret; int slot; @@ -143,15 +143,15 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, 0, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { + if (unlikely(ret > 0)) { btrfs_crit(fs_info, "unable to find root key (%llu %u %llu) in tree %llu", key->objectid, key->type, key->offset, btrfs_root_id(root)); ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } l = path->nodes[0]; @@ -168,22 +168,22 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_release_path(path); ret = btrfs_search_slot(trans, root, key, path, -1, 1); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ret = btrfs_del_item(trans, root, path); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } l = path->nodes[0]; slot = path->slots[0]; @@ -197,8 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); write_extent_buffer(l, item, ptr, sizeof(*item)); -out: - btrfs_free_path(path); return ret; } @@ -216,7 +214,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root; int err = 0; @@ -309,7 +307,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) btrfs_put_root(root); } - btrfs_free_path(path); return err; } @@ -318,7 +315,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key) { struct btrfs_root *root = trans->fs_info->tree_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; path = btrfs_alloc_path(); @@ -326,17 +323,12 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) - goto out; - if (ret != 0) { + return ret; + if (unlikely(ret > 0)) /* The root must exist but we did not find it by the key. 
*/ - ret = -EUCLEAN; - goto out; - } + return -EUCLEAN; - ret = btrfs_del_item(trans, root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, root, path); } int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, @@ -344,7 +336,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root_ref *ref; struct extent_buffer *leaf; struct btrfs_key key; @@ -361,7 +353,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); if (ret < 0) { - goto out; + return ret; } else if (ret == 0) { leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], @@ -369,18 +361,16 @@ again: ptr = (unsigned long)(ref + 1); if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || (btrfs_root_ref_name_len(leaf, ref) != name->len) || - memcmp_extent_buffer(leaf, name->name, ptr, name->len)) { - ret = -ENOENT; - goto out; - } + memcmp_extent_buffer(leaf, name->name, ptr, name->len)) + return -ENOENT; + *sequence = btrfs_root_ref_sequence(leaf, ref); ret = btrfs_del_item(trans, tree_root, path); if (ret) - goto out; + return ret; } else { - ret = -ENOENT; - goto out; + return -ENOENT; } if (key.type == BTRFS_ROOT_BACKREF_KEY) { @@ -391,8 +381,6 @@ again: goto again; } -out: - btrfs_free_path(path); return ret; } @@ -418,7 +406,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_key key; int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root_ref *ref; struct extent_buffer *leaf; unsigned long ptr; @@ -433,9 +421,8 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name->len); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - btrfs_free_path(path); return ret; } @@ -455,7 +442,6 @@ again: goto again; } - btrfs_free_path(path); return 0; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6776e6ab8d10..4691d0bdb2e8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -113,7 +113,7 @@ enum { /* Which blocks are covered by extent items. */ scrub_bitmap_nr_has_extent = 0, - /* Which blocks are meteadata. */ + /* Which blocks are metadata. */ scrub_bitmap_nr_is_metadata, /* @@ -130,7 +130,7 @@ enum { scrub_bitmap_nr_last, }; -#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) +#define SCRUB_STRIPE_MAX_FOLIOS (BTRFS_STRIPE_LEN / PAGE_SIZE) /* * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. 
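[Editor's worked example for the SCRUB_STRIPE_PAGES to SCRUB_STRIPE_MAX_FOLIOS rename above, assuming a 4K PAGE_SIZE (BTRFS_STRIPE_LEN is 64K, and min_folio_shift is defined in the hunks below as PAGE_SHIFT + fs_info->block_min_order):

	block_min_order == 0: min_folio_shift = 12, 64K >> 12 = 16 order-0 folios
	block_min_order == 2: min_folio_shift = 14, 64K >> 14 =  4 order-2 folios

Either way the count never exceeds SCRUB_STRIPE_MAX_FOLIOS (64K / 4K = 16), which the new ASSERT() in init_scrub_stripe() re-checks at runtime.]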
@@ -139,7 +139,7 @@ struct scrub_stripe { struct scrub_ctx *sctx; struct btrfs_block_group *bg; - struct page *pages[SCRUB_STRIPE_PAGES]; + struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS]; struct scrub_sector_verification *sectors; struct btrfs_device *dev; @@ -206,7 +206,7 @@ struct scrub_ctx { ktime_t throttle_deadline; u64 throttle_sent; - int is_dev_replace; + bool is_dev_replace; u64 write_pointer; struct mutex wr_lock; @@ -339,10 +339,10 @@ static void release_scrub_stripe(struct scrub_stripe *stripe) if (!stripe) return; - for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) { - if (stripe->pages[i]) - __free_page(stripe->pages[i]); - stripe->pages[i] = NULL; + for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) { + if (stripe->folios[i]) + folio_put(stripe->folios[i]); + stripe->folios[i] = NULL; } kfree(stripe->sectors); kfree(stripe->csums); @@ -355,6 +355,7 @@ static void release_scrub_stripe(struct scrub_stripe *stripe) static int init_scrub_stripe(struct btrfs_fs_info *fs_info, struct scrub_stripe *stripe) { + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; int ret; memset(stripe, 0, sizeof(*stripe)); @@ -367,7 +368,9 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info, atomic_set(&stripe->pending_io, 0); spin_lock_init(&stripe->write_error_lock); - ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false); + ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS); + ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift, + fs_info->block_min_order, stripe->folios); if (ret < 0) goto error; @@ -446,7 +449,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx) } static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( - struct btrfs_fs_info *fs_info, int is_dev_replace) + struct btrfs_fs_info *fs_info, bool is_dev_replace) { struct scrub_ctx *sctx; int i; @@ -585,7 +588,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * bool is_super, u64 logical, u64 physical) { struct btrfs_fs_info *fs_info = dev->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key found_key; struct extent_buffer *eb; struct btrfs_extent_item *ei; @@ -612,7 +615,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, &flags); if (ret < 0) - goto out; + return; swarn.extent_item_size = found_key.offset; @@ -658,9 +661,6 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); } - -out: - btrfs_free_path(path); } static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) @@ -687,13 +687,30 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) { - u32 offset = (sector_nr << stripe->bg->fs_info->sectorsize_bits); - const struct page *page = stripe->pages[offset >> PAGE_SHIFT]; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + u32 offset = (sector_nr << fs_info->sectorsize_bits); + const struct folio *folio = stripe->folios[offset >> min_folio_shift]; - /* stripe->pages[] is allocated by us and no highmem is allowed. */ - ASSERT(page); - ASSERT(!PageHighMem(page)); - return page_address(page) + offset_in_page(offset); + /* stripe->folios[] is allocated by us and no highmem is allowed. 
*/ + ASSERT(folio); + ASSERT(!folio_test_partial_kmap(folio)); + return folio_address(folio) + offset_in_folio(folio, offset); +} + +static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + u32 offset = (sector_nr << fs_info->sectorsize_bits); + const struct folio *folio = stripe->folios[offset >> min_folio_shift]; + + /* stripe->folios[] is allocated by us and no highmem is allowed. */ + ASSERT(folio); + ASSERT(!folio_test_partial_kmap(folio)); + /* And the range must be contained inside the folio. */ + ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio)); + return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset); } static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) @@ -788,7 +805,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; - void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); + phys_addr_t paddr = scrub_stripe_get_paddr(stripe, sector_nr); u8 csum_buf[BTRFS_CSUM_SIZE]; int ret; @@ -833,7 +850,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) return; } - ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, sector->csum); + ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum); if (ret < 0) { scrub_bitmap_set_bit_csum_error(stripe, sector_nr); scrub_bitmap_set_bit_error(stripe, sector_nr); @@ -1369,8 +1386,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d * Slice is divided into intervals when the IO is submitted, adjust by * bwlimit and maximum of 64 intervals. */ - div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); - div = min_t(u32, 64, div); + div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64); /* Start new epoch, set deadline */ now = ktime_get(); @@ -1513,7 +1529,7 @@ static int find_first_extent_item(struct btrfs_root *extent_root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. @@ -1859,6 +1875,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; int mirror = stripe->mirror_num; @@ -1871,7 +1888,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, return; } - bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, + bbio = btrfs_bio_alloc(BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, fs_info, scrub_read_endio, stripe); bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; @@ -1970,7 +1987,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) * metadata, we should immediately abort. 
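[Editor's note: btrfs_check_block_csum(), which replaces the kaddr-based btrfs_check_sector_csum() calls in these hunks, takes the block's physical address and does its own mapping, so callers no longer carry kmap_local()/kunmap_local() pairs. A plausible shape, under the simplifying assumption that a block never crosses a page boundary; the real helper must also cope with large folios:

	static int btrfs_check_block_csum(struct btrfs_fs_info *fs_info,
					  phys_addr_t paddr, u8 *csum_buf,
					  const u8 *csum_expected)
	{
		/* Map the page backing this physical address... */
		void *kaddr = kmap_local_page(pfn_to_page(PHYS_PFN(paddr)));
		int ret;

		/* ...then verify as before on the virtual address. */
		ret = btrfs_check_sector_csum(fs_info, kaddr + offset_in_page(paddr),
					      csum_buf, csum_expected);
		kunmap_local(kaddr);
		return ret;
	}
]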
*/ for (int i = 0; i < nr_stripes; i++) { - if (stripe_has_metadata_error(&sctx->stripes[i])) { + if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) { ret = -EIO; goto out; } @@ -2164,7 +2181,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, * As we may hit an empty data stripe while it's missing. */ bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); - if (!bitmap_empty(&error, stripe->nr_sectors)) { + if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) { btrfs_err(fs_info, "scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", full_stripe_start, i, stripe->nr_sectors, @@ -2202,7 +2219,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; - raid56_parity_cache_data_pages(rbio, stripe->pages, + raid56_parity_cache_data_folios(rbio, stripe->folios, full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); } raid56_parity_submit_scrub_rbio(rbio); @@ -2586,7 +2603,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root = fs_info->dev_root; u64 chunk_offset; @@ -2858,8 +2875,8 @@ skip_unfreeze: btrfs_put_block_group(cache); if (ret) break; - if (sctx->is_dev_replace && - atomic64_read(&dev_replace->num_write_errors) > 0) { + if (unlikely(sctx->is_dev_replace && + atomic64_read(&dev_replace->num_write_errors) > 0)) { ret = -EIO; break; } @@ -2872,8 +2889,6 @@ skip: btrfs_release_path(path); } - btrfs_free_path(path); - return ret; } @@ -2889,13 +2904,13 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, if (ret < 0) return ret; ret = btrfs_check_super_csum(fs_info, sb); - if (ret != 0) { + if (unlikely(ret != 0)) { btrfs_err_rl(fs_info, "scrub: super block at physical %llu devid %llu has bad csum", physical, dev->devid); return -EIO; } - if (btrfs_super_generation(sb) != generation) { + if (unlikely(btrfs_super_generation(sb) != generation)) { btrfs_err_rl(fs_info, "scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", physical, dev->devid, @@ -3013,7 +3028,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, - int readonly, int is_dev_replace) + bool readonly, bool is_dev_replace) { struct btrfs_dev_lookup_args args = { .devid = devid }; struct scrub_ctx *sctx; @@ -3065,8 +3080,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } mutex_lock(&fs_info->scrub_lock); - if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { + if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -EIO; diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h index f0df597b75c7..aa68b6ebaf55 100644 --- a/fs/btrfs/scrub.h +++ b/fs/btrfs/scrub.h @@ -11,7 +11,7 @@ struct btrfs_scrub_progress; int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, - int readonly, int is_dev_replace); + bool readonly, bool 
is_dev_replace); void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); int btrfs_scrub_cancel(struct btrfs_fs_info *info); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7664025a5af4..9230e5066fc6 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -646,7 +646,7 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) ret = kernel_write(filp, buf + pos, len - pos, off); if (ret < 0) return ret; - if (ret == 0) + if (unlikely(ret == 0)) return -EIO; pos += ret; } @@ -909,7 +909,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, struct btrfs_inode_info *info) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_item *ii; struct btrfs_key key; @@ -924,11 +924,11 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, if (ret) { if (ret > 0) ret = -ENOENT; - goto out; + return ret; } if (!info) - goto out; + return 0; ii = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); @@ -945,9 +945,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, */ info->fileattr = btrfs_inode_flags(path->nodes[0], ii); -out: - btrfs_free_path(path); - return ret; + return 0; } static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) @@ -973,13 +971,13 @@ typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx); * path must point to the INODE_REF or INODE_EXTREF when called. */ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *found_key, int resolve, + struct btrfs_key *found_key, bool resolve, iterate_inode_ref_t iterate, void *ctx) { struct extent_buffer *eb = path->nodes[0]; struct btrfs_inode_ref *iref; struct btrfs_inode_extref *extref; - struct btrfs_path *tmp_path; + BTRFS_PATH_AUTO_FREE(tmp_path); struct fs_path *p; u32 cur = 0; u32 total; @@ -1076,7 +1074,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } out: - btrfs_free_path(tmp_path); fs_path_free(p); return ret; } @@ -1224,7 +1221,7 @@ static int get_inode_path(struct btrfs_root *root, { int ret; struct btrfs_key key, found_key; - struct btrfs_path *p; + BTRFS_PATH_AUTO_FREE(p); p = alloc_path_for_send(); if (!p) @@ -1238,28 +1235,20 @@ static int get_inode_path(struct btrfs_root *root, ret = btrfs_search_slot_for_read(root, &key, p, 1, 0); if (ret < 0) - goto out; - if (ret) { - ret = 1; - goto out; - } + return ret; + if (ret) + return 1; + btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); if (found_key.objectid != ino || (found_key.type != BTRFS_INODE_REF_KEY && - found_key.type != BTRFS_INODE_EXTREF_KEY)) { - ret = -ENOENT; - goto out; - } + found_key.type != BTRFS_INODE_EXTREF_KEY)) + return -ENOENT; - ret = iterate_inode_ref(root, p, &found_key, 1, - __copy_first_ref, path); + ret = iterate_inode_ref(root, p, &found_key, true, __copy_first_ref, path); if (ret < 0) - goto out; - ret = 0; - -out: - btrfs_free_path(p); - return ret; + return ret; + return 0; } struct backref_ctx { @@ -1389,7 +1378,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct backref_ctx *bctx = ctx; struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; - const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + const u64 key = leaf_bytenr >> fs_info->nodesize_bits; struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; @@ -1444,7 +1433,7 @@ static void store_backref_cache(u64 leaf_bytenr, const 
struct ulist *root_ids, if (!new_entry) return; - new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.key = leaf_bytenr >> fs_info->nodesize_bits; new_entry->entry.gen = 0; new_entry->num_roots = 0; ULIST_ITER_INIT(&uiter); @@ -1716,7 +1705,7 @@ static int read_symlink(struct btrfs_root *root, struct fs_path *dest) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_file_extent_item *ei; u8 type; @@ -1733,21 +1722,20 @@ static int read_symlink(struct btrfs_root *root, key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; - if (ret) { + return ret; + if (unlikely(ret)) { /* * An empty symlink inode. Can happen in rare error paths when * creating a symlink (transaction committed before the inode * eviction handler removed the symlink inode items and a crash - * happened in between or the subvol was snapshoted in between). + * happened in between or the subvol was snapshotted in between). * Print an informative message to dmesg/syslog so that the user * can delete the symlink. */ btrfs_err(root->fs_info, "Found empty symlink inode %llu at root %llu", ino, btrfs_root_id(root)); - ret = -EIO; - goto out; + return -EIO; } ei = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -1758,7 +1746,7 @@ static int read_symlink(struct btrfs_root *root, btrfs_crit(root->fs_info, "send: found symlink extent that is not inline, ino %llu root %llu extent type %d", ino, btrfs_root_id(root), type); - goto out; + return ret; } compression = btrfs_file_extent_compression(path->nodes[0], ei); if (unlikely(compression != BTRFS_COMPRESS_NONE)) { @@ -1766,17 +1754,13 @@ static int read_symlink(struct btrfs_root *root, btrfs_crit(root->fs_info, "send: found symlink extent with compression, ino %llu root %llu compression type %d", ino, btrfs_root_id(root), compression); - goto out; + return ret; } off = btrfs_file_extent_inline_start(ei); len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); - ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); - -out: - btrfs_free_path(path); - return ret; + return fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); } /* @@ -1787,8 +1771,7 @@ static int gen_unique_name(struct send_ctx *sctx, u64 ino, u64 gen, struct fs_path *dest) { - int ret = 0; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; char tmp[64]; int len; @@ -1811,10 +1794,9 @@ static int gen_unique_name(struct send_ctx *sctx, path, BTRFS_FIRST_FREE_OBJECTID, &tmp_name, 0); btrfs_release_path(path); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } + if (IS_ERR(di)) + return PTR_ERR(di); + if (di) { /* not unique, try again */ idx++; @@ -1823,7 +1805,6 @@ static int gen_unique_name(struct send_ctx *sctx, if (!sctx->parent_root) { /* unique */ - ret = 0; break; } @@ -1831,10 +1812,9 @@ static int gen_unique_name(struct send_ctx *sctx, path, BTRFS_FIRST_FREE_OBJECTID, &tmp_name, 0); btrfs_release_path(path); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } + if (IS_ERR(di)) + return PTR_ERR(di); + if (di) { /* not unique, try again */ idx++; @@ -1844,11 +1824,7 @@ static int gen_unique_name(struct send_ctx *sctx, break; } - ret = fs_path_add(dest, tmp, len); - -out: - btrfs_free_path(path); - return ret; + return fs_path_add(dest, tmp, len); } enum inode_state { @@ -1960,7 +1936,7 @@ static int lookup_dir_item_inode(struct btrfs_root *root, int ret = 0; struct btrfs_dir_item *di; struct btrfs_key key; - struct btrfs_path 
*path; + BTRFS_PATH_AUTO_FREE(path); struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len); path = alloc_path_for_send(); @@ -1968,19 +1944,15 @@ static int lookup_dir_item_inode(struct btrfs_root *root, return -ENOMEM; di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0); - if (IS_ERR_OR_NULL(di)) { - ret = di ? PTR_ERR(di) : -ENOENT; - goto out; - } + if (IS_ERR_OR_NULL(di)) + return di ? PTR_ERR(di) : -ENOENT; + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); - if (key.type == BTRFS_ROOT_ITEM_KEY) { - ret = -ENOENT; - goto out; - } + if (key.type == BTRFS_ROOT_ITEM_KEY) + return -ENOENT; + *found_inode = key.objectid; -out: - btrfs_free_path(path); return ret; } @@ -1994,7 +1966,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, int ret; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int len; u64 parent_dir; @@ -2008,16 +1980,14 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); if (ret < 0) - goto out; + return ret; if (!ret) btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); if (ret || found_key.objectid != ino || (found_key.type != BTRFS_INODE_REF_KEY && - found_key.type != BTRFS_INODE_EXTREF_KEY)) { - ret = -ENOENT; - goto out; - } + found_key.type != BTRFS_INODE_EXTREF_KEY)) + return -ENOENT; if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; @@ -2038,19 +2008,17 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref); } if (ret < 0) - goto out; + return ret; btrfs_release_path(path); if (dir_gen) { ret = get_inode_gen(root, parent_dir, dir_gen); if (ret < 0) - goto out; + return ret; } *dir = parent_dir; -out: - btrfs_free_path(path); return ret; } @@ -2486,7 +2454,7 @@ static int send_subvol_begin(struct send_ctx *sctx) int ret; struct btrfs_root *send_root = sctx->send_root; struct btrfs_root *parent_root = sctx->parent_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root_ref *ref; struct extent_buffer *leaf; @@ -2498,10 +2466,8 @@ static int send_subvol_begin(struct send_ctx *sctx) return -ENOMEM; name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); - if (!name) { - btrfs_free_path(path); + if (!name) return -ENOMEM; - } key.objectid = btrfs_root_id(send_root); key.type = BTRFS_ROOT_BACKREF_KEY; @@ -2564,7 +2530,6 @@ static int send_subvol_begin(struct send_ctx *sctx) tlv_put_failure: out: - btrfs_free_path(path); kfree(name); return ret; } @@ -2715,7 +2680,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) int ret = 0; struct fs_path *p = NULL; struct btrfs_inode_item *ii; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; struct btrfs_key key; int slot; @@ -2759,7 +2724,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) tlv_put_failure: out: free_path_for_command(sctx, p); - btrfs_free_path(path); return ret; } @@ -2769,7 +2733,7 @@ out: * processing an inode that is a directory and it just got renamed, and existing * entries in the cache may refer to inodes that have the directory in their * full path - in which case we would generate outdated paths (pre-rename) - * for the inodes that the cache entries point to. Instead of prunning the + * for the inodes that the cache entries point to. 
Instead of pruning the * cache when inserting, do it after we finish processing each inode at * finish_inode_if_needed(). */ @@ -2930,7 +2894,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) { int ret = 0; int iter_ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; struct btrfs_key di_key; @@ -2970,7 +2934,6 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) if (iter_ret < 0) ret = iter_ret; - btrfs_free_path(path); return ret; } @@ -3750,7 +3713,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, struct recorded_ref *parent_ref, const bool is_orphan) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key di_key; struct btrfs_dir_item *di; @@ -3771,19 +3734,15 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - goto out; - } + if (ret < 0) + return ret; + if (ret > 0) + return 0; di = btrfs_match_dir_item_name(path, parent_ref->name, parent_ref->name_len); - if (!di) { - ret = 0; - goto out; - } + if (!di) + return 0; /* * di_key.objectid has the number of the inode that has a dentry in the * parent directory with the same name that sctx->cur_ino is being @@ -3793,26 +3752,22 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, * that it happens after that other inode is renamed. */ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); - if (di_key.type != BTRFS_INODE_ITEM_KEY) { - ret = 0; - goto out; - } + if (di_key.type != BTRFS_INODE_ITEM_KEY) + return 0; ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen); if (ret < 0) - goto out; + return ret; ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen); if (ret < 0) { if (ret == -ENOENT) ret = 0; - goto out; + return ret; } /* Different inode, no need to delay the rename of sctx->cur_ino */ - if (right_gen != left_gen) { - ret = 0; - goto out; - } + if (right_gen != left_gen) + return 0; wdm = get_waiting_dir_move(sctx, di_key.objectid); if (wdm && !wdm->orphanized) { @@ -3826,8 +3781,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, if (!ret) ret = 1; } -out: - btrfs_free_path(path); return ret; } @@ -3877,7 +3830,7 @@ static int is_ancestor(struct btrfs_root *root, bool free_fs_path = false; int ret = 0; int iter_ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; if (!fs_path) { @@ -3945,7 +3898,6 @@ static int is_ancestor(struct btrfs_root *root, ret = iter_ret; out: - btrfs_free_path(path); if (free_fs_path) fs_path_free(fs_path); return ret; @@ -4756,8 +4708,8 @@ static int record_new_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, record_new_ref_if_needed, sctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, + false, record_new_ref_if_needed, sctx); if (ret < 0) return ret; @@ -4768,9 +4720,8 @@ static int record_deleted_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, record_deleted_ref_if_needed, - sctx); + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, + false, record_deleted_ref_if_needed, sctx); if (ret < 0) return ret; @@ -4781,12 +4732,12 @@ static int record_changed_ref(struct 
send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, record_new_ref_if_needed, sctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, + false, record_new_ref_if_needed, sctx); if (ret < 0) return ret; - ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, + false, record_deleted_ref_if_needed, sctx); if (ret < 0) return ret; @@ -4803,7 +4754,7 @@ static int process_all_refs(struct send_ctx *sctx, int ret = 0; int iter_ret = 0; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; iterate_inode_ref_t cb; @@ -4822,8 +4773,7 @@ static int process_all_refs(struct send_ctx *sctx, } else { btrfs_err(sctx->send_root->fs_info, "Wrong command %d in process_all_refs", cmd); - ret = -EINVAL; - goto out; + return -EINVAL; } key.objectid = sctx->cmp_key->objectid; @@ -4835,15 +4785,14 @@ static int process_all_refs(struct send_ctx *sctx, found_key.type != BTRFS_INODE_EXTREF_KEY)) break; - ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); + ret = iterate_inode_ref(root, path, &found_key, false, cb, sctx); if (ret < 0) - goto out; + return ret; } /* Catch error found during iteration */ - if (iter_ret < 0) { - ret = iter_ret; - goto out; - } + if (iter_ret < 0) + return iter_ret; + btrfs_release_path(path); /* @@ -4851,10 +4800,7 @@ static int process_all_refs(struct send_ctx *sctx, * re-creating this inode and will be rename'ing it into place once we * rename the parent directory. */ - ret = process_recorded_refs(sctx, &pending_move); -out: - btrfs_free_path(path); - return ret; + return process_recorded_refs(sctx, &pending_move); } static int send_set_xattr(struct send_ctx *sctx, @@ -5080,7 +5026,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx) int ret = 0; int iter_ret = 0; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; @@ -5108,7 +5054,6 @@ static int process_all_new_xattrs(struct send_ctx *sctx) if (iter_ret < 0) ret = iter_ret; - btrfs_free_path(path); return ret; } @@ -5254,7 +5199,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) if (!folio_test_uptodate(folio)) { btrfs_read_folio(NULL, folio); folio_lock(folio); - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); btrfs_err(fs_info, "send: IO error at offset %llu for inode %llu root %llu", @@ -5656,7 +5601,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + /* + * Do not go through encoded read for bs > ps cases. + * + * Encoded send uses vmalloc'ed pages as its buffer, so we cannot + * ensure that every folio is large enough to contain a block.
+ */ + if (sctx->send_root->fs_info->sectorsize <= PAGE_SIZE && + (sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { bool is_inline = (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE); @@ -5766,7 +5718,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, */ static int send_capabilities(struct send_ctx *sctx) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; struct extent_buffer *leaf; unsigned long data_ptr; @@ -5804,7 +5756,6 @@ static int send_capabilities(struct send_ctx *sctx) strlen(XATTR_NAME_CAPS), buf, buf_len); out: kfree(buf); - btrfs_free_path(path); return ret; } @@ -5812,7 +5763,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, struct clone_root *clone_root, const u64 disk_byte, u64 data_offset, u64 offset, u64 len) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret; struct btrfs_inode_info info; @@ -5848,7 +5799,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = get_inode_info(clone_root->root, clone_root->ino, &info); btrfs_release_path(path); if (ret < 0) - goto out; + return ret; clone_src_i_size = info.size; /* @@ -5878,7 +5829,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, key.offset = clone_root->offset; ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0 && path->slots[0] > 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); if (key.objectid == clone_root->ino && @@ -5899,7 +5850,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); if (ret < 0) - goto out; + return ret; else if (ret > 0) break; continue; @@ -5936,7 +5887,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_extent_data(sctx, dst_path, offset, hole_len); if (ret < 0) - goto out; + return ret; len -= hole_len; if (len == 0) @@ -6007,7 +5958,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_clone(sctx, offset, slen, clone_root); if (ret < 0) - goto out; + return ret; } ret = send_extent_data(sctx, dst_path, offset + slen, @@ -6041,7 +5992,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, } if (ret < 0) - goto out; + return ret; len -= clone_len; if (len == 0) @@ -6072,8 +6023,6 @@ next: ret = send_extent_data(sctx, dst_path, offset, len); else ret = 0; -out: - btrfs_free_path(path); return ret; } @@ -6162,7 +6111,7 @@ static int is_extent_unchanged(struct send_ctx *sctx, { int ret = 0; struct btrfs_key key; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; int slot; struct btrfs_key found_key; @@ -6188,10 +6137,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); left_type = btrfs_file_extent_type(eb, ei); - if (left_type != BTRFS_FILE_EXTENT_REG) { - ret = 0; - goto out; - } + if (left_type != BTRFS_FILE_EXTENT_REG) + return 0; + left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); left_len = btrfs_file_extent_num_bytes(eb, ei); left_offset = btrfs_file_extent_offset(eb, ei); @@ -6223,11 +6171,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, key.offset = ekey->offset; ret = 
btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0); if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + return ret; + if (ret) + return 0; /* * Handle special case where the right side has no extents at all. @@ -6236,11 +6182,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || - found_key.type != key.type) { + found_key.type != key.type) /* If we're a hole then just pretend nothing changed */ - ret = (left_disknr) ? 0 : 1; - goto out; - } + return (left_disknr ? 0 : 1); /* * We're now on 2a, 2b or 7. @@ -6250,10 +6194,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); right_type = btrfs_file_extent_type(eb, ei); if (right_type != BTRFS_FILE_EXTENT_REG && - right_type != BTRFS_FILE_EXTENT_INLINE) { - ret = 0; - goto out; - } + right_type != BTRFS_FILE_EXTENT_INLINE) + return 0; if (right_type == BTRFS_FILE_EXTENT_INLINE) { right_len = btrfs_file_extent_ram_bytes(eb, ei); @@ -6266,11 +6208,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, * Are we at extent 8? If yes, we know the extent is changed. * This may only happen on the first iteration. */ - if (found_key.offset + right_len <= ekey->offset) { + if (found_key.offset + right_len <= ekey->offset) /* If we're a hole just pretend nothing changed */ - ret = (left_disknr) ? 0 : 1; - goto out; - } + return (left_disknr ? 0 : 1); /* * We just wanted to see if when we have an inline extent, what @@ -6280,10 +6220,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, * compressed extent representing data with a size matching * the page size (currently the same as sector size). */ - if (right_type == BTRFS_FILE_EXTENT_INLINE) { - ret = 0; - goto out; - } + if (right_type == BTRFS_FILE_EXTENT_INLINE) + return 0; right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); right_offset = btrfs_file_extent_offset(eb, ei); @@ -6303,17 +6241,15 @@ static int is_extent_unchanged(struct send_ctx *sctx, */ if (left_disknr != right_disknr || left_offset_fixed != right_offset || - left_gen != right_gen) { - ret = 0; - goto out; - } + left_gen != right_gen) + return 0; /* * Go to the next extent. 
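[Editor's note: btrfs_next_item(), used immediately below, follows the same tri-state convention as the other tree walkers: 0 advances the path to the next item, 1 means the tree is exhausted, negative is an error. The loop that follows treats exhaustion as "the parent has no further extents to compare", roughly:

	ret = btrfs_next_item(root, path);
	if (ret < 0)
		return ret;	/* error */
	if (ret > 0)
		break;		/* no more items in this tree */
]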
*/ ret = btrfs_next_item(sctx->parent_root, path); if (ret < 0) - goto out; + return ret; if (!ret) { eb = path->nodes[0]; slot = path->slots[0]; @@ -6324,10 +6260,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, key.offset += right_len; break; } - if (found_key.offset != key.offset + right_len) { - ret = 0; - goto out; - } + if (found_key.offset != key.offset + right_len) + return 0; + key = found_key; } @@ -6340,15 +6275,12 @@ static int is_extent_unchanged(struct send_ctx *sctx, else ret = 0; - -out: - btrfs_free_path(path); return ret; } static int get_last_extent(struct send_ctx *sctx, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = sctx->send_root; struct btrfs_key key; int ret; @@ -6364,15 +6296,13 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset) key.offset = offset; ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); if (ret < 0) - goto out; + return ret; ret = 0; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) - goto out; + return ret; sctx->cur_inode_last_extent = btrfs_file_extent_end(path); -out: - btrfs_free_path(path); return ret; } @@ -6380,7 +6310,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, const u64 start, const u64 end) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root = sctx->parent_root; u64 search_start = start; @@ -6395,7 +6325,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, key.offset = search_start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0 && path->slots[0] > 0) path->slots[0]--; @@ -6408,8 +6338,8 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; - else if (ret > 0) + return ret; + if (ret > 0) break; continue; } @@ -6431,15 +6361,11 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, search_start = extent_end; goto next; } - ret = 0; - goto out; + return 0; next: path->slots[0]++; } - ret = 1; -out: - btrfs_free_path(path); - return ret; + return 1; } static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, @@ -6547,7 +6473,7 @@ static int process_all_extents(struct send_ctx *sctx) int ret = 0; int iter_ret = 0; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; @@ -6574,11 +6500,10 @@ static int process_all_extents(struct send_ctx *sctx) if (iter_ret < 0) ret = iter_ret; - btrfs_free_path(path); return ret; } -static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, +static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end, int *pending_move, int *refs_processed) { @@ -6601,7 +6526,7 @@ out: return ret; } -static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) +static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end) { int ret = 0; struct btrfs_inode_info info; @@ -7036,7 +6961,7 @@ static int changed_ref(struct send_ctx *sctx, { int ret = 0; - if (sctx->cur_ino != sctx->cmp_key->objectid) { + if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) { inconsistent_snapshot_error(sctx, result, "reference"); return -EIO; } @@ -7064,7 +6989,7 @@ static int changed_xattr(struct send_ctx *sctx, { int ret = 0; - if (sctx->cur_ino != sctx->cmp_key->objectid) { + if 
(unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) { inconsistent_snapshot_error(sctx, result, "xattr"); return -EIO; } @@ -7304,7 +7229,7 @@ static int search_key_again(const struct send_ctx *sctx, */ ret = btrfs_search_slot(NULL, root, key, path, 0, 0); ASSERT(ret <= 0); - if (ret > 0) { + if (unlikely(ret > 0)) { btrfs_print_tree(path->nodes[path->lowest_level], false); btrfs_err(root->fs_info, "send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", @@ -7324,7 +7249,7 @@ static int full_send_tree(struct send_ctx *sctx) struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_fs_info *fs_info = send_root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = alloc_path_for_send(); if (!path) @@ -7341,7 +7266,7 @@ static int full_send_tree(struct send_ctx *sctx) ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) - goto out; + return ret; if (ret) goto out_finish; @@ -7351,7 +7276,7 @@ static int full_send_tree(struct send_ctx *sctx) ret = changed_cb(path, NULL, &key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) - goto out; + return ret; down_read(&fs_info->commit_root_sem); if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { @@ -7370,14 +7295,14 @@ static int full_send_tree(struct send_ctx *sctx) btrfs_release_path(path); ret = search_key_again(sctx, send_root, path, &key); if (ret < 0) - goto out; + return ret; } else { up_read(&fs_info->commit_root_sem); } ret = btrfs_next_item(send_root, path); if (ret < 0) - goto out; + return ret; if (ret) { ret = 0; break; @@ -7385,11 +7310,7 @@ static int full_send_tree(struct send_ctx *sctx) } out_finish: - ret = finish_inode_if_needed(sctx, 1); - -out: - btrfs_free_path(path); - return ret; + return finish_inode_if_needed(sctx, 1); } static int replace_node_with_clone(struct btrfs_path *path, int level) @@ -7644,8 +7565,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; int cmp; - struct btrfs_path *left_path = NULL; - struct btrfs_path *right_path = NULL; + BTRFS_PATH_AUTO_FREE(left_path); + BTRFS_PATH_AUTO_FREE(right_path); struct btrfs_key left_key; struct btrfs_key right_key; char *tmp_buf = NULL; @@ -7918,8 +7839,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, out_unlock: up_read(&fs_info->commit_root_sem); out: - btrfs_free_path(left_path); - btrfs_free_path(right_path); kvfree(tmp_buf); return ret; } @@ -7986,7 +7905,7 @@ static int ensure_commit_roots_uptodate(struct send_ctx *sctx) } /* - * Make sure any existing dellaloc is flushed for any root used by a send + * Make sure any existing delalloc is flushed for any root used by a send * operation so that we do not miss any data and we do not race with writeback * finishing and changing a tree while send is using the tree. This could * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 0481c693ac2e..97452fb5d29b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -479,7 +479,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, /* * On the zoned mode, we always allocate one zone as one chunk. - * Returning non-zone size alingned bytes here will result in + * Returning non-zone size aligned bytes here will result in * less pressure for the async metadata reclaim process, and it * will over-commit too much leading to ENOSPC. Align down to the * zone size to avoid that. 
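The zoned-mode rule in the comment above amounts to a single round-down of the available byte count. A minimal userspace sketch of that alignment, assuming zone sizes are powers of two (the names and values below are illustrative stand-ins, not btrfs helpers):

#include <stdint.h>
#include <stdio.h>

/* Round avail down to a multiple of zone_size (power-of-two zone sizes only). */
static inline uint64_t zone_align_down(uint64_t avail, uint64_t zone_size)
{
	return avail & ~(zone_size - 1);
}

int main(void)
{
	const uint64_t mib = 1024 * 1024;

	/* 700 MiB free with 256 MiB zones: report 512 MiB to avoid over-commit. */
	printf("%llu MiB usable\n",
	       (unsigned long long)(zone_align_down(700 * mib, 256 * mib) / mib));
	return 0;
}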
@@ -1528,7 +1528,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, * turned into error mode due to a transaction abort when flushing space * above, in that case fail with the abort error instead of returning * success to the caller if we can steal from the global rsv - this is - * just to have caller fail immeditelly instead of later when trying to + * just to have caller fail immediately instead of later when trying to * modify the fs, making it easier to debug -ENOSPC problems. */ if (BTRFS_FS_ERROR(fs_info)) { @@ -1830,7 +1830,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, flush, "enospc"); - queue_work(system_unbound_wq, async_work); + queue_work(system_dfl_wq, async_work); } } else { list_add_tail(&ticket.list, @@ -1847,7 +1847,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, need_preemptive_reclaim(fs_info, space_info)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); - queue_work(system_unbound_wq, + queue_work(system_dfl_wq, &fs_info->preempt_reclaim_work); } } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index cb4f97833dc3..5ca8d4db6722 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -690,7 +690,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, \ GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ btrfs_warn(fs_info, \ - "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ + "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ start, len, folio_pos(folio), \ blocks_per_folio, &bitmap); \ } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index ee0710eb13fd..ad0552db7c7d 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -13,7 +13,7 @@ struct address_space; struct folio; /* - * Extra info for subpapge bitmap. + * Extra info for subpage bitmap. * * For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into * one larger bitmap. 
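The subpage.h hunk above ends with the comment describing how the per-block uptodate/dirty/writeback/ordered bitmaps are packed into one larger bitmap. A small standalone sketch of that packing idea, with stand-in names and sizes rather than the btrfs definitions:

#include <stdbool.h>
#include <stdio.h>

#define BLOCKS_PER_FOLIO 16	/* illustrative, e.g. a 64K folio of 4K blocks */

enum demo_bitmap { DEMO_UPTODATE, DEMO_DIRTY, DEMO_WRITEBACK };

/* All logical bitmaps live in one word, each at a fixed bit offset. */
static unsigned long packed;

static void demo_set(enum demo_bitmap which, unsigned int block)
{
	packed |= 1UL << (which * BLOCKS_PER_FOLIO + block);
}

static bool demo_test(enum demo_bitmap which, unsigned int block)
{
	return packed & (1UL << (which * BLOCKS_PER_FOLIO + block));
}

int main(void)
{
	demo_set(DEMO_DIRTY, 3);
	printf("dirty[3]=%d uptodate[3]=%d\n",
	       demo_test(DEMO_DIRTY, 3), demo_test(DEMO_UPTODATE, 3));
	return 0;
}

Keeping every per-block state in one bitmap is what lets the GET_SUBPAGE_BITMAP() dump in the subpage.c hunk above print any named bitmap with a single %*pbl.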
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a262b494a89f..d6e496436539 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -133,9 +133,8 @@ enum { Opt_enospc_debug, #ifdef CONFIG_BTRFS_DEBUG Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY Opt_ref_verify, + Opt_ref_tracker, #endif Opt_err, }; @@ -257,8 +256,7 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { fsparam_flag_no("enospc_debug", Opt_enospc_debug), #ifdef CONFIG_BTRFS_DEBUG fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment), -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY + fsparam_flag("ref_tracker", Opt_ref_tracker), fsparam_flag("ref_verify", Opt_ref_verify), #endif {} @@ -276,6 +274,7 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, const struct fs_parameter *param, int opt) { const char *string = param->string; + int ret; /* * Provide the same semantics as older kernels that don't use fs @@ -294,21 +293,30 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "zlib", true)) { ctx->compress_type = BTRFS_COMPRESS_ZLIB; - ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, - string + 4); + ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, string + 4, + &ctx->compress_level); + if (ret < 0) + goto error; btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (btrfs_match_compress_type(string, "lzo", false)) { + } else if (btrfs_match_compress_type(string, "lzo", true)) { ctx->compress_type = BTRFS_COMPRESS_LZO; - ctx->compress_level = 0; + ret = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, string + 3, + &ctx->compress_level); + if (ret < 0) + goto error; + if (string[3] == ':' && string[4]) + btrfs_warn(NULL, "Compression level ignored for LZO"); btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "zstd", true)) { ctx->compress_type = BTRFS_COMPRESS_ZSTD; - ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, - string + 4); + ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, string + 4, + &ctx->compress_level); + if (ret < 0) + goto error; btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); @@ -319,10 +327,14 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_clear_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); } else { - btrfs_err(NULL, "unrecognized compression value %s", string); - return -EINVAL; + ret = -EINVAL; + goto error; } return 0; +error: + btrfs_err(NULL, "failed to parse compression option '%s'", string); + return ret; + } static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) @@ -632,11 +644,12 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } break; -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY case Opt_ref_verify: btrfs_set_opt(ctx->mount_opt, REF_VERIFY); break; + case Opt_ref_tracker: + btrfs_set_opt(ctx->mount_opt, REF_TRACKER); + break; #endif default: btrfs_err(NULL, "unrecognized mount option '%s'", param->key); @@ -912,7 +925,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec { struct btrfs_root *root = 
fs_info->tree_root; struct btrfs_dir_item *di; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key location; struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; @@ -929,7 +942,6 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0); if (IS_ERR(di)) { - btrfs_free_path(path); return PTR_ERR(di); } if (!di) { @@ -938,13 +950,11 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec * it's always been there, but don't freak out, just try and * mount the top-level subvolume. */ - btrfs_free_path(path); *objectid = BTRFS_FS_TREE_OBJECTID; return 0; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - btrfs_free_path(path); *objectid = location.objectid; return 0; } @@ -1079,7 +1089,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); - if (info->compress_level) + if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO) seq_printf(seq, ":%d", info->compress_level); } if (btrfs_test_opt(info, NOSSD)) @@ -1142,6 +1152,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) #endif if (btrfs_test_opt(info, REF_VERIFY)) seq_puts(seq, ",ref_verify"); + if (btrfs_test_opt(info, REF_TRACKER)) + seq_puts(seq, ",ref_tracker"); seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); subvol_name = btrfs_get_subvol_name_from_objectid(info, btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); @@ -1268,7 +1280,7 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); /* - * We need to cleanup all defragable inodes if the autodefragment is + * We need to cleanup all defraggable inodes if the autodefragment is * close or the filesystem is read only. */ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && @@ -2260,10 +2272,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, device = btrfs_scan_one_device(vol->name, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); - if (IS_ERR(device)) - ret = PTR_ERR(device); - else - ret = 0; + ret = PTR_ERR_OR_ZERO(device); break; } ret = !(device->fs_devices->num_devices == @@ -2316,14 +2325,14 @@ static int check_dev_super(struct btrfs_device *dev) /* Verify the checksum. 
*/ csum_type = btrfs_super_csum_type(sb); - if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) { + if (unlikely(csum_type != btrfs_super_csum_type(fs_info->super_copy))) { btrfs_err(fs_info, "csum type changed, has %u expect %u", csum_type, btrfs_super_csum_type(fs_info->super_copy)); ret = -EUCLEAN; goto out; } - if (btrfs_check_super_csum(fs_info, sb)) { + if (unlikely(btrfs_check_super_csum(fs_info, sb))) { btrfs_err(fs_info, "csum for on-disk super block no longer matches"); ret = -EUCLEAN; goto out; @@ -2335,7 +2344,7 @@ static int check_dev_super(struct btrfs_device *dev) goto out; last_trans = btrfs_get_last_trans_committed(fs_info); - if (btrfs_super_generation(sb) != last_trans) { + if (unlikely(btrfs_super_generation(sb) != last_trans)) { btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", btrfs_super_generation(sb), last_trans); ret = -EUCLEAN; @@ -2472,9 +2481,6 @@ static int __init btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_ASSERT ", assert=on" #endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - ", ref-verify=on" -#endif #ifdef CONFIG_BLK_DEV_ZONED ", zoned=yes" #else diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 9d398f7a36ad..81f52c1f55ce 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -409,13 +409,17 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, char *buf) { ssize_t ret = 0; + bool has_output = false; - if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE) - ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE); - if (PAGE_SIZE > SZ_4K) - ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); - ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); - + for (u32 cur = BTRFS_MIN_BLOCKSIZE; cur <= BTRFS_MAX_BLOCKSIZE; cur *= 2) { + if (!btrfs_supported_blocksize(cur)) + continue; + if (has_output) + ret += sysfs_emit_at(buf, ret, " "); + ret += sysfs_emit_at(buf, ret, "%u", cur); + has_output = true; + } + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_sectorsizes, diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c index 265370e79a54..e2248acb906b 100644 --- a/fs/btrfs/tests/delayed-refs-tests.c +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -997,12 +997,12 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) ret = simple_tests(&trans); if (!ret) { - test_msg("running delayed refs merg tests on metadata refs"); + test_msg("running delayed refs merge tests on metadata refs"); ret = merge_tests(&trans, BTRFS_REF_METADATA); } if (!ret) { - test_msg("running delayed refs merg tests on data refs"); + test_msg("running delayed refs merge tests on data refs"); ret = merge_tests(&trans, BTRFS_REF_DATA); } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 3a86534c116f..42af6c737c6e 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -1095,7 +1095,7 @@ int btrfs_test_extent_map(void) /* * Test a chunk with 2 data stripes one of which * intersects the physical address of the super block - * is correctly recognised. + * is correctly recognized. */ .raid_type = BTRFS_BLOCK_GROUP_RAID1, .physical_start = SZ_64M - SZ_4M, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c5c0d9cf1a80..89ae0c7a610a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -103,7 +103,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep; * | attached to transid N+1. 
| * | | * | To next stage: | - * | Until all tree blocks are super blocks are | + * | Until all tree blocks and super blocks are | * | written to block devices | * V | * Transaction N [[TRANS_STATE_COMPLETED]] V @@ -404,7 +404,7 @@ loop: */ static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int force) + bool force) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; @@ -1569,7 +1569,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * qgroup counters could end up wrong. */ ret = btrfs_run_delayed_refs(trans, U64_MAX); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -1641,7 +1641,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; struct btrfs_inode *parent_inode = pending->dir; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; struct extent_buffer *old; @@ -1694,10 +1694,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto clear_skip_qgroup; } - key.objectid = objectid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; trans->bytes_reserved = trans->block_rsv->reserved; @@ -1714,7 +1710,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * insert the directory item */ ret = btrfs_set_inode_index(parent_inode, &index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1735,7 +1731,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_create_qgroup(trans, objectid); if (ret && ret != -EEXIST) { - if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) { + if (unlikely(ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info))) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1748,13 +1744,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * snapshot */ ret = btrfs_run_delayed_items(trans); - if (ret) { /* Transaction aborted */ + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } ret = record_root_in_trans(trans, root, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1789,7 +1785,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, old = btrfs_lock_root_node(root); ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, BTRFS_NESTING_COW); - if (ret) { + if (unlikely(ret)) { btrfs_tree_unlock(old); free_extent_buffer(old); btrfs_abort_transaction(trans, ret); @@ -1800,21 +1796,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* clean up in any case */ btrfs_tree_unlock(old); free_extent_buffer(old); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } /* see comments in should_cow_block() */ set_bit(BTRFS_ROOT_FORCE_COW, &root->state); - smp_wmb(); + smp_mb__after_atomic(); btrfs_set_root_node(new_root_item, tmp); /* record when the snapshot was created in key.offset */ + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; key.offset = trans->transid; ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1826,7 +1824,7 @@ static noinline int 
create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_root_id(parent_root), btrfs_ino(parent_inode), index, &fname.disk_name); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1841,7 +1839,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } ret = btrfs_reloc_post_snapshot(trans, pending); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1864,7 +1862,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, &fname.disk_name, parent_inode, &key, BTRFS_FT_DIR, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1874,14 +1872,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, inode_set_mtime_to_ts(&parent_inode->vfs_inode, inode_set_ctime_current(&parent_inode->vfs_inode)); ret = btrfs_update_inode_fallback(trans, parent_inode); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_uuid_tree_add(trans, new_root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1889,7 +1887,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); - if (ret && ret != -EEXIST) { + if (unlikely(ret && ret != -EEXIST)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1907,7 +1905,6 @@ free_fname: free_pending: kfree(new_root_item); pending->root_item = NULL; - btrfs_free_path(path); pending->path = NULL; return ret; @@ -2423,7 +2420,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * them. * * We needn't worry that this operation will corrupt the snapshots, - * because all the tree which are snapshoted will be forced to COW + * because all the tree which are snapshotted will be forced to COW * the nodes and leaves. */ ret = btrfs_run_delayed_items(trans); @@ -2657,9 +2654,9 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - ret = btrfs_drop_snapshot(root, 0, 0); + ret = btrfs_drop_snapshot(root, false, false); else - ret = btrfs_drop_snapshot(root, 1, 0); + ret = btrfs_drop_snapshot(root, true, false); btrfs_put_root(root); return (ret < 0) ? 0 : 1; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 0f556f4de3f9..ca30b15ea452 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -183,6 +183,7 @@ static bool check_prev_ino(struct extent_buffer *leaf, /* Only these key->types needs to be checked */ ASSERT(key->type == BTRFS_XATTR_ITEM_KEY || key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY || key->type == BTRFS_DIR_INDEX_KEY || key->type == BTRFS_DIR_ITEM_KEY || key->type == BTRFS_EXTENT_DATA_KEY); @@ -1209,7 +1210,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, /* * For legacy root item, the members starting at generation_v2 will be * all filled with 0. - * And since we allow geneartion_v2 as 0, it will still pass the check. + * And since we allow generation_v2 as 0, it will still pass the check. 
*/ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), btrfs_item_size(leaf, slot)); @@ -1756,10 +1757,10 @@ static int check_inode_ref(struct extent_buffer *leaf, while (ptr < end) { u16 namelen; - if (unlikely(ptr + sizeof(iref) > end)) { + if (unlikely(ptr + sizeof(*iref) > end)) { inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", - ptr, end, sizeof(iref)); + ptr, end, sizeof(*iref)); return -EUCLEAN; } @@ -1782,6 +1783,39 @@ static int check_inode_ref(struct extent_buffer *leaf, return 0; } +static int check_inode_extref(struct extent_buffer *leaf, + struct btrfs_key *key, struct btrfs_key *prev_key, + int slot) +{ + unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); + unsigned long end = ptr + btrfs_item_size(leaf, slot); + + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) + return -EUCLEAN; + + while (ptr < end) { + struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr; + u16 namelen; + + if (unlikely(ptr + sizeof(*extref) > end)) { + inode_ref_err(leaf, slot, + "inode extref overflow, ptr %lu end %lu inode_extref size %zu", + ptr, end, sizeof(*extref)); + return -EUCLEAN; + } + + namelen = btrfs_inode_extref_name_len(leaf, extref); + if (unlikely(ptr + sizeof(*extref) + namelen > end)) { + inode_ref_err(leaf, slot, + "inode extref overflow, ptr %lu end %lu namelen %u", + ptr, end, namelen); + return -EUCLEAN; + } + ptr += sizeof(*extref) + namelen; + } + return 0; +} + static int check_raid_stripe_extent(const struct extent_buffer *leaf, const struct btrfs_key *key, int slot) { @@ -1893,6 +1927,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, case BTRFS_INODE_REF_KEY: ret = check_inode_ref(leaf, key, prev_key, slot); break; + case BTRFS_INODE_EXTREF_KEY: + ret = check_inode_extref(leaf, key, prev_key, slot); + break; case BTRFS_BLOCK_GROUP_ITEM_KEY: ret = check_block_group_item(leaf, key, slot); break;
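The new check_inode_extref() above walks a variable-length item made of back-to-back (fixed header, name bytes) records, checking that each header fits before trusting its name_len and that the name itself fits before advancing. A standalone sketch of that bounds-checked walk, using stand-in types rather than the on-disk btrfs structures:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_extref {
	uint64_t parent;	/* stand-in for the btrfs_inode_extref header */
	uint16_t name_len;
	/* name_len bytes of name follow immediately */
} __attribute__((packed));

static int walk_extrefs(const uint8_t *item, size_t item_size)
{
	size_t off = 0;

	while (off < item_size) {
		struct demo_extref hdr;

		/* The fixed header must fit before name_len can be trusted. */
		if (off + sizeof(hdr) > item_size)
			return -1;
		memcpy(&hdr, item + off, sizeof(hdr));
		/* The name must fit too, otherwise the item is corrupt. */
		if (off + sizeof(hdr) + hdr.name_len > item_size)
			return -1;
		printf("parent %llu name %.*s\n", (unsigned long long)hdr.parent,
		       hdr.name_len, (const char *)(item + off + sizeof(hdr)));
		off += sizeof(hdr) + hdr.name_len;
	}
	return 0;
}

int main(void)
{
	uint8_t buf[32];
	struct demo_extref hdr = { .parent = 256, .name_len = 3 };

	memcpy(buf, &hdr, sizeof(hdr));
	memcpy(buf + sizeof(hdr), "foo", 3);
	return walk_extrefs(buf, sizeof(hdr) + 3);
}

Both overflow checks compare against the item end before any field is dereferenced, mirroring the two checks in the hunk above.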
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 69e11557fd13..6aad6b65522b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #include "file-item.h" #include "file.h" #include "orphan.h" +#include "print-tree.h" #include "tree-checker.h" #define MAX_CONFLICT_INODES 10 @@ -101,17 +102,134 @@ enum { LOG_WALK_REPLAY_ALL, }; +/* + * The walk control struct is used to pass state down the chain when processing + * the log tree. The stage field tells us which part of the log tree processing + * we are currently doing. + */ +struct walk_control { + /* + * Signal that we are freeing the metadata extents of a log tree. + * This is used at transaction commit time while freeing a log tree. + */ + bool free; + + /* + * Signal that we are pinning the metadata extents of a log tree and the + * data extents its leaves point to (if using mixed block groups). + * This happens in the first stage of log replay to ensure that during + * replay, while we are modifying subvolume trees, we don't overwrite + * the metadata extents of log trees. + */ + bool pin; + + /* What stage of the replay code we're currently in. */ + int stage; + + /* + * Ignore any items from the inode currently being processed. Needs + * to be set every time we find a BTRFS_INODE_ITEM_KEY. + */ + bool ignore_cur_inode; + + /* + * The root we are currently replaying to. This is NULL for the replay + * stage LOG_WALK_PIN_ONLY. + */ + struct btrfs_root *root; + + /* The log tree we are currently processing (not NULL for any stage). */ + struct btrfs_root *log; + + /* The transaction handle used for replaying all log trees. */ + struct btrfs_trans_handle *trans; + + /* + * The function that gets used to process blocks we find in the tree. + * Note the extent_buffer might not be up to date when it is passed in, + * and it must be checked or read if you need the data inside it. + */ + int (*process_func)(struct extent_buffer *eb, + struct walk_control *wc, u64 gen, int level); + + /* + * The following are used only when stage is >= LOG_WALK_REPLAY_INODES + * and by the replay_one_buffer() callback. + */ + + /* The current log leaf being processed. */ + struct extent_buffer *log_leaf; + /* The key being processed in the current log leaf. */ + struct btrfs_key log_key; + /* The slot being processed in the current log leaf. */ + int log_slot; + + /* A path used for searches and modifications to subvolume trees. */ + struct btrfs_path *subvol_path; +}; + +static void do_abort_log_replay(struct walk_control *wc, const char *function, + unsigned int line, int error, const char *fmt, ...) +{ + struct btrfs_fs_info *fs_info = wc->trans->fs_info; + struct va_format vaf; + va_list args; + + /* + * Do nothing if we already aborted, to avoid dumping leaves again which + * can be verbose. Furthermore, only the first call is useful since it + * is where we have a problem. Note that we do not use the flag + * BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that + * are outside of tree-log.c that can abort transactions (such as + * btrfs_add_link() for example), so if that happens we still want to + * dump all log replay specific information below. + */ + if (test_and_set_bit(BTRFS_FS_STATE_LOG_REPLAY_ABORTED, &fs_info->fs_state)) + return; + + btrfs_abort_transaction(wc->trans, error); + + if (wc->subvol_path->nodes[0]) { + btrfs_crit(fs_info, + "subvolume (root %llu) leaf currently being processed:", + btrfs_root_id(wc->root)); + btrfs_print_leaf(wc->subvol_path->nodes[0]); + } + + if (wc->log_leaf) { + btrfs_crit(fs_info, + "log tree (for root %llu) leaf currently being processed (slot %d key %llu %u %llu):", + btrfs_root_id(wc->root), wc->log_slot, + wc->log_key.objectid, wc->log_key.type, wc->log_key.offset); + btrfs_print_leaf(wc->log_leaf); + } + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(fs_info, + "log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV", + function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf); + + va_end(args); +} + +/* + * Use this for aborting a transaction during log replay while we are down the + * call chain of replay_one_buffer(), so that we get a lot more useful + * information for debugging issues when compared to a plain call to + * btrfs_abort_transaction(). + */ +#define btrfs_abort_log_replay(wc, error, fmt, args...) 
\ + do_abort_log_replay((wc), __func__, __LINE__, (error), fmt, ##args) + static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx); -static int link_to_fixup_dir(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid); -static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, +static int link_to_fixup_dir(struct walk_control *wc, u64 objectid); +static noinline int replay_dir_deletes(struct walk_control *wc, u64 dirid, bool del_all); static void wait_log_commit(struct btrfs_root *root, int transid); @@ -300,53 +418,13 @@ void btrfs_end_log_trans(struct btrfs_root *root) } /* - * the walk control struct is used to pass state down the chain when - * processing the log tree. The stage field tells us which part - * of the log tree processing we are currently doing. The others - * are state fields used for that specific part - */ -struct walk_control { - /* should we free the extent on disk when done? This is used - * at transaction commit time while freeing a log tree - */ - int free; - - /* pin only walk, we record which extents on disk belong to the - * log trees - */ - int pin; - - /* what stage of the replay code we're currently in */ - int stage; - - /* - * Ignore any items from the inode currently being processed. Needs - * to be set every time we find a BTRFS_INODE_ITEM_KEY. - */ - bool ignore_cur_inode; - - /* the root we are currently replaying */ - struct btrfs_root *replay_dest; - - /* the trans handle for the current replay */ - struct btrfs_trans_handle *trans; - - /* the function that gets used to process blocks we find in the - * tree. Note the extent_buffer might not be up to date when it is - * passed in, and it must be checked or read if you need the data - * inside it - */ - int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, - struct walk_control *wc, u64 gen, int level); -}; - -/* * process_func used to pin down extents, write them or wait on them */ -static int process_one_buffer(struct btrfs_root *log, - struct extent_buffer *eb, +static int process_one_buffer(struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { + struct btrfs_root *log = wc->log; + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; @@ -361,25 +439,36 @@ static int process_one_buffer(struct btrfs_root *log, }; ret = btrfs_read_extent_buffer(eb, &check); - if (ret) + if (unlikely(ret)) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); return ret; + } } if (wc->pin) { - ret = btrfs_pin_extent_for_log_replay(wc->trans, eb); - if (ret) + ASSERT(trans != NULL); + ret = btrfs_pin_extent_for_log_replay(trans, eb); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); return ret; + } - if (btrfs_buffer_uptodate(eb, gen, 0) && - btrfs_header_level(eb) == 0) + if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) { ret = btrfs_exclude_logged_extents(eb); + if (ret) + btrfs_abort_transaction(trans, ret); + } } return ret; } /* - * Item overwrite used by log replay. The given eb, slot and key all refer to - * the source data we are copying out. + * Item overwrite used by log replay. The given log tree leaf, slot and key + * from the walk_control structure all refer to the source data we are copying + * out. 
* * The given root is for the tree we are copying into, and path is a scratch * path for use in this function (it should be released on entry and will be @@ -391,12 +480,10 @@ static int process_one_buffer(struct btrfs_root *log, * * If the key isn't in the destination yet, a new item is inserted. */ -static int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static int overwrite_item(struct walk_control *wc) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; int ret; u32 item_size; u64 saved_i_size = 0; @@ -405,7 +492,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, unsigned long dst_ptr; struct extent_buffer *dst_eb; int dst_slot; - bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; + const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY); /* * This is only used during log replay, so the root is always from a @@ -416,16 +503,21 @@ static int overwrite_item(struct btrfs_trans_handle *trans, */ ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); - item_size = btrfs_item_size(eb, slot); - src_ptr = btrfs_item_ptr_offset(eb, slot); + item_size = btrfs_item_size(wc->log_leaf, wc->log_slot); + src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot); /* Look for the key in the destination tree. */ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) + ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search subvolume tree for key (%llu %u %llu) root %llu", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset, btrfs_root_id(root)); return ret; + } - dst_eb = path->nodes[0]; - dst_slot = path->slots[0]; + dst_eb = wc->subvol_path->nodes[0]; + dst_slot = wc->subvol_path->slots[0]; if (ret == 0) { char *src_copy; @@ -435,16 +527,17 @@ static int overwrite_item(struct btrfs_trans_handle *trans, goto insert; if (item_size == 0) { - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } src_copy = kmalloc(item_size, GFP_NOFS); if (!src_copy) { - btrfs_release_path(path); + btrfs_abort_log_replay(wc, -ENOMEM, + "failed to allocate memory for log leaf item"); return -ENOMEM; } - read_extent_buffer(eb, src_copy, src_ptr, item_size); + read_extent_buffer(wc->log_leaf, src_copy, src_ptr, item_size); dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size); @@ -456,7 +549,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * sync */ if (ret == 0) { - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } @@ -464,7 +557,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * We need to load the old nbytes into the inode so when we * replay the extents we've logged we get the right nbytes. 
*/ - if (inode_item) { + if (is_inode_item) { struct btrfs_inode_item *item; u64 nbytes; u32 mode; @@ -472,20 +565,20 @@ static int overwrite_item(struct btrfs_trans_handle *trans, item = btrfs_item_ptr(dst_eb, dst_slot, struct btrfs_inode_item); nbytes = btrfs_inode_nbytes(dst_eb, item); - item = btrfs_item_ptr(eb, slot, + item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item); - btrfs_set_inode_nbytes(eb, item, nbytes); + btrfs_set_inode_nbytes(wc->log_leaf, item, nbytes); /* * If this is a directory we need to reset the i_size to * 0 so that we can set it up properly when replaying * the rest of the items in this log. */ - mode = btrfs_inode_mode(eb, item); + mode = btrfs_inode_mode(wc->log_leaf, item); if (S_ISDIR(mode)) - btrfs_set_inode_size(eb, item, 0); + btrfs_set_inode_size(wc->log_leaf, item, 0); } - } else if (inode_item) { + } else if (is_inode_item) { struct btrfs_inode_item *item; u32 mode; @@ -493,38 +586,41 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * New inode, set nbytes to 0 so that the nbytes comes out * properly when we replay the extents. */ - item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); - btrfs_set_inode_nbytes(eb, item, 0); + item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item); + btrfs_set_inode_nbytes(wc->log_leaf, item, 0); /* * If this is a directory we need to reset the i_size to 0 so * that we can set it up properly when replaying the rest of * the items in this log. */ - mode = btrfs_inode_mode(eb, item); + mode = btrfs_inode_mode(wc->log_leaf, item); if (S_ISDIR(mode)) - btrfs_set_inode_size(eb, item, 0); + btrfs_set_inode_size(wc->log_leaf, item, 0); } insert: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* try to insert the key into the destination tree */ - path->skip_release_on_error = 1; - ret = btrfs_insert_empty_item(trans, root, path, - key, item_size); - path->skip_release_on_error = 0; + wc->subvol_path->skip_release_on_error = 1; + ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size); + wc->subvol_path->skip_release_on_error = 0; - dst_eb = path->nodes[0]; - dst_slot = path->slots[0]; + dst_eb = wc->subvol_path->nodes[0]; + dst_slot = wc->subvol_path->slots[0]; /* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { const u32 found_size = btrfs_item_size(dst_eb, dst_slot); if (found_size > item_size) - btrfs_truncate_item(trans, path, item_size, 1); + btrfs_truncate_item(trans, wc->subvol_path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(trans, path, item_size - found_size); + btrfs_extend_item(trans, wc->subvol_path, item_size - found_size); } else if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to insert item for key (%llu %u %llu)", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset); return ret; } dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); @@ -538,15 +634,15 @@ insert: * state of the tree found in the subvolume, and i_size is modified * as it goes */ - if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + if (is_inode_item && ret == -EEXIST) { struct btrfs_inode_item *src_item; struct btrfs_inode_item *dst_item; src_item = (struct btrfs_inode_item *)src_ptr; dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(eb, src_item) == 0) { - const u64 ino_size = btrfs_inode_size(eb, src_item); + if (btrfs_inode_generation(wc->log_leaf, src_item) == 0) { + const u64 ino_size = 
btrfs_inode_size(wc->log_leaf, src_item); /* * For regular files an ino_size == 0 is used only when @@ -555,21 +651,21 @@ insert: * case don't set the size of the inode in the fs/subvol * tree, otherwise we would be throwing valid data away. */ - if (S_ISREG(btrfs_inode_mode(eb, src_item)) && + if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) && S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && ino_size != 0) btrfs_set_inode_size(dst_eb, dst_item, ino_size); goto no_copy; } - if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && + if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) && S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) { save_old_i_size = 1; saved_i_size = btrfs_inode_size(dst_eb, dst_item); } } - copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size); + copy_extent_buffer(dst_eb, wc->log_leaf, dst_ptr, src_ptr, item_size); if (save_old_i_size) { struct btrfs_inode_item *dst_item; @@ -579,7 +675,7 @@ insert: } /* make sure the generation is filled in */ - if (key->type == BTRFS_INODE_ITEM_KEY) { + if (is_inode_item) { struct btrfs_inode_item *dst_item; dst_item = (struct btrfs_inode_item *)dst_ptr; @@ -587,7 +683,7 @@ insert: btrfs_set_inode_generation(dst_eb, dst_item, trans->transid); } no_copy: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } @@ -618,292 +714,354 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, * The extent is inserted into the file, dropping any existing extents * from the file that overlap the new one. */ -static noinline int replay_one_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static noinline int replay_one_extent(struct walk_control *wc) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_fs_info *fs_info = root->fs_info; int found_type; u64 extent_end; - u64 start = key->offset; + const u64 start = wc->log_key.offset; u64 nbytes = 0; + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); + u64 offset; + unsigned long dest_offset; + struct btrfs_key ins; struct btrfs_file_extent_item *item; struct btrfs_inode *inode = NULL; - unsigned long size; int ret = 0; - item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(eb, item); + item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(wc->log_leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - nbytes = btrfs_file_extent_num_bytes(eb, item); - extent_end = start + nbytes; - - /* - * We don't add to the inodes nbytes if we are prealloc or a - * hole. - */ - if (btrfs_file_extent_disk_bytenr(eb, item) == 0) - nbytes = 0; + extent_end = start + btrfs_file_extent_num_bytes(wc->log_leaf, item); + /* Holes don't take up space. 
*/ + if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0) + nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_ram_bytes(eb, item); - nbytes = btrfs_file_extent_ram_bytes(eb, item); - extent_end = ALIGN(start + size, - fs_info->sectorsize); + nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item); + extent_end = ALIGN(start + nbytes, fs_info->sectorsize); } else { - btrfs_err(fs_info, - "unexpected extent type=%d root=%llu inode=%llu offset=%llu", - found_type, btrfs_root_id(root), key->objectid, key->offset); + btrfs_abort_log_replay(wc, -EUCLEAN, + "unexpected extent type=%d root=%llu inode=%llu offset=%llu", + found_type, btrfs_root_id(root), + wc->log_key.objectid, wc->log_key.offset); return -EUCLEAN; } - inode = btrfs_iget_logging(key->objectid, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); + inode = btrfs_iget_logging(wc->log_key.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to get inode %llu for root %llu", + wc->log_key.objectid, btrfs_root_id(root)); + return ret; + } /* * first check to see if we already have this extent in the * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. */ - ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0); + ret = btrfs_lookup_file_extent(trans, root, wc->subvol_path, + btrfs_ino(inode), start, 0); if (ret == 0 && (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC)) { + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; struct btrfs_file_extent_item existing; unsigned long ptr; - ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing)); + ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); + read_extent_buffer(leaf, &existing, ptr, sizeof(existing)); /* * we already have a pointer to this exact extent, * we don't have to do anything */ - if (memcmp_extent_buffer(eb, &existing, (unsigned long)item, + if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item, sizeof(existing)) == 0) { - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); goto out; } } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* drop any overlapping extents */ drop_args.start = start; drop_args.end = extent_end; drop_args.drop_cache = true; + drop_args.path = wc->subvol_path; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to drop extents for inode %llu range [%llu, %llu) root %llu", + wc->log_key.objectid, start, extent_end, + btrfs_root_id(root)); goto out; + } - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 offset; - unsigned long dest_offset; - struct btrfs_key ins; - - if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && - btrfs_fs_incompat(fs_info, NO_HOLES)) - goto update_inode; - - ret = btrfs_insert_empty_item(trans, root, path, key, - sizeof(*item)); + if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(wc); if (ret) goto out; - dest_offset = btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]); - copy_extent_buffer(path->nodes[0], eb, dest_offset, - (unsigned long)item, sizeof(*item)); + goto update_inode; + } - ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); - 
ins.type = BTRFS_EXTENT_ITEM_KEY; - ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); - offset = key->offset - btrfs_file_extent_offset(eb, item); + /* + * If not an inline extent, it can only be a regular or prealloc one. + * We have checked that above and returned -EUCLEAN if not. + */ - /* - * Manually record dirty extent, as here we did a shallow - * file extent item copy and skip normal backref update, - * but modifying extent tree all by ourselves. - * So need to manually record dirty extent for qgroup, - * as the owner of the file extent changed from log tree - * (doesn't affect qgroup) to fs/file tree(affects qgroup) - */ - ret = btrfs_qgroup_trace_extent(trans, - btrfs_file_extent_disk_bytenr(eb, item), - btrfs_file_extent_disk_num_bytes(eb, item)); - if (ret < 0) - goto out; + /* A hole and NO_HOLES feature enabled, nothing else to do. */ + if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0 && + btrfs_fs_incompat(fs_info, NO_HOLES)) + goto update_inode; - if (ins.objectid > 0) { - u64 csum_start; - u64 csum_end; - LIST_HEAD(ordered_sums); + ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, + &wc->log_key, sizeof(*item)); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to insert item with key (%llu %u %llu) root %llu", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset, btrfs_root_id(root)); + goto out; + } + dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0], + wc->subvol_path->slots[0]); + copy_extent_buffer(wc->subvol_path->nodes[0], wc->log_leaf, dest_offset, + (unsigned long)item, sizeof(*item)); - /* - * is this extent already allocated in the extent - * allocation tree? If so, just add a reference - */ - ret = btrfs_lookup_data_extent(fs_info, ins.objectid, - ins.offset); - if (ret < 0) { - goto out; - } else if (ret == 0) { - struct btrfs_ref ref = { - .action = BTRFS_ADD_DELAYED_REF, - .bytenr = ins.objectid, - .num_bytes = ins.offset, - .owning_root = btrfs_root_id(root), - .ref_root = btrfs_root_id(root), - }; - btrfs_init_data_ref(&ref, key->objectid, offset, - 0, false); - ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) - goto out; - } else { - /* - * insert the extent pointer in the extent - * allocation tree - */ - ret = btrfs_alloc_logged_file_extent(trans, - btrfs_root_id(root), - key->objectid, offset, &ins); - if (ret) - goto out; - } - btrfs_release_path(path); + /* + * We have an explicit hole and NO_HOLES is not enabled. We have added + * the hole file extent item to the subvolume tree, so we don't have + * anything else to do other than update the file extent item range and + * update the inode item. + */ + if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) { + btrfs_release_path(wc->subvol_path); + goto update_inode; + } - if (btrfs_file_extent_compression(eb, item)) { - csum_start = ins.objectid; - csum_end = csum_start + ins.offset; - } else { - csum_start = ins.objectid + - btrfs_file_extent_offset(eb, item); - csum_end = csum_start + - btrfs_file_extent_num_bytes(eb, item); - } + ins.objectid = btrfs_file_extent_disk_bytenr(wc->log_leaf, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = btrfs_file_extent_disk_num_bytes(wc->log_leaf, item); + offset = wc->log_key.offset - btrfs_file_extent_offset(wc->log_leaf, item); - ret = btrfs_lookup_csums_list(root->log_root, - csum_start, csum_end - 1, - &ordered_sums, false); - if (ret < 0) - goto out; - ret = 0; - /* - * Now delete all existing cums in the csum root that - * cover our range. 
We do this because we can have an - * extent that is completely referenced by one file - * extent item and partially referenced by another - * file extent item (like after using the clone or - * extent_same ioctls). In this case if we end up doing - * the replay of the one that partially references the - * extent first, and we do not do the csum deletion - * below, we can get 2 csum items in the csum tree that - * overlap each other. For example, imagine our log has - * the two following file extent items: - * - * key (257 EXTENT_DATA 409600) - * extent data disk byte 12845056 nr 102400 - * extent data offset 20480 nr 20480 ram 102400 - * - * key (257 EXTENT_DATA 819200) - * extent data disk byte 12845056 nr 102400 - * extent data offset 0 nr 102400 ram 102400 - * - * Where the second one fully references the 100K extent - * that starts at disk byte 12845056, and the log tree - * has a single csum item that covers the entire range - * of the extent: - * - * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 - * - * After the first file extent item is replayed, the - * csum tree gets the following csum item: - * - * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 - * - * Which covers the 20K sub-range starting at offset 20K - * of our extent. Now when we replay the second file - * extent item, if we do not delete existing csum items - * that cover any of its blocks, we end up getting two - * csum items in our csum tree that overlap each other: - * - * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 - * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 - * - * Which is a problem, because after this anyone trying - * to lookup up for the checksum of any block of our - * extent starting at an offset of 40K or higher, will - * end up looking at the second csum item only, which - * does not contain the checksum for any block starting - * at offset 40K or higher of our extent. - */ - while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums; - struct btrfs_root *csum_root; - - sums = list_first_entry(&ordered_sums, - struct btrfs_ordered_sum, - list); - csum_root = btrfs_csum_root(fs_info, - sums->logical); - if (!ret) - ret = btrfs_del_csums(trans, csum_root, - sums->logical, - sums->len); - if (!ret) - ret = btrfs_csum_file_blocks(trans, - csum_root, - sums); - list_del(&sums->list); - kfree(sums); - } - if (ret) - goto out; - } else { - btrfs_release_path(path); + /* + * Manually record dirty extent, as here we did a shallow file extent + * item copy and skip normal backref update, but modifying extent tree + * all by ourselves. So need to manually record dirty extent for qgroup, + * as the owner of the file extent changed from log tree (doesn't affect + * qgroup) to fs/file tree (affects qgroup). + */ + ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu", + ins.objectid, ins.offset, + wc->log_key.objectid, btrfs_root_id(root)); + goto out; + } + + /* + * Is this extent already allocated in the extent tree? + * If so, just add a reference. 
+ */ + ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu", + ins.objectid, ins.offset, + wc->log_key.objectid, btrfs_root_id(root)); + goto out; + } else if (ret == 0) { + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = ins.objectid, + .num_bytes = ins.offset, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + + btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false); + ret = btrfs_inc_extent_ref(trans, &ref); + if (ret) { + btrfs_abort_log_replay(wc, ret, +"failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu", + ins.objectid, ins.offset, + wc->log_key.objectid, + btrfs_root_id(root)); + goto out; } + } else { + /* Insert the extent pointer in the extent tree. */ + ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root), + wc->log_key.objectid, offset, &ins); + if (ret) { + btrfs_abort_log_replay(wc, ret, +"failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu", + ins.objectid, ins.offset, offset, + wc->log_key.objectid, btrfs_root_id(root)); + goto out; + } } + btrfs_release_path(wc->subvol_path); + + if (btrfs_file_extent_compression(wc->log_leaf, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; + } else { + csum_start = ins.objectid + btrfs_file_extent_offset(wc->log_leaf, item); + csum_end = csum_start + btrfs_file_extent_num_bytes(wc->log_leaf, item); + } + + ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, + &ordered_sums, false); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to lookup csums for range [%llu, %llu) inode %llu root %llu", + csum_start, csum_end, wc->log_key.objectid, + btrfs_root_id(root)); + goto out; + } + ret = 0; + /* + * Now delete all existing csums in the csum root that cover our range. + * We do this because we can have an extent that is completely + * referenced by one file extent item and partially referenced by + * another file extent item (like after using the clone or extent_same + * ioctls). In this case if we end up doing the replay of the one that + * partially references the extent first, and we do not do the csum + * deletion below, we can get 2 csum items in the csum tree that overlap + * each other. For example, imagine our log has the two following file + * extent items: + * + * key (257 EXTENT_DATA 409600) + * extent data disk byte 12845056 nr 102400 + * extent data offset 20480 nr 20480 ram 102400 + * + * key (257 EXTENT_DATA 819200) + * extent data disk byte 12845056 nr 102400 + * extent data offset 0 nr 102400 ram 102400 + * + * Where the second one fully references the 100K extent that starts at + * disk byte 12845056, and the log tree has a single csum item that + * covers the entire range of the extent: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * + * After the first file extent item is replayed, the csum tree gets the + * following csum item: + * + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which covers the 20K sub-range starting at offset 20K of our extent. 
+ * Now when we replay the second file extent item, if we do not delete + * existing csum items that cover any of its blocks, we end up getting + * two csum items in our csum tree that overlap each other: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which is a problem, because after this anyone trying to look up + * the checksum of any block of our extent starting at an offset of 40K + * or higher, will end up looking at the second csum item only, which + * does not contain the checksum for any block starting at offset 40K or + * higher of our extent. + */ + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + struct btrfs_root *csum_root; + + sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list); + csum_root = btrfs_csum_root(fs_info, sums->logical); + if (!ret) { + ret = btrfs_del_csums(trans, csum_root, sums->logical, + sums->len); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to delete csums for range [%llu, %llu) inode %llu root %llu", + sums->logical, + sums->logical + sums->len, + wc->log_key.objectid, + btrfs_root_id(root)); + } + if (!ret) { + ret = btrfs_csum_file_blocks(trans, csum_root, sums); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to add csums for range [%llu, %llu) inode %llu root %llu", + sums->logical, + sums->logical + sums->len, + wc->log_key.objectid, + btrfs_root_id(root)); + } + list_del(&sums->list); + kfree(sums); + } if (ret) goto out; update_inode: + ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to set file extent range [%llu, %llu) inode %llu root %llu", + start, extent_end, wc->log_key.objectid, + btrfs_root_id(root)); + goto out; + } + btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); ret = btrfs_update_inode(trans, inode); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + wc->log_key.objectid, btrfs_root_id(root)); out: iput(&inode->vfs_inode); return ret; } -static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, +static int unlink_inode_for_log_replay(struct walk_control *wc, struct btrfs_inode *dir, struct btrfs_inode *inode, const struct fscrypt_str *name) { + struct btrfs_trans_handle *trans = wc->trans; int ret; ret = btrfs_unlink_inode(trans, dir, inode, name); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to unlink inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), name->len, + name->name, btrfs_root_id(inode->root)); return ret; + } /* * Whenever we need to check if a name exists or not, we check the * fs/subvolume tree. So after an unlink we must run delayed items, so * that future checks for a name during log replay see that the name * does not exist anymore. 
*/ - return btrfs_run_delayed_items(trans); + ret = btrfs_run_delayed_items(trans); + if (ret) + btrfs_abort_log_replay(wc, ret, +"failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), name->len, + name->name, btrfs_root_id(inode->root)); + + return ret; } /* @@ -914,39 +1072,44 @@ static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, * This is a helper function to do the unlink of a specific directory * item */ -static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, +static noinline int drop_one_dir_item(struct walk_control *wc, struct btrfs_inode *dir, struct btrfs_dir_item *di) { struct btrfs_root *root = dir->root; struct btrfs_inode *inode; struct fscrypt_str name; - struct extent_buffer *leaf; + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; struct btrfs_key location; int ret; - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &location); ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); - if (ret) - return -ENOMEM; + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for dir %llu root %llu", + btrfs_ino(dir), btrfs_root_id(root)); + return ret; + } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); inode = btrfs_iget_logging(location.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to open inode %llu parent dir %llu name %.*s root %llu", + location.objectid, btrfs_ino(dir), + name.len, name.name, btrfs_root_id(root)); inode = NULL; goto out; } - ret = link_to_fixup_dir(trans, root, path, location.objectid); + ret = link_to_fixup_dir(wc, location.objectid); if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, inode, &name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &name); out: kfree(name.name); if (inode) @@ -1013,7 +1176,7 @@ static noinline int backref_in_log(struct btrfs_root *log, u64 ref_objectid, const struct fscrypt_str *name) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; path = btrfs_alloc_path(); @@ -1021,12 +1184,10 @@ static noinline int backref_in_log(struct btrfs_root *log, return -ENOMEM; ret = btrfs_search_slot(NULL, log, key, path, 0, 0); - if (ret < 0) { - goto out; - } else if (ret == 1) { - ret = 0; - goto out; - } + if (ret < 0) + return ret; + if (ret == 1) + return 0; if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], @@ -1035,20 +1196,15 @@ static noinline int backref_in_log(struct btrfs_root *log, else ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name); -out: - btrfs_free_path(path); return ret; } -static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_root *log_root, +static int unlink_refs_not_in_log(struct walk_control *wc, struct btrfs_key *search_key, struct btrfs_inode *dir, - struct btrfs_inode *inode, - u64 parent_objectid) + struct btrfs_inode *inode) { - struct extent_buffer *leaf = path->nodes[0]; + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; unsigned long ptr; unsigned long ptr_end; @@ -1057,8 +1213,8 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, * log. If so, we allow them to stay otherwise they must be unlinked as * a conflict. 
*/ - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); + ptr_end = ptr + btrfs_item_size(leaf, wc->subvol_path->slots[0]); while (ptr < ptr_end) { struct fscrypt_str victim_name; struct btrfs_inode_ref *victim_ref; @@ -1068,22 +1224,34 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, ret = read_alloc_one_name(leaf, (victim_ref + 1), btrfs_inode_ref_name_len(leaf, victim_ref), &victim_name); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for inode %llu parent dir %llu root %llu", + btrfs_ino(inode), btrfs_ino(dir), + btrfs_root_id(inode->root)); return ret; + } - ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); + ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name); if (ret) { - kfree(victim_name.name); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), + victim_name.len, victim_name.name, + btrfs_root_id(inode->root)); + kfree(victim_name.name); return ret; + } + kfree(victim_name.name); ptr = (unsigned long)(victim_ref + 1) + victim_name.len; continue; } inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); - ret = unlink_inode_for_log_replay(trans, dir, inode, &victim_name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name); kfree(victim_name.name); if (ret) return ret; @@ -1093,64 +1261,64 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, return 0; } -static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_root *root, - struct btrfs_root *log_root, +static int unlink_extrefs_not_in_log(struct walk_control *wc, struct btrfs_key *search_key, - struct btrfs_inode *inode, - u64 inode_objectid, - u64 parent_objectid) + struct btrfs_inode *dir, + struct btrfs_inode *inode) { - struct extent_buffer *leaf = path->nodes[0]; - const unsigned long base = btrfs_item_ptr_offset(leaf, path->slots[0]); - const u32 item_size = btrfs_item_size(leaf, path->slots[0]); + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; + const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); + const u32 item_size = btrfs_item_size(leaf, wc->subvol_path->slots[0]); u32 cur_offset = 0; while (cur_offset < item_size) { + struct btrfs_root *log_root = wc->log; struct btrfs_inode_extref *extref; - struct btrfs_inode *victim_parent; struct fscrypt_str victim_name; int ret; extref = (struct btrfs_inode_extref *)(base + cur_offset); victim_name.len = btrfs_inode_extref_name_len(leaf, extref); - if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) + if (btrfs_inode_extref_parent(leaf, extref) != btrfs_ino(dir)) goto next; ret = read_alloc_one_name(leaf, &extref->name, victim_name.len, &victim_name); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for inode %llu parent dir %llu root %llu", + btrfs_ino(inode), btrfs_ino(dir), + btrfs_root_id(inode->root)); return ret; + } - search_key->objectid = inode_objectid; + search_key->objectid = btrfs_ino(inode); search_key->type = BTRFS_INODE_EXTREF_KEY; - search_key->offset = btrfs_extref_hash(parent_objectid, + search_key->offset = btrfs_extref_hash(btrfs_ino(dir), victim_name.name, victim_name.len); - ret 
= backref_in_log(log_root, search_key, parent_objectid, &victim_name); + ret = backref_in_log(log_root, search_key, btrfs_ino(dir), &victim_name); if (ret) { - kfree(victim_name.name); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), + victim_name.len, victim_name.name, + btrfs_root_id(inode->root)); + kfree(victim_name.name); return ret; + } + kfree(victim_name.name); next: cur_offset += victim_name.len + sizeof(*extref); continue; } - victim_parent = btrfs_iget_logging(parent_objectid, root); - if (IS_ERR(victim_parent)) { - kfree(victim_name.name); - return PTR_ERR(victim_parent); - } - inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); - ret = unlink_inode_for_log_replay(trans, victim_parent, inode, - &victim_name); - iput(&victim_parent->vfs_inode); + ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name); kfree(victim_name.name); if (ret) return ret; @@ -1160,27 +1328,29 @@ next: return 0; } -static inline int __add_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_root *log_root, +static inline int __add_inode_ref(struct walk_control *wc, struct btrfs_inode *dir, struct btrfs_inode *inode, - u64 inode_objectid, u64 parent_objectid, u64 ref_index, struct fscrypt_str *name) { int ret; + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_dir_item *di; struct btrfs_key search_key; struct btrfs_inode_extref *extref; again: /* Search old style refs */ - search_key.objectid = inode_objectid; + search_key.objectid = btrfs_ino(inode); search_key.type = BTRFS_INODE_REF_KEY; - search_key.offset = parent_objectid; - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + search_key.offset = btrfs_ino(dir); + ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search subvolume tree for key (%llu %u %llu) root %llu", + search_key.objectid, search_key.type, + search_key.offset, btrfs_root_id(root)); return ret; } else if (ret == 0) { /* @@ -1190,52 +1360,60 @@ again: if (search_key.objectid == search_key.offset) return 1; - ret = unlink_refs_not_in_log(trans, path, log_root, &search_key, - dir, inode, parent_objectid); + ret = unlink_refs_not_in_log(wc, &search_key, dir, inode); if (ret == -EAGAIN) goto again; else if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* Same search but for extended refs */ - extref = btrfs_lookup_inode_extref(root, path, name, inode_objectid, parent_objectid); + extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name, + btrfs_ino(inode), btrfs_ino(dir)); if (IS_ERR(extref)) { return PTR_ERR(extref); } else if (extref) { - ret = unlink_extrefs_not_in_log(trans, path, root, log_root, - &search_key, inode, - inode_objectid, parent_objectid); + ret = unlink_extrefs_not_in_log(wc, &search_key, dir, inode); if (ret == -EAGAIN) goto again; else if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* look for a conflicting sequence number */ - di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), + di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, btrfs_ino(dir), ref_index, name, 0); if (IS_ERR(di)) { - return PTR_ERR(di); + ret = PTR_ERR(di); + 
btrfs_abort_log_replay(wc, ret, +"failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu", + btrfs_ino(dir), ref_index, name->len, + name->name, btrfs_root_id(root)); + return ret; } else if (di) { - ret = drop_one_dir_item(trans, path, dir, di); + ret = drop_one_dir_item(wc, dir, di); if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* look for a conflicting name */ - di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0); + di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0); if (IS_ERR(di)) { - return PTR_ERR(di); + ret = PTR_ERR(di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir item for dir %llu name %.*s root %llu", + btrfs_ino(dir), name->len, name->name, + btrfs_root_id(root)); + return ret; } else if (di) { - ret = drop_one_dir_item(trans, path, dir, di); + ret = drop_one_dir_item(wc, dir, di); if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } @@ -1288,63 +1466,79 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, * proper unlink of that name (that is, remove its entry from the inode * reference item and both dir index keys). */ -static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_inode *inode, - struct extent_buffer *log_eb, - int log_slot, - struct btrfs_key *key) +static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode) { + struct btrfs_root *root = wc->root; int ret; unsigned long ref_ptr; unsigned long ref_end; struct extent_buffer *eb; again: - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + btrfs_release_path(wc->subvol_path); + ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0); if (ret > 0) { ret = 0; goto out; } - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search subvolume tree for key (%llu %u %llu) root %llu", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset, btrfs_root_id(root)); goto out; + } - eb = path->nodes[0]; - ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); - ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); + eb = wc->subvol_path->nodes[0]; + ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[0]); + ref_end = ref_ptr + btrfs_item_size(eb, wc->subvol_path->slots[0]); while (ref_ptr < ref_end) { struct fscrypt_str name; u64 parent_id; - if (key->type == BTRFS_INODE_EXTREF_KEY) { + if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) { ret = extref_get_fields(eb, ref_ptr, &name, NULL, &parent_id); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to get extref details for inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); + goto out; + } } else { - parent_id = key->offset; + parent_id = wc->log_key.offset; ret = ref_get_fields(eb, ref_ptr, &name, NULL); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to get ref details for inode %llu parent_id %llu root %llu", + btrfs_ino(inode), parent_id, + btrfs_root_id(root)); + goto out; + } } - if (ret) - goto out; - if (key->type == BTRFS_INODE_EXTREF_KEY) - ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, + if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) + ret = !!btrfs_find_name_in_ext_backref(wc->log_leaf, wc->log_slot, parent_id, &name); else - ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); + ret = 
!!btrfs_find_name_in_backref(wc->log_leaf, wc->log_slot, + &name); if (!ret) { struct btrfs_inode *dir; - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); dir = btrfs_iget_logging(parent_id, root); if (IS_ERR(dir)) { ret = PTR_ERR(dir); kfree(name.name); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + parent_id, btrfs_root_id(root)); goto out; } - ret = unlink_inode_for_log_replay(trans, dir, inode, &name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &name); kfree(name.name); iput(&dir->vfs_inode); if (ret) @@ -1354,56 +1548,51 @@ again: kfree(name.name); ref_ptr += name.len; - if (key->type == BTRFS_INODE_EXTREF_KEY) + if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) ref_ptr += sizeof(struct btrfs_inode_extref); else ref_ptr += sizeof(struct btrfs_inode_ref); } ret = 0; out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return ret; } /* - * replay one inode back reference item found in the log tree. - * eb, slot and key refer to the buffer and key found in the log tree. - * root is the destination we are replaying into, and path is for temp - * use by this function. (it should be released on return). + * Replay one inode back reference item found in the log tree. + * Path is for temporary use by this function (it should be released on return). */ -static noinline int add_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static noinline int add_inode_ref(struct walk_control *wc) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_inode *dir = NULL; struct btrfs_inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; struct fscrypt_str name = { 0 }; int ret; - const bool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY); + const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY); u64 parent_objectid; u64 inode_objectid; u64 ref_index = 0; int ref_struct_size; - ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size(eb, slot); + ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot); + ref_end = ref_ptr + btrfs_item_size(wc->log_leaf, wc->log_slot); if (is_extref_item) { struct btrfs_inode_extref *r; ref_struct_size = sizeof(struct btrfs_inode_extref); r = (struct btrfs_inode_extref *)ref_ptr; - parent_objectid = btrfs_inode_extref_parent(eb, r); + parent_objectid = btrfs_inode_extref_parent(wc->log_leaf, r); } else { ref_struct_size = sizeof(struct btrfs_inode_ref); - parent_objectid = key->offset; + parent_objectid = wc->log_key.offset; } - inode_objectid = key->objectid; + inode_objectid = wc->log_key.objectid; /* * it is possible that we didn't log all the parent directories @@ -1416,6 +1605,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir); if (ret == -ENOENT) ret = 0; + else + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + parent_objectid, btrfs_root_id(root)); dir = NULL; goto out; } @@ -1423,16 +1616,24 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, inode = btrfs_iget_logging(inode_objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + inode_objectid, btrfs_root_id(root)); inode = NULL; goto out; } while (ref_ptr < ref_end) { if (is_extref_item) { - ret = extref_get_fields(eb, 
ref_ptr, &name, + ret = extref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index, &parent_objectid); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to get extref details for inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); goto out; + } /* * parent object can change from one array * item to another. @@ -1457,19 +1658,35 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, */ ret = 0; goto next; + } else { + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + parent_objectid, + btrfs_root_id(root)); } goto out; } } } else { - ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); - if (ret) + ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to get ref details for inode %llu parent_objectid %llu root %llu", + btrfs_ino(inode), + parent_objectid, + btrfs_root_id(root)); goto out; + } } - ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), - ref_index, &name); + ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir), + btrfs_ino(inode), ref_index, &name); if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), + ref_index, name.len, name.name, + btrfs_root_id(root)); goto out; } else if (ret == 0) { /* @@ -1479,9 +1696,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - ret = __add_inode_ref(trans, root, path, log, dir, inode, - inode_objectid, parent_objectid, - ref_index, &name); + ret = __add_inode_ref(wc, dir, inode, ref_index, &name); if (ret) { if (ret == 1) ret = 0; @@ -1490,12 +1705,24 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, /* insert our name */ ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, +"failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu", + btrfs_ino(inode), + btrfs_ino(dir), ref_index, + name.len, name.name, + btrfs_root_id(root)); goto out; + } ret = btrfs_update_inode(trans, inode); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); goto out; + } } /* Else, ret == 1, we already have a perfect match, we're done. */ @@ -1517,14 +1744,14 @@ next: * dir index entries exist for a name but there is no inode reference * item with the same name. */ - ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); + ret = unlink_old_inode_refs(wc, inode); if (ret) goto out; /* finally write the back reference in the inode */ - ret = overwrite_item(trans, root, path, eb, slot, key); + ret = overwrite_item(wc); out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); kfree(name.name); if (dir) iput(&dir->vfs_inode); @@ -1642,26 +1869,22 @@ process_slot: * number of back refs found. If it goes down to zero, the iput * will free the inode. 
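 * A worked example (illustrative only): if the log added names "a" and
 * "b" for an inode but conflict resolution unlinked "b" during replay,
 * only one reference remains in the subvolume tree, so nlink is
 * corrected from 2 to 1 here; if no references remain at all, the inode
 * is treated as an orphan so that it is deleted rather than leaked.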
*/ -static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, +static noinline int fixup_inode_link_count(struct walk_control *wc, struct btrfs_inode *inode) { + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = inode->root; - struct btrfs_path *path; int ret; u64 nlink = 0; const u64 ino = btrfs_ino(inode); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = count_inode_refs(inode, path); + ret = count_inode_refs(inode, wc->subvol_path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(inode, path); + ret = count_inode_extrefs(inode, wc->subvol_path); if (ret < 0) goto out; @@ -1680,7 +1903,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (inode->vfs_inode.i_nlink == 0) { if (S_ISDIR(inode->vfs_inode.i_mode)) { - ret = replay_dir_deletes(trans, root, NULL, path, ino, true); + ret = replay_dir_deletes(wc, ino, true); if (ret) goto out; } @@ -1690,13 +1913,11 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, } out: - btrfs_free_path(path); + btrfs_release_path(wc->subvol_path); return ret; } -static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path) +static noinline int fixup_inode_link_counts(struct walk_control *wc) { int ret; struct btrfs_key key; @@ -1705,48 +1926,50 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_inode *inode; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + ret = btrfs_search_slot(trans, root, &key, wc->subvol_path, -1, 1); if (ret < 0) break; if (ret == 1) { ret = 0; - if (path->slots[0] == 0) + if (wc->subvol_path->slots[0] == 0) break; - path->slots[0]--; + wc->subvol_path->slots[0]--; } - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, wc->subvol_path->slots[0]); if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || key.type != BTRFS_ORPHAN_ITEM_KEY) break; - ret = btrfs_del_item(trans, root, path); + ret = btrfs_del_item(trans, root, wc->subvol_path); if (ret) break; - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); inode = btrfs_iget_logging(key.offset, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); break; } - ret = fixup_inode_link_count(trans, inode); + ret = fixup_inode_link_count(wc, inode); iput(&inode->vfs_inode); if (ret) break; /* * fixup on a directory may create new entries, - * make sure we always look for the highset possible + * make sure we always look for the highest possible * offset */ key.offset = (u64)-1; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return ret; } @@ -1756,36 +1979,48 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, * count when replay is done. 
The link count is incremented here * so the inode won't go away until we check it */ -static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 objectid) +static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_key key; int ret = 0; struct btrfs_inode *inode; struct inode *vfs_inode; inode = btrfs_iget_logging(objectid, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + objectid, btrfs_root_id(root)); + return ret; + } vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &key, 0); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); if (ret == 0) { if (!vfs_inode->i_nlink) set_nlink(vfs_inode, 1); else inc_nlink(vfs_inode); ret = btrfs_update_inode(trans, inode); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + objectid, btrfs_root_id(root)); } else if (ret == -EEXIST) { ret = 0; + } else { + btrfs_abort_log_replay(wc, ret, + "failed to insert fixup item for inode %llu root %llu", + objectid, btrfs_root_id(root)); } iput(vfs_inode); @@ -1826,9 +2061,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, return ret; } -static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, +static int delete_conflicting_dir_entry(struct walk_control *wc, struct btrfs_inode *dir, - struct btrfs_path *path, struct btrfs_dir_item *dst_di, const struct btrfs_key *log_key, u8 log_flags, @@ -1836,12 +2070,12 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, { struct btrfs_key found_key; - btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + btrfs_dir_item_key_to_cpu(wc->subvol_path->nodes[0], dst_di, &found_key); /* The existing dentry points to the same inode, don't delete it. */ if (found_key.objectid == log_key->objectid && found_key.type == log_key->type && found_key.offset == log_key->offset && - btrfs_dir_flags(path->nodes[0], dst_di) == log_flags) + btrfs_dir_flags(wc->subvol_path->nodes[0], dst_di) == log_flags) return 1; /* @@ -1851,7 +2085,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, if (!exists) return 0; - return drop_one_dir_item(trans, path, dir, dst_di); + return drop_one_dir_item(wc, dir, dst_di); } /* @@ -1870,13 +2104,10 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a * non-existing inode) and 1 if the name was replayed. 
*/ -static noinline int replay_one_name(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, - struct btrfs_dir_item *di, - struct btrfs_key *key) +static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_item *di) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct fscrypt_str name = { 0 }; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; @@ -1891,53 +2122,85 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool update_size = true; bool name_added = false; - dir = btrfs_iget_logging(key->objectid, root); - if (IS_ERR(dir)) - return PTR_ERR(dir); + dir = btrfs_iget_logging(wc->log_key.objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + wc->log_key.objectid, btrfs_root_id(root)); + return ret; + } - ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); - if (ret) + ret = read_alloc_one_name(wc->log_leaf, di + 1, + btrfs_dir_name_len(wc->log_leaf, di), &name); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for dir %llu root %llu", + btrfs_ino(dir), btrfs_root_id(root)); goto out; + } - log_flags = btrfs_dir_flags(eb, di); - btrfs_dir_item_key_to_cpu(eb, di, &log_key); - ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); - btrfs_release_path(path); - if (ret < 0) + log_flags = btrfs_dir_flags(wc->log_leaf, di); + btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key); + ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0); + btrfs_release_path(wc->subvol_path); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + log_key.objectid, btrfs_root_id(root)); goto out; + } exists = (ret == 0); ret = 0; - dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, - &name, 1); + dir_dst_di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, + wc->log_key.objectid, &name, 1); if (IS_ERR(dir_dst_di)) { ret = PTR_ERR(dir_dst_di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir item for dir %llu name %.*s root %llu", + wc->log_key.objectid, name.len, name.name, + btrfs_root_id(root)); goto out; } else if (dir_dst_di) { - ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di, + ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di, &log_key, log_flags, exists); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to delete conflicting entry for dir %llu name %.*s root %llu", + btrfs_ino(dir), name.len, name.name, + btrfs_root_id(root)); goto out; + } dir_dst_matches = (ret == 1); } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); - index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, - key->objectid, key->offset, - &name, 1); + index_dst_di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, + wc->log_key.objectid, + wc->log_key.offset, &name, 1); if (IS_ERR(index_dst_di)) { ret = PTR_ERR(index_dst_di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir index item for dir %llu name %.*s root %llu", + wc->log_key.objectid, name.len, name.name, + btrfs_root_id(root)); goto out; } else if (index_dst_di) { - ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di, + ret = delete_conflicting_dir_entry(wc, dir, index_dst_di, &log_key, log_flags, exists); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to 
delete conflicting entry for dir %llu name %.*s root %llu", + btrfs_ino(dir), name.len, name.name, + btrfs_root_id(root)); goto out; + } index_dst_matches = (ret == 1); } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); if (dir_dst_matches && index_dst_matches) { ret = 0; @@ -1951,9 +2214,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, */ search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_REF_KEY; - search_key.offset = key->objectid; + search_key.offset = wc->log_key.objectid; ret = backref_in_log(root->log_root, &search_key, 0, &name); if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu", + search_key.objectid, btrfs_ino(dir), + name.len, name.name, btrfs_root_id(root)); goto out; } else if (ret) { /* The dentry will be added later. */ @@ -1964,9 +2231,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; - search_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); + search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len); + ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name); if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu", + search_key.objectid, btrfs_ino(dir), + name.len, name.name, btrfs_root_id(root)); goto out; } else if (ret) { /* The dentry will be added later. */ @@ -1974,11 +2245,15 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, update_size = false; goto out; } - btrfs_release_path(path); - ret = insert_one_name(trans, root, key->objectid, key->offset, + ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset, &name, &log_key); - if (ret && ret != -ENOENT && ret != -EEXIST) + if (ret && ret != -ENOENT && ret != -EEXIST) { + btrfs_abort_log_replay(wc, ret, + "failed to insert name %.*s for inode %llu dir %llu root %llu", + name.len, name.name, log_key.objectid, + btrfs_ino(dir), btrfs_root_id(root)); goto out; + } if (!ret) name_added = true; update_size = false; @@ -1988,6 +2263,10 @@ out: if (!ret && update_size) { btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2); ret = btrfs_update_inode(trans, dir); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to update dir inode %llu root %llu", + btrfs_ino(dir), btrfs_root_id(root)); } kfree(name.name); iput(&dir->vfs_inode); @@ -1997,20 +2276,16 @@ out: } /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ -static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static noinline int replay_one_dir_item(struct walk_control *wc) { int ret; struct btrfs_dir_item *di; /* We only log dir index keys, which only contain a single dir item. 
*/ - ASSERT(key->type == BTRFS_DIR_INDEX_KEY); + ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY); - di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); - ret = replay_one_name(trans, root, path, eb, di, key); + di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item); + ret = replay_one_name(wc, di); if (ret < 0) return ret; @@ -2040,17 +2315,11 @@ * to ever delete the parent directory as it would result in stale * dentries that can never be deleted. */ - if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) { - struct btrfs_path *fixup_path; + if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) { struct btrfs_key di_key; - fixup_path = btrfs_alloc_path(); - if (!fixup_path) - return -ENOMEM; - - btrfs_dir_item_key_to_cpu(eb, di, &di_key); - ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); - btrfs_free_path(fixup_path); + btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key); + ret = link_to_fixup_dir(wc, di_key.objectid); } return ret; @@ -2143,13 +2412,13 @@ out: * item is not in the log, the item is removed and the inode it points * to is unlinked */ -static noinline int check_item_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *log, - struct btrfs_path *path, +static noinline int check_item_in_log(struct walk_control *wc, struct btrfs_path *log_path, struct btrfs_inode *dir, - struct btrfs_key *dir_key) + struct btrfs_key *dir_key, + bool force_remove) { + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = dir->root; int ret; struct extent_buffer *eb; @@ -2167,21 +2436,31 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, */ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); - eb = path->nodes[0]; - slot = path->slots[0]; + eb = wc->subvol_path->nodes[0]; + slot = wc->subvol_path->slots[0]; di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for dir %llu index %llu root %llu", + btrfs_ino(dir), dir_key->offset, + btrfs_root_id(root)); goto out; + } - if (log) { + if (!force_remove) { struct btrfs_dir_item *log_di; - log_di = btrfs_lookup_dir_index_item(trans, log, log_path, + log_di = btrfs_lookup_dir_index_item(trans, wc->log, log_path, dir_key->objectid, dir_key->offset, &name, 0); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir index item for dir %llu index %llu name %.*s root %llu", + btrfs_ino(dir), dir_key->offset, + name.len, name.name, + btrfs_root_id(root)); goto out; } else if (log_di) { /* The dentry exists in the log, we have nothing to do.
*/ @@ -2191,28 +2470,31 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, } btrfs_dir_item_key_to_cpu(eb, di, &location); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); btrfs_release_path(log_path); inode = btrfs_iget_logging(location.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + location.objectid, btrfs_root_id(root)); goto out; } - ret = link_to_fixup_dir(trans, root, path, location.objectid); + ret = link_to_fixup_dir(wc, location.objectid); if (ret) goto out; inc_nlink(&inode->vfs_inode); - ret = unlink_inode_for_log_replay(trans, dir, inode, &name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset * (an index number), so we're done. */ out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); btrfs_release_path(log_path); kfree(name.name); if (inode) @@ -2220,59 +2502,67 @@ out: return ret; } -static int replay_xattr_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, - const u64 ino) +static int replay_xattr_deletes(struct walk_control *wc) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; + struct btrfs_root *log = wc->log; struct btrfs_key search_key; - struct btrfs_path *log_path; - int i; + BTRFS_PATH_AUTO_FREE(log_path); + const u64 ino = wc->log_key.objectid; int nritems; int ret; log_path = btrfs_alloc_path(); - if (!log_path) + if (!log_path) { + btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path"); return -ENOMEM; + } search_key.objectid = ino; search_key.type = BTRFS_XATTR_ITEM_KEY; search_key.offset = 0; again: - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret < 0) + ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search xattrs for inode %llu root %llu", + ino, btrfs_root_id(root)); goto out; + } process_leaf: - nritems = btrfs_header_nritems(path->nodes[0]); - for (i = path->slots[0]; i < nritems; i++) { + nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]); + for (int i = wc->subvol_path->slots[0]; i < nritems; i++) { struct btrfs_key key; struct btrfs_dir_item *di; struct btrfs_dir_item *log_di; u32 total_size; u32 cur; - btrfs_item_key_to_cpu(path->nodes[0], &key, i); + btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, i); if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { ret = 0; goto out; } - di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); - total_size = btrfs_item_size(path->nodes[0], i); + di = btrfs_item_ptr(wc->subvol_path->nodes[0], i, struct btrfs_dir_item); + total_size = btrfs_item_size(wc->subvol_path->nodes[0], i); cur = 0; while (cur < total_size) { - u16 name_len = btrfs_dir_name_len(path->nodes[0], di); - u16 data_len = btrfs_dir_data_len(path->nodes[0], di); + u16 name_len = btrfs_dir_name_len(wc->subvol_path->nodes[0], di); + u16 data_len = btrfs_dir_data_len(wc->subvol_path->nodes[0], di); u32 this_len = sizeof(*di) + name_len + data_len; char *name; name = kmalloc(name_len, GFP_NOFS); if (!name) { ret = -ENOMEM; + btrfs_abort_log_replay(wc, ret, + "failed to allocate memory for name of length %u", + name_len); goto out; } - read_extent_buffer(path->nodes[0], name, 
+ read_extent_buffer(wc->subvol_path->nodes[0], name, (unsigned long)(di + 1), name_len); log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, @@ -2280,40 +2570,59 @@ process_leaf: btrfs_release_path(log_path); if (!log_di) { /* Doesn't exist in log tree, so delete it. */ - btrfs_release_path(path); - di = btrfs_lookup_xattr(trans, root, path, ino, + btrfs_release_path(wc->subvol_path); + di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino, name, name_len, -1); - kfree(name); if (IS_ERR(di)) { ret = PTR_ERR(di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup xattr with name %.*s for inode %llu root %llu", + name_len, name, ino, + btrfs_root_id(root)); + kfree(name); goto out; } ASSERT(di); ret = btrfs_delete_one_dir_name(trans, root, - path, di); - if (ret) + wc->subvol_path, di); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to delete xattr with name %.*s for inode %llu root %llu", + name_len, name, ino, + btrfs_root_id(root)); + kfree(name); goto out; - btrfs_release_path(path); + } + btrfs_release_path(wc->subvol_path); + kfree(name); search_key = key; goto again; } - kfree(name); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup xattr in log tree with name %.*s for inode %llu root %llu", + name_len, name, ino, + btrfs_root_id(root)); + kfree(name); goto out; } + kfree(name); cur += this_len; di = (struct btrfs_dir_item *)((char *)di + this_len); } } - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_leaf(root, wc->subvol_path); if (ret > 0) ret = 0; else if (ret == 0) goto process_leaf; + else + btrfs_abort_log_replay(wc, ret, + "failed to get next leaf in subvolume root %llu", + btrfs_root_id(root)); out: - btrfs_free_path(log_path); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return ret; } @@ -2328,12 +2637,11 @@ out: * Anything we don't find in the log is unlinked and removed from the * directory. */ -static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, +static noinline int replay_dir_deletes(struct walk_control *wc, u64 dirid, bool del_all) { + struct btrfs_root *root = wc->root; + struct btrfs_root *log = (del_all ? 
NULL : wc->log); u64 range_start; u64 range_end; int ret = 0; @@ -2345,8 +2653,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, dir_key.objectid = dirid; dir_key.type = BTRFS_DIR_INDEX_KEY; log_path = btrfs_alloc_path(); - if (!log_path) + if (!log_path) { + btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path"); return -ENOMEM; + } dir = btrfs_iget_logging(dirid, root); /* @@ -2358,6 +2668,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir); if (ret == -ENOENT) ret = 0; + else + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + dirid, btrfs_root_id(root)); return ret; } @@ -2367,32 +2681,46 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, if (del_all) range_end = (u64)-1; else { - ret = find_dir_range(log, path, dirid, + ret = find_dir_range(log, wc->subvol_path, dirid, &range_start, &range_end); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to find range for dir %llu in log tree root %llu", + dirid, btrfs_root_id(root)); goto out; - else if (ret > 0) + } else if (ret > 0) { break; + } } dir_key.offset = range_start; while (1) { int nritems; - ret = btrfs_search_slot(NULL, root, &dir_key, path, - 0, 0); - if (ret < 0) + ret = btrfs_search_slot(NULL, root, &dir_key, + wc->subvol_path, 0, 0); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search root %llu for key (%llu %u %llu)", + btrfs_root_id(root), + dir_key.objectid, dir_key.type, + dir_key.offset); goto out; + } - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret == 1) + nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]); + if (wc->subvol_path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, wc->subvol_path); + if (ret == 1) { break; - else if (ret < 0) + } else if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to get next leaf in subvolume root %llu", + btrfs_root_id(root)); goto out; + } } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); + btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &found_key, + wc->subvol_path->slots[0]); if (found_key.objectid != dirid || found_key.type != dir_key.type) { ret = 0; @@ -2402,23 +2730,21 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, if (found_key.offset > range_end) break; - ret = check_item_in_log(trans, log, path, - log_path, dir, - &found_key); + ret = check_item_in_log(wc, log_path, dir, &found_key, del_all); if (ret) goto out; if (found_key.offset == (u64)-1) break; dir_key.offset = found_key.offset + 1; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); if (range_end == (u64)-1) break; range_start = range_end + 1; } ret = 0; out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); btrfs_free_path(log_path); iput(&dir->vfs_inode); return ret; @@ -2435,7 +2761,7 @@ out: * only in the log (references come from either directory items or inode * back refs). 
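 * (Illustrative note, not introduced by this patch: replay is staged
 * via wc->stage, so inode items are recreated first and directory
 * entries from dir index keys are only linked in a later pass, once
 * every inode they could point at already exists in the subvolume
 * tree.)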
*/ -static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, +static int replay_one_buffer(struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { int nritems; @@ -2443,33 +2769,44 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, .transid = gen, .level = level }; - struct btrfs_path *path; - struct btrfs_root *root = wc->replay_dest; - struct btrfs_key key; - int i; + struct btrfs_root *root = wc->root; + struct btrfs_trans_handle *trans = wc->trans; int ret; - ret = btrfs_read_extent_buffer(eb, &check); - if (ret) - return ret; - - level = btrfs_header_level(eb); - if (level != 0) return 0; - path = btrfs_alloc_path(); - if (!path) + /* + * Set to NULL since it was not yet read and in case we abort log replay + * on error, we have no valid log tree leaf to dump. + */ + wc->log_leaf = NULL; + ret = btrfs_read_extent_buffer(eb, &check); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to read log tree leaf %llu for root %llu", + eb->start, btrfs_root_id(root)); + return ret; + } + + ASSERT(wc->subvol_path == NULL); + wc->subvol_path = btrfs_alloc_path(); + if (!wc->subvol_path) { + btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path"); return -ENOMEM; + } + + wc->log_leaf = eb; nritems = btrfs_header_nritems(eb); - for (i = 0; i < nritems; i++) { + for (wc->log_slot = 0; wc->log_slot < nritems; wc->log_slot++) { struct btrfs_inode_item *inode_item; - btrfs_item_key_to_cpu(eb, &key, i); + btrfs_item_key_to_cpu(eb, &wc->log_key, wc->log_slot); - if (key.type == BTRFS_INODE_ITEM_KEY) { - inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(eb, wc->log_slot, + struct btrfs_inode_item); /* * An inode with no links is either: * @@ -2498,22 +2835,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, } /* Inode keys are done during the first stage. 
*/ - if (key.type == BTRFS_INODE_ITEM_KEY && + if (wc->log_key.type == BTRFS_INODE_ITEM_KEY && wc->stage == LOG_WALK_REPLAY_INODES) { u32 mode; - ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid); + ret = replay_xattr_deletes(wc); if (ret) break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { - ret = replay_dir_deletes(wc->trans, root, log, path, - key.objectid, false); + ret = replay_dir_deletes(wc, wc->log_key.objectid, false); if (ret) break; } - ret = overwrite_item(wc->trans, root, path, - eb, i, &key); + ret = overwrite_item(wc); if (ret) break; @@ -2530,9 +2865,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct btrfs_inode *inode; u64 from; - inode = btrfs_iget_logging(key.objectid, root); + inode = btrfs_iget_logging(wc->log_key.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + wc->log_key.objectid, + btrfs_root_id(root)); break; } from = ALIGN(i_size_read(&inode->vfs_inode), @@ -2540,21 +2879,31 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; - ret = btrfs_drop_extents(wc->trans, root, inode, - &drop_args); - if (!ret) { + drop_args.path = wc->subvol_path; + ret = btrfs_drop_extents(trans, root, inode, &drop_args); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to drop extents for inode %llu root %llu offset %llu", + btrfs_ino(inode), + btrfs_root_id(root), + from); + } else { inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. */ - ret = btrfs_update_inode(wc->trans, inode); + ret = btrfs_update_inode(trans, inode); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); } iput(&inode->vfs_inode); if (ret) break; } - ret = link_to_fixup_dir(wc->trans, root, - path, key.objectid); + ret = link_to_fixup_dir(wc, wc->log_key.objectid); if (ret) break; } @@ -2562,10 +2911,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (wc->ignore_cur_inode) continue; - if (key.type == BTRFS_DIR_INDEX_KEY && + if (wc->log_key.type == BTRFS_DIR_INDEX_KEY && wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { - ret = replay_one_dir_item(wc->trans, root, path, - eb, i, &key); + ret = replay_one_dir_item(wc); if (ret) break; } @@ -2574,20 +2922,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, continue; /* these keys are simply copied */ - if (key.type == BTRFS_XATTR_ITEM_KEY) { - ret = overwrite_item(wc->trans, root, path, - eb, i, &key); + if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) { + ret = overwrite_item(wc); if (ret) break; - } else if (key.type == BTRFS_INODE_REF_KEY || - key.type == BTRFS_INODE_EXTREF_KEY) { - ret = add_inode_ref(wc->trans, root, log, path, - eb, i, &key); + } else if (wc->log_key.type == BTRFS_INODE_REF_KEY || + wc->log_key.type == BTRFS_INODE_EXTREF_KEY) { + ret = add_inode_ref(wc); if (ret) break; - } else if (key.type == BTRFS_EXTENT_DATA_KEY) { - ret = replay_one_extent(wc->trans, root, path, - eb, i, &key); + } else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc); if (ret) break; } @@ -2598,55 +2943,55 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, * older kernel with such keys, ignore them. 
*/ } - btrfs_free_path(path); + btrfs_free_path(wc->subvol_path); + wc->subvol_path = NULL; return ret; } -/* - * Correctly adjust the reserved bytes occupied by a log tree extent buffer - */ -static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) -{ - struct btrfs_block_group *cache; - - cache = btrfs_lookup_block_group(fs_info, start); - if (!cache) { - btrfs_err(fs_info, "unable to find block group for %llu", start); - return -ENOENT; - } - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->reserved -= fs_info->nodesize; - cache->space_info->bytes_reserved -= fs_info->nodesize; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - btrfs_put_block_group(cache); - - return 0; -} - static int clean_log_buffer(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_block_group *bg; + btrfs_tree_lock(eb); btrfs_clear_buffer_dirty(trans, eb); wait_on_extent_buffer_writeback(eb); btrfs_tree_unlock(eb); - if (trans) - return btrfs_pin_reserved_extent(trans, eb); + if (trans) { + int ret; - return unaccount_log_buffer(eb->fs_info, eb->start); + ret = btrfs_pin_reserved_extent(trans, eb); + if (ret) + btrfs_abort_transaction(trans, ret); + return ret; + } + + bg = btrfs_lookup_block_group(fs_info, eb->start); + if (!bg) { + btrfs_err(fs_info, "unable to find block group for %llu", eb->start); + btrfs_handle_fs_error(fs_info, -ENOENT, NULL); + return -ENOENT; + } + + spin_lock(&bg->space_info->lock); + spin_lock(&bg->lock); + bg->reserved -= fs_info->nodesize; + bg->space_info->bytes_reserved -= fs_info->nodesize; + spin_unlock(&bg->lock); + spin_unlock(&bg->space_info->lock); + + btrfs_put_block_group(bg); + + return 0; } -static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level, - struct walk_control *wc) +static noinline int walk_down_log_tree(struct btrfs_path *path, int *level, + struct walk_control *wc) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_fs_info *fs_info = wc->log->fs_info; u64 bytenr; u64 ptr_gen; struct extent_buffer *next; @@ -2674,12 +3019,17 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_header_owner(cur), *level - 1); - if (IS_ERR(next)) - return PTR_ERR(next); + if (IS_ERR(next)) { + ret = PTR_ERR(next); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); + return ret; + } if (*level == 1) { - ret = wc->process_func(root, next, wc, ptr_gen, - *level - 1); + ret = wc->process_func(next, wc, ptr_gen, *level - 1); if (ret) { free_extent_buffer(next); return ret; @@ -2690,6 +3040,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); return ret; } @@ -2705,6 +3059,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); return ret; } @@ -2721,10 +3079,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return 0; } -static 
noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level, - struct walk_control *wc) +static noinline int walk_up_log_tree(struct btrfs_path *path, int *level, + struct walk_control *wc) { int i; int slot; @@ -2738,14 +3094,14 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level == 0); return 0; } else { - ret = wc->process_func(root, path->nodes[*level], wc, + ret = wc->process_func(path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level]), *level); if (ret) return ret; if (wc->free) { - ret = clean_log_buffer(trans, path->nodes[*level]); + ret = clean_log_buffer(wc->trans, path->nodes[*level]); if (ret) return ret; } @@ -2762,13 +3118,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, * the tree freeing any blocks that have a ref count of zero after being * decremented. */ -static int walk_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *log, struct walk_control *wc) +static int walk_log_tree(struct walk_control *wc) { + struct btrfs_root *log = wc->log; int ret = 0; int wret; int level; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int orig_level; path = btrfs_alloc_path(); @@ -2782,36 +3138,30 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, path->slots[level] = 0; while (1) { - wret = walk_down_log_tree(trans, log, path, &level, wc); + wret = walk_down_log_tree(path, &level, wc); if (wret > 0) break; - if (wret < 0) { - ret = wret; - goto out; - } + if (wret < 0) + return wret; - wret = walk_up_log_tree(trans, log, path, &level, wc); + wret = walk_up_log_tree(path, &level, wc); if (wret > 0) break; - if (wret < 0) { - ret = wret; - goto out; - } + if (wret < 0) + return wret; } /* was the root node processed? 
if not, catch it here */ if (path->nodes[orig_level]) { - ret = wc->process_func(log, path->nodes[orig_level], wc, + ret = wc->process_func(path->nodes[orig_level], wc, btrfs_header_generation(path->nodes[orig_level]), orig_level); if (ret) - goto out; + return ret; if (wc->free) - ret = clean_log_buffer(trans, path->nodes[orig_level]); + ret = clean_log_buffer(wc->trans, path->nodes[orig_level]); } -out: - btrfs_free_path(path); return ret; } @@ -3220,7 +3570,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); ret = write_all_supers(fs_info, 1); mutex_unlock(&fs_info->tree_log_mutex); - if (ret) { + if (unlikely(ret)) { btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); goto out_wake_log_root; @@ -3272,12 +3622,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans, { int ret; struct walk_control wc = { - .free = 1, - .process_func = process_one_buffer + .free = true, + .process_func = process_one_buffer, + .log = log, + .trans = trans, }; if (log->node) { - ret = walk_log_tree(trans, log, &wc); + ret = walk_log_tree(&wc); if (ret) { /* * We weren't able to traverse the entire log tree, the @@ -3340,6 +3692,31 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, return 0; } +static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + bool ret = false; + + /* + * Do this only if ->logged_trans is still 0 to prevent races with + * concurrent logging as we may see the inode not logged when + * inode_logged() is called but it gets logged after inode_logged() did + * not find it in the log tree and we end up setting ->logged_trans to a + * value less than trans->transid after the concurrent logging task has + * set it to trans->transid. As a consequence, subsequent rename, unlink + * and link operations may end up not logging new names and removing old + * names from the log. + */ + spin_lock(&inode->lock); + if (inode->logged_trans == 0) + inode->logged_trans = trans->transid - 1; + else if (inode->logged_trans == trans->transid) + ret = true; + spin_unlock(&inode->lock); + + return ret; +} + /* * Check if an inode was logged in the current transaction. This correctly deals * with the case where the inode was logged but has a logged_trans of 0, which * happens @@ -3357,15 +3734,32 @@ static int inode_logged(const struct btrfs_trans_handle *trans, struct btrfs_key key; int ret; - if (inode->logged_trans == trans->transid) + /* + * Quick lockless call, since once ->logged_trans is set to the current + * transaction, we never set it to a lower value anywhere else. + */ + if (data_race(inode->logged_trans) == trans->transid) return 1; /* - * If logged_trans is not 0, then we know the inode logged was not logged - * in this transaction, so we can return false right away. + * If logged_trans is not 0 and not trans->transid, then we know the + * inode was not logged in this transaction, so we can return false + * right away. We take the lock to avoid a race caused by load/store + * tearing with a concurrent btrfs_log_inode() call or a concurrent task + * in this function further below - an update to trans->transid can be + * torn into two 32-bit updates, for example, in which case we could + * see a positive value that is not trans->transid and assume the inode + * was not logged when it was.
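+ * A hypothetical interleaving this guards against (illustrative only):
+ * on a 32-bit machine the store of ->logged_trans done by a concurrent
+ * logging task may be committed as two 32-bit halves, so a lockless
+ * reader could observe a half-written value that is neither 0 nor
+ * trans->transid and wrongly conclude the inode was never logged in
+ * this transaction. Re-checking under inode->lock below rules that out.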
*/ - if (inode->logged_trans > 0) + spin_lock(&inode->lock); + if (inode->logged_trans == trans->transid) { + spin_unlock(&inode->lock); + return 1; + } else if (inode->logged_trans > 0) { + spin_unlock(&inode->lock); return 0; + } + spin_unlock(&inode->lock); /* * If no log tree was created for this root in this transaction, then @@ -3374,10 +3768,8 @@ static int inode_logged(const struct btrfs_trans_handle *trans, * transaction's ID, to avoid the search below in a future call in case * a log tree gets created after this. */ - if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { - inode->logged_trans = trans->transid - 1; - return 0; - } + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) + return mark_inode_as_not_logged(trans, inode); /* * We have a log tree and the inode's logged_trans is 0. We can't tell @@ -3431,29 +3823,17 @@ static int inode_logged(const struct btrfs_trans_handle *trans, * Set logged_trans to a value greater than 0 and less then the * current transaction to avoid doing the search in future calls. */ - inode->logged_trans = trans->transid - 1; - return 0; + return mark_inode_as_not_logged(trans, inode); } /* * The inode was previously logged and then evicted, set logged_trans to - * the current transacion's ID, to avoid future tree searches as long as + * the current transaction's ID, to avoid future tree searches as long as * the inode is not evicted again. */ + spin_lock(&inode->lock); inode->logged_trans = trans->transid; - - /* - * If it's a directory, then we must set last_dir_index_offset to the - * maximum possible value, so that the next attempt to log the inode does - * not skip checking if dir index keys found in modified subvolume tree - * leaves have been logged before, otherwise it would result in attempts - * to insert duplicate dir index keys in the log tree. This must be done - * because last_dir_index_offset is an in-memory only field, not persisted - * in the inode item or any other on-disk structure, so its value is lost - * once the inode is evicted. 
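The lockless fast path plus locked recheck used by inode_logged() above is a common kernel idiom. A minimal user-space sketch of the same pattern, assuming a simplified inode (all demo_* names are hypothetical; the relaxed atomic load stands in for the kernel's data_race()-annotated read):

	#include <stdbool.h>
	#include <stdint.h>
	#include <pthread.h>

	struct demo_inode {
		pthread_spinlock_t lock;	/* init with pthread_spin_init() */
		uint64_t logged_trans;		/* 0 = unknown, else last logged transid */
	};

	static bool demo_inode_logged(struct demo_inode *inode, uint64_t trans_id)
	{
		bool logged;

		/* Lockless fast path: logged_trans never moves to a lower value. */
		if (__atomic_load_n(&inode->logged_trans, __ATOMIC_RELAXED) == trans_id)
			return true;

		/* Slow path: take the lock to rule out load/store tearing. */
		pthread_spin_lock(&inode->lock);
		logged = (inode->logged_trans == trans_id);
		pthread_spin_unlock(&inode->lock);
		return logged;
	}
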
- */ - if (S_ISDIR(inode->vfs_inode.i_mode)) - inode->last_dir_index_offset = (u64)-1; + spin_unlock(&inode->lock); return 1; } @@ -3519,13 +3899,13 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; ret = inode_logged(trans, dir, NULL); if (ret == 0) return; - else if (ret < 0) { + if (ret < 0) { btrfs_set_log_full_commit(trans); return; } @@ -3539,7 +3919,7 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, ret = join_running_log_trans(root); ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret); if (WARN_ON(ret)) - goto out; + return; mutex_lock(&dir->log_mutex); @@ -3549,8 +3929,6 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, if (ret < 0) btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); -out: - btrfs_free_path(path); } /* see comments for btrfs_del_dir_entries_in_log */ @@ -3663,8 +4041,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, struct btrfs_key *ins_keys; u32 *ins_sizes; - ins_data = kmalloc(count * sizeof(u32) + - count * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc_array(count, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) return -ENOMEM; @@ -4045,7 +4422,7 @@ done: /* * If the inode was logged before and it was evicted, then its - * last_dir_index_offset is (u64)-1, so we don't the value of the last index + * last_dir_index_offset is 0, so we don't know the value of the last index * key offset. If that's the case, search for it and update the inode. This * is to avoid lookups in the log tree every time we try to insert a dir index * key from a leaf changed in the current transaction, and to allow us to always @@ -4061,7 +4438,7 @@ static int update_last_dir_index_offset(struct btrfs_inode *inode, lockdep_assert_held(&inode->log_mutex); - if (inode->last_dir_index_offset != (u64)-1) + if (inode->last_dir_index_offset != 0) return 0; if (!ctx->logged_before) { @@ -4227,7 +4604,7 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans, static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, - struct inode *inode, int log_inode_only, + struct inode *inode, bool log_inode_only, u64 logged_isize) { u64 flags; @@ -4323,7 +4700,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, - 0, 0); + false, 0); btrfs_release_path(path); return 0; } @@ -4427,8 +4804,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src = src_path->nodes[0]; - ins_data = kmalloc(nr * sizeof(struct btrfs_key) + - nr * sizeof(u32), GFP_NOFS); + ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32), GFP_NOFS); if (!ins_data) return -ENOMEM; @@ -4829,7 +5205,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct btrfs_key key; const u64 i_size = i_size_read(&inode->vfs_inode); const u64 ino = btrfs_ino(inode); - struct btrfs_path *dst_path = NULL; + BTRFS_PATH_AUTO_FREE(dst_path); bool dropped_extents = false; u64 truncate_offset = i_size; struct extent_buffer *leaf; @@ -4947,7 +5323,6 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, start_slot, ins_nr, 1, 0, ctx); out: btrfs_release_path(path); - 
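The BTRFS_PATH_AUTO_FREE() conversions above remove the out:/btrfs_free_path() unwind labels. A rough sketch of how such a macro can be built on the kernel's scope-based cleanup helpers (include/linux/cleanup.h); the exact btrfs definition may differ in detail:

	DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))
	#define BTRFS_PATH_AUTO_FREE(path_name) \
		struct btrfs_path *path_name __free(btrfs_free_path) = NULL

	/* Usage sketch: early returns no longer need a free label. */
	static int demo(void)
	{
		BTRFS_PATH_AUTO_FREE(path);	/* NULL is safe: btrfs_free_path() ignores it */

		path = btrfs_alloc_path();
		if (!path)
			return -ENOMEM;
		/* ... use path; it is freed automatically on every return ... */
		return 0;
	}
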
btrfs_free_path(dst_path); return ret; } @@ -5320,7 +5695,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, u64 *other_ino, u64 *other_parent) { int ret; - struct btrfs_path *search_path; + BTRFS_PATH_AUTO_FREE(search_path); char *name = NULL; u32 name_len = 0; u32 item_size = btrfs_item_size(eb, slot); @@ -5405,7 +5780,6 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, } ret = 0; out: - btrfs_free_path(search_path); kfree(name); return ret; } @@ -6133,8 +6507,7 @@ static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, if (!first) return 0; - ins_data = kmalloc(max_batch_size * sizeof(u32) + - max_batch_size * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc_array(max_batch_size, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) return -ENOMEM; ins_sizes = (u32 *)ins_data; @@ -6788,7 +7161,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root = inode->root; const u64 ino = btrfs_ino(inode); @@ -6804,7 +7177,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; while (true) { struct extent_buffer *leaf = path->nodes[0]; @@ -6816,8 +7189,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; - else if (ret > 0) + return ret; + if (ret > 0) break; continue; } @@ -6875,10 +7248,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, * at both parents and the old parent B would still * exist. */ - if (IS_ERR(dir_inode)) { - ret = PTR_ERR(dir_inode); - goto out; - } + if (IS_ERR(dir_inode)) + return PTR_ERR(dir_inode); if (!need_log_inode(trans, dir_inode)) { btrfs_add_delayed_iput(dir_inode); @@ -6891,14 +7262,11 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ret = log_new_dir_dentries(trans, dir_inode, ctx); btrfs_add_delayed_iput(dir_inode); if (ret) - goto out; + return ret; } path->slots[0]++; } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } static int log_new_ancestors(struct btrfs_trans_handle *trans, @@ -7009,7 +7377,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; const u64 ino = btrfs_ino(inode); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key search_key; int ret; @@ -7030,7 +7398,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) path->slots[0]++; @@ -7042,8 +7410,8 @@ again: if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; - else if (ret > 0) + return ret; + if (ret > 0) break; continue; } @@ -7060,10 +7428,8 @@ again: * this loop, etc). So just return some error to fallback to * a transaction commit. 
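The kmalloc() -> kmalloc_array() conversions above are not cosmetic: kmalloc_array() returns NULL if the count * size product would overflow, whereas the old open-coded sum of two products could wrap and silently under-allocate. A sketch of the single-buffer/two-array layout (demo_ name is ours; copy_items above puts the keys first):

	static int demo_alloc_batch(int nr, struct btrfs_key **keys_ret, u32 **sizes_ret)
	{
		void *ins_data;

		ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32),
					 GFP_NOFS);
		if (!ins_data)
			return -ENOMEM;

		/* One buffer, two arrays: keys first, then the item sizes. */
		*keys_ret = ins_data;
		*sizes_ret = (u32 *)(ins_data + nr * sizeof(struct btrfs_key));
		/* Caller frees everything with one kfree() on the keys pointer. */
		return 0;
	}
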
*/ - if (found_key.type == BTRFS_INODE_EXTREF_KEY) { - ret = -EMLINK; - goto out; - } + if (found_key.type == BTRFS_INODE_EXTREF_KEY) + return -EMLINK; /* * Logging ancestors needs to do more searches on the fs/subvol @@ -7075,14 +7441,11 @@ again: ret = log_new_ancestors(trans, root, path, ctx); if (ret) - goto out; + return ret; btrfs_release_path(path); goto again; } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -7262,10 +7625,12 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) } wc.trans = trans; - wc.pin = 1; + wc.pin = true; + wc.log = log_root_tree; - ret = walk_log_tree(trans, log_root_tree, &wc); - if (ret) { + ret = walk_log_tree(&wc); + wc.log = NULL; + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error; } @@ -7276,12 +7641,11 @@ again: key.offset = (u64)-1; while (1) { - struct btrfs_root *log; struct btrfs_key found_key; ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto error; } @@ -7296,20 +7660,19 @@ again: if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) break; - log = btrfs_read_tree_root(log_root_tree, &found_key); - if (IS_ERR(log)) { - ret = PTR_ERR(log); + wc.log = btrfs_read_tree_root(log_root_tree, &found_key); + if (IS_ERR(wc.log)) { + ret = PTR_ERR(wc.log); + wc.log = NULL; btrfs_abort_transaction(trans, ret); goto error; } - wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, - true); - if (IS_ERR(wc.replay_dest)) { - ret = PTR_ERR(wc.replay_dest); - wc.replay_dest = NULL; - if (ret != -ENOENT) { - btrfs_put_root(log); + wc.root = btrfs_get_fs_root(fs_info, found_key.offset, true); + if (IS_ERR(wc.root)) { + ret = PTR_ERR(wc.root); + wc.root = NULL; + if (unlikely(ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); goto error; } @@ -7325,33 +7688,34 @@ again: * block from being modified, and we'll just bail for * each subsequent pass. */ - ret = btrfs_pin_extent_for_log_replay(trans, log->node); - if (ret) { - btrfs_put_root(log); + ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node); + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error; } goto next; } - wc.replay_dest->log_root = log; - ret = btrfs_record_root_in_trans(trans, wc.replay_dest); - if (ret) { + wc.root->log_root = wc.log; + ret = btrfs_record_root_in_trans(trans, wc.root); + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } - ret = walk_log_tree(trans, log, &wc); - if (ret) { + ret = walk_log_tree(&wc); + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } if (wc.stage == LOG_WALK_REPLAY_ALL) { - struct btrfs_root *root = wc.replay_dest; + struct btrfs_root *root = wc.root; - ret = fixup_inode_link_counts(trans, wc.replay_dest, path); - if (ret) { + wc.subvol_path = path; + ret = fixup_inode_link_counts(&wc); + wc.subvol_path = NULL; + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } @@ -7364,17 +7728,18 @@ again: * could only happen during mount. 
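The btrfs_recover_log_trees() rework above stops threading trans/log/root through every helper and carries them in walk_control instead, so walk_log_tree() and friends take a single argument. An abridged sketch of the resulting structure, with the callback signature inferred from the call sites in these hunks (field order and completeness are assumptions):

	struct walk_control {
		struct btrfs_trans_handle *trans;
		struct btrfs_root *log;		/* log tree being walked */
		struct btrfs_root *root;	/* fs/subvolume tree replayed into */
		struct btrfs_path *subvol_path;
		bool free;			/* free blocks after processing */
		bool pin;			/* pin extents instead of replaying */
		int stage;			/* LOG_WALK_* replay stage */
		int (*process_func)(struct extent_buffer *eb,
				    struct walk_control *wc, u64 gen, int level);
	};
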
*/ ret = btrfs_init_root_free_objectid(root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } } next: - if (wc.replay_dest) { - wc.replay_dest->log_root = NULL; - btrfs_put_root(wc.replay_dest); + if (wc.root) { + wc.root->log_root = NULL; + btrfs_put_root(wc.root); } - btrfs_put_root(log); + btrfs_put_root(wc.log); + wc.log = NULL; if (ret) goto error; @@ -7386,7 +7751,7 @@ next: /* step one is to pin it all, step two is to replay just inodes */ if (wc.pin) { - wc.pin = 0; + wc.pin = false; wc.process_func = replay_one_buffer; wc.stage = LOG_WALK_REPLAY_INODES; goto again; @@ -7404,14 +7769,13 @@ next: if (ret) return ret; - log_root_tree->log_root = NULL; clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); - btrfs_put_root(log_root_tree); return 0; error: if (wc.trans) btrfs_end_transaction(wc.trans); + btrfs_put_root(wc.log); clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); btrfs_free_path(path); return ret; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index b7a96a005487..46bd8ca58670 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -487,12 +487,12 @@ static int rollback_verity(struct btrfs_inode *inode) inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = del_orphan(trans, inode); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -676,11 +676,11 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) if (ret < 0) return ret; - if (item.reserved[0] != 0 || item.reserved[1] != 0) + if (unlikely(item.reserved[0] != 0 || item.reserved[1] != 0)) return -EUCLEAN; true_size = btrfs_stack_verity_descriptor_size(&item); - if (true_size > INT_MAX) + if (unlikely(true_size > INT_MAX)) return -EUCLEAN; if (buf_size == 0) @@ -802,6 +802,8 @@ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations btrfs_verityops = { + .inode_info_offs = (int)offsetof(struct btrfs_inode, i_verity_info) - + (int)offsetof(struct btrfs_inode, vfs_inode), .begin_enable_verity = btrfs_begin_enable_verity, .end_enable_verity = btrfs_end_enable_verity, .get_verity_descriptor = btrfs_get_verity_descriptor, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fa7a929a0461..2bec544d8ba3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1377,8 +1377,8 @@ struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, } /* - * Make sure the last byte of label is properly NUL termiated. We use - * '%s' to print the label, if not properly NUL termiated we can access + * Make sure the last byte of label is properly NUL terminated. We use + * '%s' to print the label, if not properly NUL terminated we can access * beyond the label. 
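The new .inode_info_offs in btrfs_verityops above is the signed distance between i_verity_info and the embedded vfs_inode, which lets generic fs/verity code reach the filesystem's info pointer from a bare struct inode * without a per-fs callback. A sketch of what the consumer side can do with it (hypothetical helper name; the real fs/verity accessor may differ):

	static inline struct fsverity_info **demo_verity_info_ptr(struct inode *inode,
								  int inode_info_offs)
	{
		/*
		 * inode is &btrfs_inode->vfs_inode, so adding the precomputed
		 * delta (offsetof(i_verity_info) - offsetof(vfs_inode)) lands on
		 * btrfs_inode->i_verity_info. The offset is signed because the
		 * field may sit before vfs_inode in the containing structure.
		 */
		return (struct fsverity_info **)((char *)inode + inode_info_offs);
	}
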
*/ if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1]) @@ -1911,7 +1911,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, if (ret < 0) goto error; - if (ret == 0) { + if (unlikely(ret == 0)) { /* Corruption */ btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); ret = -EUCLEAN; @@ -2243,7 +2243,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } ret = btrfs_rm_dev_item(trans, device); - if (ret) { + if (unlikely(ret)) { /* Any error in dev item removal is critical */ btrfs_crit(fs_info, "failed to remove device item for devid %llu: %d", @@ -2722,6 +2722,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error; } + if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) { + ret = -EINVAL; + goto error; + } + if (fs_devices->seeding) { seeding_dev = true; down_write(&sb->s_umount); @@ -2838,21 +2843,21 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_lock(&fs_info->chunk_mutex); ret = init_first_rw_device(trans); mutex_unlock(&fs_info->chunk_mutex); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } } ret = btrfs_add_dev_item(trans, device); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } if (seeding_dev) { ret = btrfs_finish_sprout(trans); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } @@ -3044,7 +3049,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; - else if (ret > 0) { /* Logic error or corruption */ + else if (unlikely(ret > 0)) { /* Logic error or corruption */ btrfs_err(fs_info, "failed to lookup chunk %llu when freeing", chunk_offset); btrfs_abort_transaction(trans, -ENOENT); @@ -3053,7 +3058,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) } ret = btrfs_del_item(trans, root, path); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset); btrfs_abort_transaction(trans, ret); goto out; @@ -3278,7 +3283,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); - if (ret) { + if (unlikely(ret)) { mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, ret); goto out; @@ -3348,7 +3353,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) struct btrfs_space_info *space_info; space_info = btrfs_find_space_info(fs_info, sys_flags); - if (!space_info) { + if (unlikely(!space_info)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -3362,17 +3367,17 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) } ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = remove_chunk_item(trans, map, chunk_offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } - } else if (ret) { + } else if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3381,7 +3386,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(fs_info, chunk_offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, 
ret); goto out; } @@ -3397,7 +3402,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) btrfs_trans_release_chunk_metadata(trans); ret = btrfs_remove_block_group(trans, map); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3522,7 +3527,7 @@ again: mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } - if (ret == 0) { + if (unlikely(ret == 0)) { /* * On the first search we would find chunk tree with * offset -1, which is not possible. On subsequent @@ -4264,7 +4269,7 @@ error: * @flags: profile to validate * @extended: if true @flags is treated as an extended profile */ -static int alloc_profile_is_valid(u64 flags, int extended) +static int alloc_profile_is_valid(u64 flags, bool extended) { u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : BTRFS_BLOCK_GROUP_PROFILE_MASK); @@ -4458,7 +4463,7 @@ out_overflow: } /* - * Should be called with balance mutexe held + * Should be called with balance mutex held */ int btrfs_balance(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl, @@ -5036,7 +5041,7 @@ again: /* Now btrfs_update_device() will change the on-disk size. */ ret = btrfs_update_device(trans, device); btrfs_trans_release_chunk_metadata(trans); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); } else { @@ -5696,7 +5701,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, item_size = btrfs_chunk_item_size(map->num_stripes); chunk = kzalloc(item_size, GFP_NOFS); - if (!chunk) { + if (unlikely(!chunk)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -7481,7 +7486,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) /* * Lockdep complains about possible circular locking dependency between * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores - * used for freeze procection of a fs (struct super_block.s_writers), + * used for freeze protection of a fs (struct super_block.s_writers), * which we take when starting a transaction, and extent buffers of the * chunk tree if we call read_one_dev() while holding a lock on an * extent buffer of the chunk tree. 
Since we are mounting the filesystem @@ -7914,8 +7919,6 @@ int btrfs_bg_type_to_factor(u64 flags) return btrfs_raid_array[index].ncopies; } - - static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) @@ -7929,7 +7932,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, int i; map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); - if (!map) { + if (unlikely(!map)) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", physical_offset, devid); @@ -7938,7 +7941,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } stripe_len = btrfs_calc_stripe_length(map); - if (physical_len != stripe_len) { + if (unlikely(physical_len != stripe_len)) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", physical_offset, devid, map->start, physical_len, @@ -7958,8 +7961,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, devid, physical_offset, physical_len); for (i = 0; i < map->num_stripes; i++) { - if (map->stripes[i].dev->devid == devid && - map->stripes[i].physical == physical_offset) { + if (unlikely(map->stripes[i].dev->devid == devid && + map->stripes[i].physical == physical_offset)) { found = true; if (map->verified_stripes >= map->num_stripes) { btrfs_err(fs_info, @@ -7972,7 +7975,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, break; } } - if (!found) { + if (unlikely(!found)) { btrfs_err(fs_info, "dev extent physical offset %llu devid %llu has no corresponding chunk", physical_offset, devid); @@ -7981,13 +7984,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, /* Make sure no dev extent is beyond device boundary */ dev = btrfs_find_device(fs_info->fs_devices, &args); - if (!dev) { + if (unlikely(!dev)) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; goto out; } - if (physical_offset + physical_len > dev->disk_total_bytes) { + if (unlikely(physical_offset + physical_len > dev->disk_total_bytes)) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", devid, physical_offset, physical_len, @@ -7999,8 +8002,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, if (dev->zone_info) { u64 zone_size = dev->zone_info->zone_size; - if (!IS_ALIGNED(physical_offset, zone_size) || - !IS_ALIGNED(physical_len, zone_size)) { + if (unlikely(!IS_ALIGNED(physical_offset, zone_size) || + !IS_ALIGNED(physical_len, zone_size))) { btrfs_err(fs_info, "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", devid, physical_offset, physical_len); @@ -8024,7 +8027,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) struct btrfs_chunk_map *map; map = rb_entry(node, struct btrfs_chunk_map, rb_node); - if (map->num_stripes != map->verified_stripes) { + if (unlikely(map->num_stripes != map->verified_stripes)) { btrfs_err(fs_info, "chunk %llu has missing dev extent, have %d expect %d", map->start, map->verified_stripes, map->num_stripes); @@ -8084,7 +8087,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) if (ret < 0) goto out; /* No dev extents at all? 
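A note on the pervasive unlikely() annotations in the volumes.c hunks: unlikely() only hints the compiler to place the branch out of line; it changes no logic. It fits here because every annotated branch is a cold error path that aborts the transaction or reports corruption. Minimal sketch of the shape (demo_ wrapper is ours):

	static int demo_del(struct btrfs_trans_handle *trans, struct btrfs_root *root,
			    struct btrfs_path *path)
	{
		int ret = btrfs_del_item(trans, root, path);

		if (unlikely(ret < 0)) {	/* hint: the error path is cold */
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		return 0;
	}
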
Not good */ - if (ret > 0) { + if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -8109,7 +8112,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) physical_len = btrfs_dev_extent_length(leaf, dext); /* Check if this dev extent overlaps with the previous one */ - if (devid == prev_devid && physical_offset < prev_dev_ext_end) { + if (unlikely(devid == prev_devid && physical_offset < prev_dev_ext_end)) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", devid, physical_offset, prev_dev_ext_end); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a56e873a3029..2cbf8080eade 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -34,7 +34,7 @@ struct btrfs_zoned_device_info; #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) /* - * Arbitratry maximum size of one discard request to limit potentially long time + * Arbitrary maximum size of one discard request to limit potentially long time * spent in blkdev_issue_discard(). */ #define BTRFS_MAX_DISCARD_CHUNK_SIZE (SZ_1G) @@ -495,7 +495,7 @@ struct btrfs_discard_stripe { }; /* - * Context for IO subsmission for device stripe. + * Context for IO submission for device stripe. * * - Track the unfinished mirrors for mirror based profiles * Mirror based profiles are SINGLE/DUP/RAID1/RAID10. diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 5292cd341f70..6caba8be7c84 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -34,11 +34,9 @@ struct workspace { int level; }; -static struct workspace_manager wsm; - -struct list_head *zlib_get_workspace(unsigned int level) +struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level) { - struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level); + struct list_head *ws = btrfs_get_workspace(fs_info, BTRFS_COMPRESS_ZLIB, level); struct workspace *workspace = list_entry(ws, struct workspace, list); workspace->level = level; @@ -55,8 +53,25 @@ void zlib_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zlib_alloc_workspace(unsigned int level) +/* + * For s390 hardware acceleration, the buffer size should be at least + * ZLIB_DFLTCC_BUF_SIZE to achieve the best performance. + * + * But if bs > ps we can have large enough folios that meet the s390 hardware + * handling. + */ +static bool need_special_buffer(struct btrfs_fs_info *fs_info) +{ + if (!zlib_deflate_dfltcc_enabled()) + return false; + if (btrfs_min_folio_size(fs_info) >= ZLIB_DFLTCC_BUF_SIZE) + return false; + return true; +} + +struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level) { + const u32 blocksize = fs_info->sectorsize; struct workspace *workspace; int workspacesize; @@ -69,19 +84,15 @@ struct list_head *zlib_alloc_workspace(unsigned int level) workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN); workspace->level = level; workspace->buf = NULL; - /* - * In case of s390 zlib hardware support, allocate lager workspace - * buffer. If allocator fails, fall back to a single page buffer. 
- */ - if (zlib_deflate_dfltcc_enabled()) { + if (need_special_buffer(fs_info)) { workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE, __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | GFP_NOIO); workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE; } if (!workspace->buf) { - workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - workspace->buf_size = PAGE_SIZE; + workspace->buf = kmalloc(blocksize, GFP_KERNEL); + workspace->buf_size = blocksize; } if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -133,11 +144,15 @@ static int copy_data_into_buffer(struct address_space *mapping, return 0; } -int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, +int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); + struct address_space *mapping = inode->vfs_inode.i_mapping; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); int ret; char *data_in = NULL; char *cfolio_out; @@ -146,7 +161,8 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, struct folio *out_folio = NULL; unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; - const unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const unsigned long max_out = nr_dest_folios << min_folio_shift; + const u32 blocksize = fs_info->sectorsize; const u64 orig_end = start + len; *out_folios = 0; @@ -155,9 +171,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = zlib_deflateInit(&workspace->strm, workspace->level); if (unlikely(ret != Z_OK)) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_err(inode->root->fs_info, + btrfs_err(fs_info, "zlib compression init failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), start); ret = -EIO; @@ -167,7 +181,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -179,7 +193,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, workspace->strm.next_in = workspace->buf; workspace->strm.avail_in = 0; workspace->strm.next_out = cfolio_out; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = min_folio_size; while (workspace->strm.total_in < len) { /* @@ -191,10 +205,11 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned int copy_length = min(bytes_left, workspace->buf_size); /* - * This can only happen when hardware zlib compression is - * enabled. + * For s390 hardware-accelerated zlib, if our folio is smaller + * than the copy_length, we need to fill the buffer so that + * we can take full advantage of hardware acceleration.
*/ - if (copy_length > PAGE_SIZE) { + if (need_special_buffer(fs_info)) { ret = copy_data_into_buffer(mapping, workspace, start, copy_length); if (ret < 0) @@ -225,9 +240,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); if (unlikely(ret != Z_OK)) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_warn(inode->root->fs_info, + btrfs_warn(fs_info, "zlib compression failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), start); @@ -237,7 +250,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, } /* we're making it bigger, give up */ - if (workspace->strm.total_in > 8192 && + if (workspace->strm.total_in > blocksize * 2 && workspace->strm.total_in < workspace->strm.total_out) { ret = -E2BIG; @@ -252,7 +265,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -260,7 +273,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, cfolio_out = folio_address(out_folio); folios[nr_folios] = out_folio; nr_folios++; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = min_folio_size; workspace->strm.next_out = cfolio_out; } /* we're all done */ @@ -278,7 +291,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = zlib_deflate(&workspace->strm, Z_FINISH); if (ret == Z_STREAM_END) break; - if (ret != Z_OK && ret != Z_BUF_ERROR) { + if (unlikely(ret != Z_OK && ret != Z_BUF_ERROR)) { zlib_deflateEnd(&workspace->strm); ret = -EIO; goto out; @@ -288,7 +301,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -296,7 +309,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, cfolio_out = folio_address(out_folio); folios[nr_folios] = out_folio; nr_folios++; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = min_folio_size; workspace->strm.next_out = cfolio_out; } } @@ -322,20 +335,22 @@ out: int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); + const u32 min_folio_size = btrfs_min_folio_size(fs_info); int ret = 0, ret2; int wbits = MAX_WBITS; char *data_in; size_t total_out = 0; unsigned long folio_in_index = 0; size_t srclen = cb->compressed_len; - unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); unsigned long buf_start; struct folio **folios_in = cb->compressed_folios; data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; - workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); + workspace->strm.avail_in = min_t(size_t, srclen, min_folio_size); workspace->strm.total_in = 0; workspace->strm.total_out = 0; @@ -396,7 +411,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; - 
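The 8192 -> blocksize * 2 change above (mirrored in the zstd hunks later) rescales the "compression is making it bigger, give up" heuristic: the old literal was simply two 4K sectors, so expressing it in blocksize units keeps identical behaviour on 4K filesystems while scaling with larger block sizes. A sketch of the predicate (demo_ name is ours; z_stream fields as in linux/zlib.h):

	static bool demo_compression_makes_it_bigger(const z_stream *strm, u32 blocksize)
	{
		/* Only judge after two blocks of input have been consumed. */
		return strm->total_in > blocksize * 2 &&
		       strm->total_in < strm->total_out;
	}
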
workspace->strm.avail_in = min(tmp, PAGE_SIZE); + workspace->strm.avail_in = min(tmp, min_folio_size); } } if (unlikely(ret != Z_STREAM_END)) { @@ -484,8 +499,7 @@ out: return ret; } -const struct btrfs_compress_op btrfs_zlib_compress = { - .workspace_manager = &wsm, +const struct btrfs_compress_levels btrfs_zlib_compress = { .min_level = 1, .max_level = 9, .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index ea662036f441..e00036672f33 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -274,7 +274,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return ret; } *nr_zones = ret; - if (!ret) + if (unlikely(!ret)) return -EIO; /* Populate cache */ @@ -315,7 +315,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) if (ret < 0) return ret; /* No dev extents at all? Not good */ - if (ret > 0) + if (unlikely(ret > 0)) return -EUCLEAN; } @@ -503,7 +503,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; } - if (nreported != zone_info->nr_zones) { + if (unlikely(nreported != zone_info->nr_zones)) { btrfs_err(device->fs_info, "inconsistent number of zones on %s (%u/%u)", rcu_dereference(device->name), nreported, @@ -513,7 +513,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } if (max_active_zones) { - if (nactive > max_active_zones) { + if (unlikely(nactive > max_active_zones)) { + if (bdev_max_active_zones(bdev) == 0) { + max_active_zones = 0; + zone_info->max_active_zones = 0; + goto validate; + } btrfs_err(device->fs_info, "zoned: %u active zones on %s exceeds max_active_zones %u", nactive, rcu_dereference(device->name), @@ -526,6 +531,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags); } +validate: /* Validate superblock log */ nr_zones = BTRFS_NR_SB_LOG_ZONES; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -544,7 +550,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (ret) goto out; - if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { + if (unlikely(nr_zones != BTRFS_NR_SB_LOG_ZONES)) { btrfs_err(device->fs_info, "zoned: failed to read super block log zone info at devid %llu zone %u", device->devid, sb_zone); @@ -562,7 +568,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) ret = sb_write_pointer(device->bdev, &zone_info->sb_zones[sb_pos], &sb_wp); - if (ret != -ENOENT && ret) { + if (unlikely(ret != -ENOENT && ret)) { btrfs_err(device->fs_info, "zoned: super block log zone corrupted devid %llu zone %u", device->devid, sb_zone); @@ -895,7 +901,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, zones); if (ret < 0) return ret; - if (ret != BTRFS_NR_SB_LOG_ZONES) + if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES)) return -EIO; return sb_log_location(bdev, zones, rw, bytenr_ret); @@ -1247,7 +1253,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, root = btrfs_extent_root(fs_info, key.objectid); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); /* We should not find the exact match */ - if (!ret) + if (unlikely(!ret)) ret = -EUCLEAN; if (ret < 0) return ret; @@ -1268,8 +1274,8 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, else length = fs_info->nodesize; - if (!(found_key.objectid >= cache->start && - found_key.objectid + length <= cache->start + 
cache->length)) { + if (unlikely(!(found_key.objectid >= cache->start && + found_key.objectid + length <= cache->start + cache->length))) { return -EUCLEAN; } *offset_ret = found_key.objectid + length - cache->start; @@ -1351,7 +1357,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, return 0; } - if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + if (unlikely(zone.type == BLK_ZONE_TYPE_CONVENTIONAL)) { btrfs_err(fs_info, "zoned: unexpected conventional zone %llu on device %s (devid %llu)", zone.start << SECTOR_SHIFT, rcu_dereference(device->name), @@ -1393,7 +1399,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, struct zone_info *info, unsigned long *active) { - if (info->alloc_offset == WP_MISSING_DEV) { + if (unlikely(info->alloc_offset == WP_MISSING_DEV)) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", info->physical); @@ -1422,13 +1428,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); - if (zone_info[0].alloc_offset == WP_MISSING_DEV) { + if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[0].physical); return -EIO; } - if (zone_info[1].alloc_offset == WP_MISSING_DEV) { + if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[1].physical); @@ -1441,14 +1447,14 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, if (zone_info[1].alloc_offset == WP_CONVENTIONAL) zone_info[1].alloc_offset = last_alloc; - if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { + if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) { btrfs_err(bg->fs_info, "zoned: write pointer offset mismatch of zones in DUP profile"); return -EIO; } if (test_bit(0, active) != test_bit(1, active)) { - if (!btrfs_zone_activate(bg)) + if (unlikely(!btrfs_zone_activate(bg))) return -EIO; } else if (test_bit(0, active)) { set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); @@ -1483,16 +1489,16 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, if (zone_info[i].alloc_offset == WP_CONVENTIONAL) zone_info[i].alloc_offset = last_alloc; - if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && - !btrfs_test_opt(fs_info, DEGRADED)) { + if (unlikely((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && + !btrfs_test_opt(fs_info, DEGRADED))) { btrfs_err(fs_info, "zoned: write pointer offset mismatch of zones in %s profile", btrfs_bg_type_to_raid_name(map->type)); return -EIO; } if (test_bit(0, active) != test_bit(i, active)) { - if (!btrfs_test_opt(fs_info, DEGRADED) && - !btrfs_zone_activate(bg)) { + if (unlikely(!btrfs_test_opt(fs_info, DEGRADED) && + !btrfs_zone_activate(bg))) { return -EIO; } } else { @@ -1548,7 +1554,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, } if (test_bit(0, active) != test_bit(i, active)) { - if (!btrfs_zone_activate(bg)) + if (unlikely(!btrfs_zone_activate(bg))) return -EIO; } else { if (test_bit(0, active)) @@ -1580,7 +1586,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, continue; if (test_bit(0, active) != test_bit(i, active)) { - if (!btrfs_zone_activate(bg)) + if (unlikely(!btrfs_zone_activate(bg))) return -EIO; } else { if (test_bit(0, active)) @@ -1637,7 +1643,7 @@ int 
btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) return 0; /* Sanity check */ - if (!IS_ALIGNED(length, fs_info->zone_size)) { + if (unlikely(!IS_ALIGNED(length, fs_info->zone_size))) { btrfs_err(fs_info, "zoned: block group %llu len %llu unaligned to zone size %llu", logical, length, fs_info->zone_size); @@ -1750,7 +1756,7 @@ out: return -EINVAL; } - if (cache->alloc_offset > cache->zone_capacity) { + if (unlikely(cache->alloc_offset > cache->zone_capacity)) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", cache->alloc_offset, cache->zone_capacity, @@ -2081,7 +2087,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &mapped_length, &bioc, NULL, NULL); - if (ret || !bioc || mapped_length < PAGE_SIZE) { + if (unlikely(ret || !bioc || mapped_length < PAGE_SIZE)) { ret = -EIO; goto out_put_bioc; } @@ -2139,7 +2145,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, if (physical_pos == wp) return 0; - if (physical_pos > wp) + if (unlikely(physical_pos > wp)) return -EUCLEAN; length = wp - physical_pos; @@ -2458,16 +2464,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) return ret; } -void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) +int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_block_group *block_group; u64 min_alloc_bytes; if (!btrfs_is_zoned(fs_info)) - return; + return 0; block_group = btrfs_lookup_block_group(fs_info, logical); - ASSERT(block_group); + if (WARN_ON_ONCE(!block_group)) + return -ENOENT; /* No MIXED_BG on zoned btrfs. */ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) @@ -2484,16 +2491,21 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len out: btrfs_put_block_group(block_group); + return 0; } static void btrfs_zone_finish_endio_workfn(struct work_struct *work) { + int ret; struct btrfs_block_group *bg = container_of(work, struct btrfs_block_group, zone_finish_work); wait_on_extent_buffer_writeback(bg->last_eb); free_extent_buffer(bg->last_eb); - btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length); + ret = do_zone_finish(bg, true); + if (ret) + btrfs_handle_fs_error(bg->fs_info, ret, + "Failed to finish block-group's zone"); btrfs_put_block_group(bg); } @@ -2515,7 +2527,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, refcount_inc(&eb->refs); bg->last_eb = eb; INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); - queue_work(system_unbound_wq, &bg->zone_finish_work); + queue_work(system_dfl_wq, &bg->zone_finish_work); } void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) @@ -2582,9 +2594,9 @@ again: spin_lock(&space_info->lock); space_info->total_bytes -= bg->length; space_info->disk_total -= bg->length * factor; + space_info->disk_total -= bg->zone_unusable; /* There is no allocation ever happened. */ ASSERT(bg->used == 0); - ASSERT(bg->zone_unusable == 0); /* No super block in a block group on the zoned setup. 
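The btrfs_zone_finish_endio() change in the zoned.c hunks follows a void -> int conversion pattern: a state that used to be ASSERT()ed now warns once and returns a real error the caller can act on. Simplified sketch under that reading (demo_ name is ours, zone-finishing work elided):

	static int demo_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical)
	{
		struct btrfs_block_group *block_group;

		if (!btrfs_is_zoned(fs_info))
			return 0;

		block_group = btrfs_lookup_block_group(fs_info, logical);
		if (WARN_ON_ONCE(!block_group))	/* was ASSERT(block_group) */
			return -ENOENT;

		/* ... zone finishing work ... */
		btrfs_put_block_group(block_group);
		return 0;
	}
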
*/ ASSERT(bg->bytes_super == 0); spin_unlock(&space_info->lock); diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 6e11533b8e14..17c5656580dd 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -83,7 +83,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, bool btrfs_zone_activate(struct btrfs_block_group *block_group); int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); -void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, +int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length); void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb); @@ -234,8 +234,11 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, return true; } -static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) { } +static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + return 0; +} static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb) { } diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index ff0292615e1f..c9cddcfa337b 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -77,7 +77,6 @@ struct workspace { */ struct zstd_workspace_manager { - const struct btrfs_compress_op *ops; spinlock_t lock; struct list_head lru_list; struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL]; @@ -86,8 +85,6 @@ struct zstd_workspace_manager { struct timer_list timer; }; -static struct zstd_workspace_manager wsm; - static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL]; static inline struct workspace *list_to_workspace(struct list_head *list) @@ -112,19 +109,19 @@ static inline int clip_level(int level) */ static void zstd_reclaim_timer_fn(struct timer_list *timer) { + struct zstd_workspace_manager *zwsm = + container_of(timer, struct zstd_workspace_manager, timer); unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; - ASSERT(timer == &wsm.timer); - - spin_lock(&wsm.lock); + spin_lock(&zwsm->lock); - if (list_empty(&wsm.lru_list)) { - spin_unlock(&wsm.lock); + if (list_empty(&zwsm->lru_list)) { + spin_unlock(&zwsm->lock); return; } - list_for_each_prev_safe(pos, next, &wsm.lru_list) { + list_for_each_prev_safe(pos, next, &zwsm->lru_list) { struct workspace *victim = container_of(pos, struct workspace, lru_list); int level; @@ -141,15 +138,15 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_del(&victim->list); zstd_free_workspace(&victim->list); - if (list_empty(&wsm.idle_ws[level])) - clear_bit(level, &wsm.active_map); + if (list_empty(&zwsm->idle_ws[level])) + clear_bit(level, &zwsm->active_map); } - if (!list_empty(&wsm.lru_list)) - mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); + if (!list_empty(&zwsm->lru_list)) + mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); - spin_unlock(&wsm.lock); + spin_unlock(&zwsm->lock); } /* @@ -182,49 +179,56 @@ static void zstd_calc_ws_mem_sizes(void) } } -void zstd_init_workspace_manager(void) +int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info) { + struct zstd_workspace_manager *zwsm; struct list_head *ws; - int i; + ASSERT(fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] == NULL); + zwsm = kzalloc(sizeof(*zwsm), GFP_KERNEL); + if (!zwsm) + return -ENOMEM; zstd_calc_ws_mem_sizes(); + spin_lock_init(&zwsm->lock); + 
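The zstd hunks that follow replace a file-global workspace manager with one hung off fs_info, so the reclaim timer can no longer assume a singleton and instead recovers its owner with container_of(). Minimal sketch of that pattern, with the reclaim walk elided:

	static void demo_reclaim_timer_fn(struct timer_list *timer)
	{
		struct zstd_workspace_manager *zwsm =
			container_of(timer, struct zstd_workspace_manager, timer);

		spin_lock(&zwsm->lock);
		/* ... walk zwsm->lru_list and free workspaces idle too long ... */
		spin_unlock(&zwsm->lock);
	}
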
init_waitqueue_head(&zwsm->wait); + timer_setup(&zwsm->timer, zstd_reclaim_timer_fn, 0); - wsm.ops = &btrfs_zstd_compress; - spin_lock_init(&wsm.lock); - init_waitqueue_head(&wsm.wait); - timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0); - - INIT_LIST_HEAD(&wsm.lru_list); - for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) - INIT_LIST_HEAD(&wsm.idle_ws[i]); + INIT_LIST_HEAD(&zwsm->lru_list); + for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&zwsm->idle_ws[i]); + fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = zwsm; - ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); + ws = zstd_alloc_workspace(fs_info, ZSTD_BTRFS_MAX_LEVEL); if (IS_ERR(ws)) { btrfs_warn(NULL, "cannot preallocate zstd compression workspace"); } else { - set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map); - list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); + set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &zwsm->active_map); + list_add(ws, &zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); } + return 0; } -void zstd_cleanup_workspace_manager(void) +void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct workspace *workspace; - int i; - spin_lock_bh(&wsm.lock); - for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { - while (!list_empty(&wsm.idle_ws[i])) { - workspace = container_of(wsm.idle_ws[i].next, + if (!zwsm) + return; + fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = NULL; + spin_lock_bh(&zwsm->lock); + for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { + while (!list_empty(&zwsm->idle_ws[i])) { + workspace = container_of(zwsm->idle_ws[i].next, struct workspace, list); list_del(&workspace->list); list_del(&workspace->lru_list); zstd_free_workspace(&workspace->list); } } - spin_unlock_bh(&wsm.lock); - - timer_delete_sync(&wsm.timer); + spin_unlock_bh(&zwsm->lock); + timer_delete_sync(&zwsm->timer); + kfree(zwsm); } /* @@ -239,29 +243,31 @@ void zstd_cleanup_workspace_manager(void) * offer the opportunity to reclaim the workspace in favor of allocating an * appropriately sized one in the future. */ -static struct list_head *zstd_find_workspace(int level) +static struct list_head *zstd_find_workspace(struct btrfs_fs_info *fs_info, int level) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct list_head *ws; struct workspace *workspace; int i = clip_level(level); - spin_lock_bh(&wsm.lock); - for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { - if (!list_empty(&wsm.idle_ws[i])) { - ws = wsm.idle_ws[i].next; + ASSERT(zwsm); + spin_lock_bh(&zwsm->lock); + for_each_set_bit_from(i, &zwsm->active_map, ZSTD_BTRFS_MAX_LEVEL) { + if (!list_empty(&zwsm->idle_ws[i])) { + ws = zwsm->idle_ws[i].next; workspace = list_to_workspace(ws); list_del_init(ws); /* keep its place if it's a lower level using this */ workspace->req_level = level; if (clip_level(level) == workspace->level) list_del(&workspace->lru_list); - if (list_empty(&wsm.idle_ws[i])) - clear_bit(i, &wsm.active_map); - spin_unlock_bh(&wsm.lock); + if (list_empty(&zwsm->idle_ws[i])) + clear_bit(i, &zwsm->active_map); + spin_unlock_bh(&zwsm->lock); return ws; } } - spin_unlock_bh(&wsm.lock); + spin_unlock_bh(&zwsm->lock); return NULL; } @@ -276,30 +282,33 @@ static struct list_head *zstd_find_workspace(int level) * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. 
*/ -struct list_head *zstd_get_workspace(int level) +struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct list_head *ws; unsigned int nofs_flag; + ASSERT(zwsm); + /* level == 0 means we can use any workspace */ if (!level) level = 1; again: - ws = zstd_find_workspace(level); + ws = zstd_find_workspace(fs_info, level); if (ws) return ws; nofs_flag = memalloc_nofs_save(); - ws = zstd_alloc_workspace(level); + ws = zstd_alloc_workspace(fs_info, level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ws)) { DEFINE_WAIT(wait); - prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&zwsm->wait, &wait, TASK_UNINTERRUPTIBLE); schedule(); - finish_wait(&wsm.wait, &wait); + finish_wait(&zwsm->wait, &wait); goto again; } @@ -318,34 +327,36 @@ again: * isn't set, it is also set here. Only the max level workspace tries and wakes * up waiting workspaces. */ -void zstd_put_workspace(struct list_head *ws) +void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct workspace *workspace = list_to_workspace(ws); - spin_lock_bh(&wsm.lock); + ASSERT(zwsm); + spin_lock_bh(&zwsm->lock); /* A node is only taken off the lru if we are the corresponding level */ if (clip_level(workspace->req_level) == workspace->level) { /* Hide a max level workspace from reclaim */ - if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { + if (list_empty(&zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { INIT_LIST_HEAD(&workspace->lru_list); } else { workspace->last_used = jiffies; - list_add(&workspace->lru_list, &wsm.lru_list); - if (!timer_pending(&wsm.timer)) - mod_timer(&wsm.timer, + list_add(&workspace->lru_list, &zwsm->lru_list); + if (!timer_pending(&zwsm->timer)) + mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); } } - set_bit(workspace->level, &wsm.active_map); - list_add(&workspace->list, &wsm.idle_ws[workspace->level]); + set_bit(workspace->level, &zwsm->active_map); + list_add(&workspace->list, &zwsm->idle_ws[workspace->level]); workspace->req_level = 0; - spin_unlock_bh(&wsm.lock); + spin_unlock_bh(&zwsm->lock); if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL)) - cond_wake_up(&wsm.wait); + cond_wake_up(&zwsm->wait); } void zstd_free_workspace(struct list_head *ws) @@ -357,8 +368,9 @@ void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zstd_alloc_workspace(int level) +struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level) { + const u32 blocksize = fs_info->sectorsize; struct workspace *workspace; workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); @@ -371,7 +383,7 @@ struct list_head *zstd_alloc_workspace(int level) workspace->req_level = level; workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); - workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + workspace->buf = kmalloc(blocksize, GFP_KERNEL); if (!workspace->mem || !workspace->buf) goto fail; @@ -384,11 +396,13 @@ fail: return ERR_PTR(-ENOMEM); } -int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, +int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = 
list_entry(ws, struct workspace, list); + struct address_space *mapping = inode->vfs_inode.i_mapping; zstd_cstream *stream; int ret = 0; int nr_folios = 0; @@ -399,7 +413,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long len = *total_out; const unsigned long nr_dest_folios = *out_folios; const u64 orig_end = start + len; - unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const u32 blocksize = fs_info->sectorsize; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); + unsigned long max_out = nr_dest_folios * min_folio_size; unsigned int cur_len; workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); @@ -411,9 +427,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); if (unlikely(!stream)) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_err(inode->root->fs_info, + btrfs_err(fs_info, "zstd compression init level %d failed, root %llu inode %llu offset %llu", workspace->req_level, btrfs_root_id(inode->root), btrfs_ino(inode), start); @@ -431,7 +445,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, workspace->in_buf.size = cur_len; /* Allocate and map in the output buffer */ - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -439,7 +453,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); while (1) { size_t ret2; @@ -447,9 +461,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret2))) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_warn(inode->root->fs_info, + btrfs_warn(fs_info, "zstd compression level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), @@ -459,7 +471,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, } /* Check to see if we are making it bigger */ - if (tot_in + workspace->in_buf.pos > 8192 && + if (tot_in + workspace->in_buf.pos > blocksize * 2 && tot_in + workspace->in_buf.pos < tot_out + workspace->out_buf.pos) { ret = -E2BIG; @@ -475,13 +487,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, /* Check if we need more output space */ if (workspace->out_buf.pos == workspace->out_buf.size) { - tot_out += PAGE_SIZE; - max_out -= PAGE_SIZE; + tot_out += min_folio_size; + max_out -= min_folio_size; if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -489,8 +501,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, - PAGE_SIZE); + workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); } /* We've reached the end of the input 
*/ @@ -522,9 +533,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret2 = zstd_end_stream(stream, &workspace->out_buf); if (unlikely(zstd_is_error(ret2))) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_err(inode->root->fs_info, + btrfs_err(fs_info, "zstd compression end level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), @@ -542,13 +551,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, goto out; } - tot_out += PAGE_SIZE; - max_out -= PAGE_SIZE; + tot_out += min_folio_size; + max_out -= min_folio_size; if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -556,7 +565,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); } if (tot_out >= tot_in) { @@ -578,13 +587,16 @@ out: int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); struct folio **folios_in = cb->compressed_folios; size_t srclen = cb->compressed_len; zstd_dstream *stream; int ret = 0; + const u32 blocksize = fs_info->sectorsize; + const unsigned int min_folio_size = btrfs_min_folio_size(fs_info); unsigned long folio_in_index = 0; - unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); unsigned long buf_start; unsigned long total_out = 0; @@ -602,11 +614,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = PAGE_SIZE; + workspace->out_buf.size = blocksize; while (1) { size_t ret2; @@ -642,16 +654,16 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->in_buf.pos == workspace->in_buf.size) { kunmap_local(workspace->in_buf.src); folio_in_index++; - if (folio_in_index >= total_folios_in) { + if (unlikely(folio_in_index >= total_folios_in)) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } - srclen -= PAGE_SIZE; + srclen -= min_folio_size; workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); } } ret = 0; @@ -718,9 +730,7 @@ finish: return ret; } -const struct btrfs_compress_op btrfs_zstd_compress = { - /* ZSTD uses own workspace manager */ - .workspace_manager = NULL, +const struct btrfs_compress_levels btrfs_zstd_compress = { .min_level = ZSTD_BTRFS_MIN_LEVEL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 91dfd0231877..d1edb2ac3837 100644 --- 
a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -387,10 +387,9 @@ try_again: cachefiles_io_error(cache, "Rename security error %d", ret); } else { struct renamedata rd = { - .old_mnt_idmap = &nop_mnt_idmap, + .mnt_idmap = &nop_mnt_idmap, .old_parent = dir, .old_dentry = rep, - .new_mnt_idmap = &nop_mnt_idmap, .new_parent = cache->graveyard, .new_dentry = grave, }; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8b202d789e93..322ed268f14a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1264,7 +1264,9 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping, 0, gfp_flags); if (IS_ERR(pages[index])) { - if (PTR_ERR(pages[index]) == -EINVAL) { + int err = PTR_ERR(pages[index]); + + if (err == -EINVAL) { pr_err_client(cl, "inode->i_blkbits=%hhu\n", inode->i_blkbits); } @@ -1273,7 +1275,7 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping, BUG_ON(ceph_wbc->locked_pages == 0); pages[index] = NULL; - return PTR_ERR(pages[index]); + return err; } } else { pages[index] = &folio->page; @@ -1687,6 +1689,7 @@ get_more_pages: process_folio_batch: rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc); + ceph_shift_unused_folios_left(&ceph_wbc.fbatch); if (rc) goto release_folios; @@ -1695,8 +1698,6 @@ process_folio_batch: goto release_folios; if (ceph_wbc.processed_in_fbatch) { - ceph_shift_unused_folios_left(&ceph_wbc.fbatch); - if (folio_batch_count(&ceph_wbc.fbatch) == 0 && ceph_wbc.locked_pages < ceph_wbc.max_pages) { doutc(cl, "reached end fbatch, trying for more\n"); diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index cab722619207..7026e794813c 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -133,6 +133,8 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb) } static struct fscrypt_operations ceph_fscrypt_ops = { + .inode_info_offs = (int)offsetof(struct ceph_inode_info, i_crypt_info) - + (int)offsetof(struct ceph_inode_info, netfs.inode), .needs_bounce_pages = 1, .get_context = ceph_crypt_get_context, .set_context = ceph_crypt_set_context, diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index fdd404fc8112..f3fe786b4143 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct rb_node *rp; - int pathlen = 0; - u64 pathbase; char *path; mutex_lock(&mdsc->mutex); @@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s, void *p) if (req->r_inode) { seq_printf(s, " #%llx", ceph_ino(req->r_inode)); } else if (req->r_dentry) { - path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen, - &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_dentry->d_lock); @@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_dentry, path ? 
path : ""); spin_unlock(&req->r_dentry->d_lock); - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); @@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s, void *p) } if (req->r_old_dentry) { - path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen, - &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_old_dentry->d_lock); @@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_old_dentry, path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8478e7e75df6..32973c62c1a2 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1271,10 +1271,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, /* If op failed, mark everyone involved for errors */ if (result) { - int pathlen = 0; - u64 base = 0; - char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, - &base, 0); + struct ceph_path_info path_info = {0}; + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); /* mark error on parent + clear complete */ mapping_set_error(req->r_parent->i_mapping, result); @@ -1288,8 +1286,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, mapping_set_error(req->r_old_inode->i_mapping, result); pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<<bad>>" : path, result); - ceph_mdsc_free_path(path, pathlen); + path_info.vino.ino, IS_ERR(path) ? 
"<<bad>>" : path, result); + ceph_mdsc_free_path_info(&path_info); } out: iput(req->r_old_inode); @@ -1347,8 +1345,6 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) int err = -EROFS; int op; char *path; - int pathlen; - u64 pathbase; if (ceph_snap(dir) == CEPH_SNAPDIR) { /* rmdir .snap/foo is RMSNAP */ @@ -1367,14 +1363,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) if (!dn) { try_async = false; } else { - path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0); if (IS_ERR(path)) { try_async = false; err = 0; } else { err = ceph_mds_check_access(mdsc, path, MAY_WRITE); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dn); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c02f100f8552..978acd3d4b32 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struct file *file) int flags, fmode, wanted; struct dentry *dentry; char *path; - int pathlen; - u64 pathbase; bool do_sync = false; int mask = MAY_READ; @@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struct file *file) if (!dentry) { do_sync = true; } else { - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); if (IS_ERR(path)) { do_sync = true; err = 0; } else { err = ceph_mds_check_access(mdsc, path, mask); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dentry); /* For none EACCES cases will let the MDS do the mds auth check */ @@ -614,15 +613,13 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, mapping_set_error(req->r_parent->i_mapping, result); if (result) { - int pathlen = 0; - u64 base = 0; - char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen, - &base, 0); + struct ceph_path_info path_info = {0}; + char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0); pr_warn_client(cl, "async create failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<<bad>>" : path, result); - ceph_mdsc_free_path(path, pathlen); + path_info.vino.ino, IS_ERR(path) ? 
"<<bad>>" : path, result); + ceph_mdsc_free_path_info(&path_info); ceph_dir_clear_complete(req->r_parent); if (!d_unhashed(dentry)) @@ -791,8 +788,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, int mask; int err; char *path; - int pathlen; - u64 pathbase; doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n", dir, ceph_vinop(dir), dentry, dentry, @@ -814,7 +809,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if (!dn) { try_async = false; } else { - path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0); if (IS_ERR(path)) { try_async = false; err = 0; @@ -826,7 +822,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, mask |= MAY_WRITE; err = ceph_mds_check_access(mdsc, path, mask); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dn); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index fc543075b827..949f0badc944 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode *inode, void *data) return 0; } +/* + * Check if the parent inode matches the vino from directory reply info + */ +static inline bool ceph_vino_matches_parent(struct inode *parent, + struct ceph_vino vino) +{ + return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap; +} + +/* + * Validate that the directory inode referenced by @req->r_parent matches the + * inode number and snapshot id contained in the reply's directory record. If + * they do not match – which can theoretically happen if the parent dentry was + * moved between the time the request was issued and the reply arrived – fall + * back to looking up the correct inode in the inode cache. + * + * A reference is *always* returned. Callers that receive a different inode + * than the original @parent are responsible for dropping the extra reference + * once the reply has been processed. + */ +static struct inode *ceph_get_reply_dir(struct super_block *sb, + struct inode *parent, + struct ceph_mds_reply_info_parsed *rinfo) +{ + struct ceph_vino vino; + + if (unlikely(!rinfo->diri.in)) + return parent; /* nothing to compare against */ + + /* If we didn't have a cached parent inode to begin with, just bail out. */ + if (!parent) + return NULL; + + vino.ino = le64_to_cpu(rinfo->diri.in->ino); + vino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + if (likely(ceph_vino_matches_parent(parent, vino))) + return parent; /* matches – use the original reference */ + + /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. 
*/ + WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n", + ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap); + + return ceph_get_inode(sb, vino, NULL); +} + /** * ceph_new_inode - allocate a new inode in advance of an expected create * @dir: parent directory for new inode @@ -665,6 +711,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); #ifdef CONFIG_FS_ENCRYPTION + ci->i_crypt_info = NULL; ci->fscrypt_auth = NULL; ci->fscrypt_auth_len = 0; #endif @@ -1523,6 +1570,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) struct ceph_vino tvino, dvino; struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); struct ceph_client *cl = fsc->client; + struct inode *parent_dir = NULL; int err = 0; doutc(cl, "%p is_dentry %d is_target %d\n", req, @@ -1536,10 +1584,17 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) } if (rinfo->head->is_dentry) { - struct inode *dir = req->r_parent; - - if (dir) { - err = ceph_fill_inode(dir, NULL, &rinfo->diri, + /* + * r_parent may be stale, in cases when R_PARENT_LOCKED is not set, + * so we need to get the correct inode + */ + parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo); + if (unlikely(IS_ERR(parent_dir))) { + err = PTR_ERR(parent_dir); + goto done; + } + if (parent_dir) { + err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri, rinfo->dirfrag, session, -1, &req->r_caps_reservation); if (err < 0) @@ -1548,14 +1603,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) WARN_ON_ONCE(1); } - if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && + if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { bool is_nokey = false; struct qstr dname; struct dentry *dn, *parent; struct fscrypt_str oname = FSTR_INIT(NULL, 0); - struct ceph_fname fname = { .dir = dir, + struct ceph_fname fname = { .dir = parent_dir, .name = rinfo->dname, .ctext = rinfo->altname, .name_len = rinfo->dname_len, @@ -1564,10 +1619,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) BUG_ON(!rinfo->head->is_target); BUG_ON(req->r_dentry); - parent = d_find_any_alias(dir); + parent = d_find_any_alias(parent_dir); BUG_ON(!parent); - err = ceph_fname_alloc_buffer(dir, &oname); + err = ceph_fname_alloc_buffer(parent_dir, &oname); if (err < 0) { dput(parent); goto done; @@ -1576,7 +1631,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey); if (err < 0) { dput(parent); - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); goto done; } dname.name = oname.name; @@ -1595,7 +1650,7 @@ retry_lookup: dname.len, dname.name, dn); if (!dn) { dput(parent); - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); err = -ENOMEM; goto done; } @@ -1610,12 +1665,12 @@ retry_lookup: ceph_snap(d_inode(dn)) != tvino.snap)) { doutc(cl, " dn %p points to wrong inode %p\n", dn, d_inode(dn)); - ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(parent_dir); d_delete(dn); dput(dn); goto retry_lookup; } - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); req->r_dentry = dn; dput(parent); @@ -1794,6 +1849,9 @@ retry_lookup: &dvino, ptvino); } done: + /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode 
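+ * (req->r_parent's own reference is not dropped here; it remains owned by
+ * the request and goes away when the request itself is released)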
*/ + if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent)) + iput(parent_dir); doutc(cl, "done err=%d\n", err); return err; } @@ -2487,22 +2545,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, int truncate_retry = 20; /* The RMW will take around 50ms */ struct dentry *dentry; char *path; - int pathlen; - u64 pathbase; bool do_sync = false; dentry = d_find_alias(inode); if (!dentry) { do_sync = true; } else { - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); if (IS_ERR(path)) { do_sync = true; err = 0; } else { err = ceph_mds_check_access(mdsc, path, MAY_WRITE); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dentry); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0f497c39ff82..73da2648fa0f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2221,7 +2221,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg) int count; dput(dentry); d_prune_aliases(inode); - count = atomic_read(&inode->i_count); + count = icount_read(inode); if (count == 1) (*remaining)--; doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", @@ -2681,8 +2681,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) * ceph_mdsc_build_path - build a path string to a given dentry * @mdsc: mds client * @dentry: dentry to which path should be built - * @plen: returned length of string - * @pbase: returned base inode number + * @path_info: output path, length, base ino+snap, and freepath ownership flag * @for_wire: is this path going to be sent to the MDS? * * Build a string that represents the path to the dentry. 
This is mostly called @@ -2700,7 +2699,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) * foo/.snap/bar -> foo//bar */ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, - int *plen, u64 *pbase, int for_wire) + struct ceph_path_info *path_info, int for_wire) { struct ceph_client *cl = mdsc->fsc->client; struct dentry *cur; @@ -2810,16 +2809,28 @@ retry: return ERR_PTR(-ENAMETOOLONG); } - *pbase = base; - *plen = PATH_MAX - 1 - pos; + /* Initialize the output structure */ + memset(path_info, 0, sizeof(*path_info)); + + path_info->vino.ino = base; + path_info->pathlen = PATH_MAX - 1 - pos; + path_info->path = path + pos; + path_info->freepath = true; + + /* Set snap from dentry if available */ + if (d_inode(dentry)) + path_info->vino.snap = ceph_snap(d_inode(dentry)); + else + path_info->vino.snap = CEPH_NOSNAP; + doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry), - base, *plen, path + pos); + base, PATH_MAX - 1 - pos, path + pos); return path + pos; } static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry, - struct inode *dir, const char **ppath, int *ppathlen, - u64 *pino, bool *pfreepath, bool parent_locked) + struct inode *dir, struct ceph_path_info *path_info, + bool parent_locked) { char *path; @@ -2828,41 +2839,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry dir = d_inode_rcu(dentry->d_parent); if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP && !IS_ENCRYPTED(dir)) { - *pino = ceph_ino(dir); + path_info->vino.ino = ceph_ino(dir); + path_info->vino.snap = ceph_snap(dir); rcu_read_unlock(); - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; + path_info->path = dentry->d_name.name; + path_info->pathlen = dentry->d_name.len; + path_info->freepath = false; return 0; } rcu_read_unlock(); - path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1); if (IS_ERR(path)) return PTR_ERR(path); - *ppath = path; - *pfreepath = true; + /* + * ceph_mdsc_build_path already fills path_info, including snap handling. + */ return 0; } -static int build_inode_path(struct inode *inode, - const char **ppath, int *ppathlen, u64 *pino, - bool *pfreepath) +static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct dentry *dentry; char *path; if (ceph_snap(inode) == CEPH_NOSNAP) { - *pino = ceph_ino(inode); - *ppathlen = 0; + path_info->vino.ino = ceph_ino(inode); + path_info->vino.snap = ceph_snap(inode); + path_info->pathlen = 0; + path_info->freepath = false; return 0; } dentry = d_find_alias(inode); - path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1); dput(dentry); if (IS_ERR(path)) return PTR_ERR(path); - *ppath = path; - *pfreepath = true; + /* + * ceph_mdsc_build_path already fills path_info, including snap from dentry. + * Override with inode's snap since that's what this function is for. 
+ */ + path_info->vino.snap = ceph_snap(inode); return 0; } @@ -2872,26 +2889,32 @@ static int build_inode_path(struct inode *inode, */ static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode, struct dentry *rdentry, struct inode *rdiri, - const char *rpath, u64 rino, const char **ppath, - int *pathlen, u64 *ino, bool *freepath, + const char *rpath, u64 rino, + struct ceph_path_info *path_info, bool parent_locked) { struct ceph_client *cl = mdsc->fsc->client; int r = 0; + /* Initialize the output structure */ + memset(path_info, 0, sizeof(*path_info)); + if (rinode) { - r = build_inode_path(rinode, ppath, pathlen, ino, freepath); + r = build_inode_path(rinode, path_info); doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode), ceph_snap(rinode)); } else if (rdentry) { - r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino, - freepath, parent_locked); - doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); + r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked); + doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino, + path_info->pathlen, path_info->path); } else if (rpath || rino) { - *ino = rino; - *ppath = rpath; - *pathlen = rpath ? strlen(rpath) : 0; - doutc(cl, " path %.*s\n", *pathlen, rpath); + path_info->vino.ino = rino; + path_info->vino.snap = CEPH_NOSNAP; + path_info->path = rpath; + path_info->pathlen = rpath ? strlen(rpath) : 0; + path_info->freepath = false; + + doutc(cl, " path %.*s\n", path_info->pathlen, rpath); } return r; @@ -2968,11 +2991,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; struct ceph_mds_request_head_legacy *lhead; - const char *path1 = NULL; - const char *path2 = NULL; - u64 ino1 = 0, ino2 = 0; - int pathlen1 = 0, pathlen2 = 0; - bool freepath1 = false, freepath2 = false; + struct ceph_path_info path_info1 = {0}; + struct ceph_path_info path_info2 = {0}; struct dentry *old_dentry = NULL; int len; u16 releases; @@ -2982,25 +3002,49 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, u16 request_head_version = mds_supported_head_version(session); kuid_t caller_fsuid = req->r_cred->fsuid; kgid_t caller_fsgid = req->r_cred->fsgid; + bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, - req->r_parent, req->r_path1, req->r_ino1.ino, - &path1, &pathlen1, &ino1, &freepath1, - test_bit(CEPH_MDS_R_PARENT_LOCKED, - &req->r_req_flags)); + req->r_parent, req->r_path1, req->r_ino1.ino, + &path_info1, parent_locked); if (ret < 0) { msg = ERR_PTR(ret); goto out; } + /* + * When the parent directory's i_rwsem is *not* locked, req->r_parent may + * have become stale (e.g. after a concurrent rename) between the time the + * dentry was looked up and now. If we detect that the stored r_parent + * does not match the inode number we just encoded for the request, switch + * to the correct inode so that the MDS receives a valid parent reference. 
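+ * Two references must move with the switch: the inode reference itself
+ * (iput() the stale parent, keep the one returned by ceph_get_inode())
+ * and the CEPH_CAP_PIN taken in ceph_mdsc_submit_request(), which is
+ * re-taken on the new parent below.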
+ */ + if (!parent_locked && req->r_parent && path_info1.vino.ino && + ceph_ino(req->r_parent) != path_info1.vino.ino) { + struct inode *old_parent = req->r_parent; + struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL); + if (!IS_ERR(correct_dir)) { + WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n", + ceph_ino(old_parent), path_info1.vino.ino); + /* + * Transfer CEPH_CAP_PIN from the old parent to the new one. + * The pin was taken earlier in ceph_mdsc_submit_request(). + */ + ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN); + iput(old_parent); + req->r_parent = correct_dir; + ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + } + } + /* If r_old_dentry is set, then assume that its parent is locked */ if (req->r_old_dentry && !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED)) old_dentry = req->r_old_dentry; ret = set_request_path_attr(mdsc, NULL, old_dentry, - req->r_old_dentry_dir, - req->r_path2, req->r_ino2.ino, - &path2, &pathlen2, &ino2, &freepath2, true); + req->r_old_dentry_dir, + req->r_path2, req->r_ino2.ino, + &path_info2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; @@ -3031,7 +3075,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, /* filepaths */ len += 2 * (1 + sizeof(u32) + sizeof(u64)); - len += pathlen1 + pathlen2; + len += path_info1.pathlen + path_info2.pathlen; /* cap releases */ len += sizeof(struct ceph_mds_request_release) * @@ -3039,9 +3083,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, !!req->r_old_inode_drop + !!req->r_old_dentry_drop); if (req->r_dentry_drop) - len += pathlen1; + len += path_info1.pathlen; if (req->r_old_dentry_drop) - len += pathlen2; + len += path_info2.pathlen; /* MClientRequest tail */ @@ -3154,8 +3198,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, lhead->ino = cpu_to_le64(req->r_deleg_ino); lhead->args = req->r_args; - ceph_encode_filepath(&p, end, ino1, path1); - ceph_encode_filepath(&p, end, ino2, path2); + ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path); + ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path); /* make note of release offset, in case we need to replay */ req->r_request_release_offset = p - msg->front.iov_base; @@ -3218,11 +3262,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, msg->hdr.data_off = cpu_to_le16(0); out_free2: - if (freepath2) - ceph_mdsc_free_path((char *)path2, pathlen2); + ceph_mdsc_free_path_info(&path_info2); out_free1: - if (freepath1) - ceph_mdsc_free_path((char *)path1, pathlen1); + ceph_mdsc_free_path_info(&path_info1); out: return msg; out_err: @@ -4579,24 +4621,20 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) struct ceph_pagelist *pagelist = recon_state->pagelist; struct dentry *dentry; struct ceph_cap *cap; - char *path; - int pathlen = 0, err; - u64 pathbase; + struct ceph_path_info path_info = {0}; + int err; u64 snap_follows; dentry = d_find_primary(inode); if (dentry) { /* set pathbase to parent dir when msg_version >= 2 */ - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, recon_state->msg_version >= 2); dput(dentry); if (IS_ERR(path)) { err = PTR_ERR(path); goto out_err; } - } else { - path = NULL; - pathbase = 0; } spin_lock(&ci->i_ceph_lock); @@ -4629,7 +4667,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, 
void *arg) rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v2.pathbase = cpu_to_le64(pathbase); + rec.v2.pathbase = cpu_to_le64(path_info.vino.ino); rec.v2.flock_len = (__force __le32) ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { @@ -4644,7 +4682,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) ts = inode_get_atime(inode); ceph_encode_timespec64(&rec.v1.atime, &ts); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v1.pathbase = cpu_to_le64(pathbase); + rec.v1.pathbase = cpu_to_le64(path_info.vino.ino); } if (list_empty(&ci->i_cap_snaps)) { @@ -4706,7 +4744,7 @@ encode_again: sizeof(struct ceph_filelock); rec.v2.flock_len = cpu_to_le32(struct_len); - struct_len += sizeof(u32) + pathlen + sizeof(rec.v2); + struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2); if (struct_v >= 2) struct_len += sizeof(u64); /* snap_follows */ @@ -4730,7 +4768,7 @@ encode_again: ceph_pagelist_encode_8(pagelist, 1); ceph_pagelist_encode_32(pagelist, struct_len); } - ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); ceph_locks_to_pagelist(flocks, pagelist, num_fcntl_locks, num_flock_locks); @@ -4741,17 +4779,17 @@ out_freeflocks: } else { err = ceph_pagelist_reserve(pagelist, sizeof(u64) + sizeof(u32) + - pathlen + sizeof(rec.v1)); + path_info.pathlen + sizeof(rec.v1)); if (err) goto out_err; ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); - ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); } out_err: - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); if (!err) recon_state->nr_caps++; return err; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3e2a6fa7c19a..0428a5eaf28c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -617,14 +617,24 @@ extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); -static inline void ceph_mdsc_free_path(char *path, int len) +/* + * Structure to group path-related output parameters for build_*_path functions + */ +struct ceph_path_info { + const char *path; + int pathlen; + struct ceph_vino vino; + bool freepath; +}; + +static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info) { - if (!IS_ERR_OR_NULL(path)) - __putname(path - (PATH_MAX - 1 - len)); + if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path)) + __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen)); } extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, - struct dentry *dentry, int *plen, u64 *base, + struct dentry *dentry, struct ceph_path_info *path_info, int for_wire); extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c3eb651862c5..db6c2db68f96 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -862,7 +862,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0); if (!fsc->inode_wq) goto fail_client; - fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1); + fsc->cap_wq = alloc_workqueue("ceph-cap", 
WQ_PERCPU, 1); if (!fsc->cap_wq) goto fail_inode_wq; @@ -1042,7 +1042,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .free_inode = ceph_free_inode, .write_inode = ceph_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cf176aab0f82..25d8bacbcf44 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -463,6 +463,7 @@ struct ceph_inode_info { unsigned long i_work_mask; #ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; u32 fscrypt_auth_len; u32 fscrypt_file_len; u8 *fscrypt_auth; diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 740f18b60c9d..456c4a2efb53 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -36,7 +36,7 @@ static void configfs_free_inode(struct inode *inode) static const struct super_operations configfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .free_inode = configfs_free_inode, }; diff --git a/fs/coredump.c b/fs/coredump.c index 5dce257c67fc..0d9a5d07a75d 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -635,7 +635,7 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) /* * Usermode helpers are children of either - * system_unbound_wq or of kthreadd. So we know that + * system_dfl_wq or of kthreadd. So we know that * we're starting off with a clean file descriptor * table. So we should always be able to use * COREDUMP_PIDFD_NUMBER as our file descriptor value. @@ -1466,11 +1466,15 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write, ssize_t retval; char old_core_pattern[CORENAME_MAX_SIZE]; + if (write) + return proc_dostring(table, write, buffer, lenp, ppos); + retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE); error = proc_dostring(table, write, buffer, lenp, ppos); if (error) return error; + if (!check_coredump_socket()) { strscpy(core_pattern, old_core_pattern, retval + 1); return -EINVAL; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index b002e9b734f9..12daa85ed941 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -116,9 +116,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb, inode_nohighmem(inode); inode->i_data.a_ops = &cramfs_aops; break; - default: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: init_special_inode(inode, cramfs_inode->mode, old_decode_dev(cramfs_inode->size)); + break; + default: + printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", + inode->i_mode, inode->i_ino); + iget_failed(inode); + return ERR_PTR(-EIO); } inode->i_mode = cramfs_inode->mode; diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index b5dfb0aa405a..464b54610fd3 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -2,10 +2,9 @@ config FS_ENCRYPTION bool "FS Encryption (Per-file encryption)" select CRYPTO - select CRYPTO_HASH - select CRYPTO_HKDF select CRYPTO_SKCIPHER select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_SHA512 select KEYS help Enable encryption of files and directories. 
This @@ -32,8 +31,6 @@ config FS_ENCRYPTION_ALGS select CRYPTO_CBC select CRYPTO_CTS select CRYPTO_ECB - select CRYPTO_HMAC - select CRYPTO_SHA512 select CRYPTO_XTS config FS_ENCRYPTION_INLINE_CRYPT diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 486fcb2ecf13..5f5599020e94 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -113,7 +113,7 @@ out: int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; @@ -148,7 +148,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, */ for (i = 0; i < nr_pages; i++) { pages[i] = fscrypt_alloc_bounce_page(i == 0 ? GFP_NOFS : - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!pages[i]) break; } diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index b6ccab524fde..07f9cbfe3ea4 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -173,7 +173,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs, gfp_t gfp_flags) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; struct page *ciphertext_page; @@ -232,8 +232,9 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_ENCRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); @@ -255,7 +256,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + @@ -305,8 +306,9 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_DECRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index f9f6713e144f..8e4c213d418b 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -11,7 +11,6 @@ * This has not yet undergone a rigorous security audit. 
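 * (No-key name digests in this file use SHA-256 through the library
 * interface in <crypto/sha2.h>, which is why the <crypto/hash.h> shash
 * include below can be dropped.)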
*/ -#include <crypto/hash.h> #include <crypto/sha2.h> #include <crypto/skcipher.h> #include <linux/export.h> @@ -94,7 +93,7 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -138,7 +137,7 @@ static int fname_decrypt(const struct inode *inode, const struct fscrypt_str *iname, struct fscrypt_str *oname) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -274,8 +273,9 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { - return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, - orig_len, max_len, + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); + + return __fscrypt_fname_encrypted_size(&ci->ci_policy, orig_len, max_len, encrypted_len_ret); } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); @@ -543,7 +543,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name); */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { - const struct fscrypt_inode_info *ci = dir->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(dir); WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d8b485b9881c..4e8e82a9ccf9 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -11,10 +11,10 @@ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H +#include <crypto/sha2.h> #include <linux/fscrypt.h> #include <linux/minmax.h> #include <linux/siphash.h> -#include <crypto/hash.h> #include <linux/blk-crypto.h> #define CONST_STRLEN(str) (sizeof(str) - 1) @@ -249,8 +249,8 @@ struct fscrypt_prepared_key { * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is - * allocated and stored in ->i_crypt_info. Once created, it remains until the - * inode is evicted. + * allocated and a pointer to it is stored in the file's in-memory inode. Once + * created, it remains until the inode is evicted. */ struct fscrypt_inode_info { @@ -381,12 +381,8 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 *encrypted_len_ret); /* hkdf.c */ -struct fscrypt_hkdf { - struct crypto_shash *hmac_tfm; -}; - -int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, - unsigned int master_key_size); +void fscrypt_init_hkdf(struct hmac_sha512_key *hkdf, const u8 *master_key, + unsigned int master_key_size); /* * The list of contexts in which fscrypt uses HKDF. 
These values are used as @@ -405,11 +401,9 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, #define HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY \ 8 /* info=<empty> */ -int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen); - -void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); +void fscrypt_hkdf_expand(const struct hmac_sha512_key *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT @@ -517,7 +511,7 @@ struct fscrypt_master_key_secret { * ->is_hw_wrapped=false, or by the "software secret" that hardware * derived from this master key if ->is_hw_wrapped=true. */ - struct fscrypt_hkdf hkdf; + struct hmac_sha512_key hkdf; /* * True if this key is a hardware-wrapped key; false if this key is a @@ -696,7 +690,7 @@ struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); -int fscrypt_get_test_dummy_key_identifier( +void fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int fscrypt_add_test_dummy_key(struct super_block *sb, @@ -732,8 +726,8 @@ void fscrypt_destroy_prepared_key(struct super_block *sb, int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, const u8 *raw_key); -int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, - const struct fscrypt_master_key *mk); +void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, + const struct fscrypt_master_key *mk); void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index b1ef506cd341..706f56d0076e 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 /* + * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation + * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): + * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". + * * This is used to derive keys from the fscrypt master keys (or from the * "software secrets" which hardware derives from the fscrypt master keys, in * the case that the fscrypt master keys are hardware-wrapped keys). @@ -7,10 +11,6 @@ * Copyright 2019 Google LLC */ -#include <crypto/hash.h> -#include <crypto/hkdf.h> -#include <crypto/sha2.h> - #include "fscrypt_private.h" /* @@ -24,7 +24,6 @@ * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two. */ -#define HKDF_HMAC_ALG "hmac(sha512)" #define HKDF_HASHLEN SHA512_DIGEST_SIZE /* @@ -44,54 +43,24 @@ */ /* - * Compute HKDF-Extract using the given master key as the input keying material, - * and prepare an HMAC transform object keyed by the resulting pseudorandom key. - * - * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many - * times without having to recompute HKDF-Extract each time. + * Compute HKDF-Extract using 'master_key' as the input keying material, and + * prepare the resulting HMAC key in 'hkdf'. Afterwards, 'hkdf' can be used for + * HKDF-Expand many times without having to recompute HKDF-Extract each time. 
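+ * With the default all-zeroes salt this is simply
+ * PRK = HMAC-SHA512(salt, IKM = master_key), and 'hkdf' ends up holding
+ * the prepared HMAC key for PRK (RFC 5869, section 2.2).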
*/ -int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, - unsigned int master_key_size) +void fscrypt_init_hkdf(struct hmac_sha512_key *hkdf, const u8 *master_key, + unsigned int master_key_size) { - struct crypto_shash *hmac_tfm; static const u8 default_salt[HKDF_HASHLEN]; u8 prk[HKDF_HASHLEN]; - int err; - - hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, FSCRYPT_CRYPTOAPI_MASK); - if (IS_ERR(hmac_tfm)) { - fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld", - PTR_ERR(hmac_tfm)); - return PTR_ERR(hmac_tfm); - } - - if (WARN_ON_ONCE(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) { - err = -EINVAL; - goto err_free_tfm; - } - - err = hkdf_extract(hmac_tfm, master_key, master_key_size, - default_salt, HKDF_HASHLEN, prk); - if (err) - goto err_free_tfm; - - err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk)); - if (err) - goto err_free_tfm; - hkdf->hmac_tfm = hmac_tfm; - goto out; - -err_free_tfm: - crypto_free_shash(hmac_tfm); -out: + hmac_sha512_usingrawkey(default_salt, sizeof(default_salt), + master_key, master_key_size, prk); + hmac_sha512_preparekey(hkdf, prk, sizeof(prk)); memzero_explicit(prk, sizeof(prk)); - return err; } /* - * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which - * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen' + * HKDF-Expand (RFC 5869 section 2.3). Expand the HMAC key 'hkdf' into 'okmlen' * bytes of output keying material parameterized by the application-specific * 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context' * byte. This is thread-safe and may be called by multiple threads in parallel. @@ -100,30 +69,32 @@ out: * adds to its application-specific info strings to guarantee that it doesn't * accidentally repeat an info string when using HKDF for different purposes.) 
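+ * Concretely, with info' = "fscrypt\0" || context || info, the loop below
+ * computes T(1) = HMAC-SHA512(PRK, info' || 0x01),
+ * T(n) = HMAC-SHA512(PRK, T(n-1) || info' || n), and OKM is the first
+ * 'okmlen' bytes of T(1) || T(2) || ... (RFC 5869, section 2.3).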
*/ -int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen) -{ - SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); - u8 *full_info; - int err; - - full_info = kzalloc(infolen + 9, GFP_KERNEL); - if (!full_info) - return -ENOMEM; - desc->tfm = hkdf->hmac_tfm; - - memcpy(full_info, "fscrypt\0", 8); - full_info[8] = context; - memcpy(full_info + 9, info, infolen); - - err = hkdf_expand(hkdf->hmac_tfm, full_info, infolen + 9, - okm, okmlen); - kfree_sensitive(full_info); - return err; -} - -void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf) +void fscrypt_hkdf_expand(const struct hmac_sha512_key *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen) { - crypto_free_shash(hkdf->hmac_tfm); + struct hmac_sha512_ctx ctx; + u8 counter = 1; + u8 tmp[HKDF_HASHLEN]; + + WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN); + + for (unsigned int i = 0; i < okmlen; i += HKDF_HASHLEN) { + hmac_sha512_init(&ctx, hkdf); + if (i != 0) + hmac_sha512_update(&ctx, &okm[i - HKDF_HASHLEN], + HKDF_HASHLEN); + hmac_sha512_update(&ctx, "fscrypt\0", 8); + hmac_sha512_update(&ctx, &context, 1); + hmac_sha512_update(&ctx, info, infolen); + hmac_sha512_update(&ctx, &counter, 1); + if (okmlen - i < HKDF_HASHLEN) { + hmac_sha512_final(&ctx, tmp); + memcpy(&okm[i], tmp, okmlen - i); + memzero_explicit(tmp, sizeof(tmp)); + } else { + hmac_sha512_final(&ctx, &okm[i]); + } + counter++; + } } diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index e0b32ac841f7..b97de0d1430f 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -199,13 +199,13 @@ int fscrypt_prepare_setflags(struct inode *inode, err = fscrypt_require_key(inode); if (err) return err; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; mk = ci->ci_master_key; down_read(&mk->mk_sem); if (mk->mk_present) - err = fscrypt_derive_dirhash_key(ci, mk); + fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; up_read(&mk->mk_sem); diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index caaff809765b..5dee7c498bc8 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -263,7 +263,7 @@ int fscrypt_derive_sw_secret(struct super_block *sb, bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { - return inode->i_crypt_info->ci_inlinecrypt; + return fscrypt_get_inode_info_raw(inode)->ci_inlinecrypt; } EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto); @@ -307,7 +307,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, if (!fscrypt_inode_uses_inline_crypto(inode)) return; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); fscrypt_generate_dun(ci, first_lblk, dun); bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask); @@ -385,22 +385,24 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) { const struct bio_crypt_ctx *bc = bio->bi_crypt_context; + const struct fscrypt_inode_info *ci; u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; if (!!bc != fscrypt_inode_uses_inline_crypto(inode)) return false; if (!bc) return true; + ci = fscrypt_get_inode_info_raw(inode); /* * Comparing the key pointers is good enough, as all I/O for each key * uses the same pointer. I.e., there's currently no need to support * merging requests where the keys are the same but the pointers differ. 
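 * (A blk_crypto_key stays at one address for as long as it is in use, so
 * two bios encrypted with the same logical key also see the same pointer.)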
*/ - if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key) + if (bc->bc_key != ci->ci_enc_key.blk_key) return false; - fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun); + fscrypt_generate_dun(ci, next_lblk, next_dun); return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun); } EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio); @@ -502,7 +504,7 @@ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) if (nr_blocks <= 1) return nr_blocks; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (!(fscrypt_policy_flags(&ci->ci_policy) & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) return nr_blocks; diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 7557f6a88b8f..3adbd7167055 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -42,7 +42,6 @@ struct fscrypt_keyring { static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret) { - fscrypt_destroy_hkdf(&secret->hkdf); memzero_explicit(secret, sizeof(*secret)); } @@ -587,21 +586,17 @@ static int add_master_key(struct super_block *sb, keyid_kdf_ctx = HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY; } - err = fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size); + fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size); /* * Now that the KDF context is initialized, the raw KDF key is * no longer needed. */ memzero_explicit(kdf_key, kdf_key_size); - if (err) - return err; /* Calculate the key identifier */ - err = fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0, - key_spec->u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); - if (err) - return err; + fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0, + key_spec->u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); } return do_add_master_key(sb, secret, key_spec); } @@ -835,24 +830,17 @@ fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret) memcpy(secret->bytes, test_key, sizeof(test_key)); } -int fscrypt_get_test_dummy_key_identifier( +void fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { struct fscrypt_master_key_secret secret; - int err; fscrypt_get_test_dummy_secret(&secret); - - err = fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size); - if (err) - goto out; - err = fscrypt_hkdf_expand(&secret.hkdf, - HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY, - NULL, 0, key_identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); -out: + fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size); + fscrypt_hkdf_expand(&secret.hkdf, + HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY, NULL, 0, + key_identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); wipe_master_key_secret(&secret); - return err; } /** diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 4f3b9ecbfe4e..4bd3918f50e3 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -253,11 +253,8 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, sizeof(sb->s_uuid)); hkdf_infolen += sizeof(sb->s_uuid); } - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, - hkdf_context, hkdf_info, hkdf_infolen, - mode_key, mode->keysize); - if (err) - goto out_unlock; + fscrypt_hkdf_expand(&mk->mk_secret.hkdf, hkdf_context, hkdf_info, + hkdf_infolen, mode_key, mode->keysize); err = fscrypt_prepare_key(prep_key, mode_key, ci); memzero_explicit(mode_key, mode->keysize); if (err) @@ -278,36 +275,25 @@ out_unlock: * as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an * endianness swap in order to get the same results as on little endian CPUs. 
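 * In other words, the 16 bytes of HKDF output are read as two
 * little-endian u64s: key->key[0] from bytes 0..7 and key->key[1] from
 * bytes 8..15, which le64_to_cpus() arranges on either endianness.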
*/ -static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, - u8 context, const u8 *info, - unsigned int infolen, siphash_key_t *key) +static void fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, + u8 context, const u8 *info, + unsigned int infolen, siphash_key_t *key) { - int err; - - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, - (u8 *)key, sizeof(*key)); - if (err) - return err; - + fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, + (u8 *)key, sizeof(*key)); BUILD_BUG_ON(sizeof(*key) != 16); BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2); le64_to_cpus(&key->key[0]); le64_to_cpus(&key->key[1]); - return 0; } -int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, - const struct fscrypt_master_key *mk) +void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, + const struct fscrypt_master_key *mk) { - int err; - - err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, - ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, - &ci->ci_dirhash_key); - if (err) - return err; + fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, + ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, + &ci->ci_dirhash_key); ci->ci_dirhash_key_initialized = true; - return 0; } void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, @@ -338,17 +324,12 @@ static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci, if (mk->mk_ino_hash_key_initialized) goto unlock; - err = fscrypt_derive_siphash_key(mk, - HKDF_CONTEXT_INODE_HASH_KEY, - NULL, 0, &mk->mk_ino_hash_key); - if (err) - goto unlock; + fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_INODE_HASH_KEY, + NULL, 0, &mk->mk_ino_hash_key); /* pairs with smp_load_acquire() above */ smp_store_release(&mk->mk_ino_hash_key_initialized, true); unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); - if (err) - return err; } /* @@ -402,13 +383,10 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, } else { u8 derived_key[FSCRYPT_MAX_RAW_KEY_SIZE]; - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, - HKDF_CONTEXT_PER_FILE_ENC_KEY, - ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, - derived_key, ci->ci_mode->keysize); - if (err) - return err; - + fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_PER_FILE_ENC_KEY, + ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, + derived_key, ci->ci_mode->keysize); err = fscrypt_set_per_file_enc_key(ci, derived_key); memzero_explicit(derived_key, ci->ci_mode->keysize); } @@ -416,11 +394,8 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, return err; /* Derive a secret dirhash key for directories that need it. */ - if (need_dirhash_key) { - err = fscrypt_derive_dirhash_key(ci, mk); - if (err) - return err; - } + if (need_dirhash_key) + fscrypt_derive_dirhash_key(ci, mk); return 0; } @@ -642,15 +617,16 @@ fscrypt_setup_encryption_info(struct inode *inode, goto out; /* - * For existing inodes, multiple tasks may race to set ->i_crypt_info. - * So use cmpxchg_release(). This pairs with the smp_load_acquire() in - * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with - * a RELEASE barrier so that other tasks can ACQUIRE it. + * For existing inodes, multiple tasks may race to set the inode's + * fscrypt info pointer. So use cmpxchg_release(). This pairs with the + * smp_load_acquire() in fscrypt_get_inode_info(). I.e., publish the + * pointer with a RELEASE barrier so that other tasks can ACQUIRE it. 
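+ * The reader side (fscrypt_get_inode_info()) is essentially
+ * ci = smp_load_acquire(fscrypt_inode_info_addr(inode)); a non-NULL
+ * result guarantees the fully initialized crypt_info is visible.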
*/ - if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { + if (cmpxchg_release(fscrypt_inode_info_addr(inode), NULL, crypt_info) == + NULL) { /* - * We won the race and set ->i_crypt_info to our crypt_info. - * Now link it into the master key's inode list. + * We won the race and set the inode's fscrypt info to our + * crypt_info. Now link it into the master key's inode list. */ if (mk) { crypt_info->ci_master_key = mk; @@ -681,13 +657,13 @@ out: * %false unless the operation being performed is needed in * order for files (or directories) to be deleted. * - * Set up ->i_crypt_info, if it hasn't already been done. + * Set up the inode's encryption key, if it hasn't already been done. * - * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So + * Note: unless the key setup was already done, this isn't %GFP_NOFS-safe. So * generally this shouldn't be called from within a filesystem transaction. * - * Return: 0 if ->i_crypt_info was set or was already set, *or* if the - * encryption key is unavailable. (Use fscrypt_has_encryption_key() to + * Return: 0 if the key is now set up, *or* if it couldn't be set up because the + * needed master key is absent. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. */ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) @@ -741,9 +717,9 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * ->i_ino doesn't need to be set yet. * @encrypt_ret: (output) set to %true if the new inode will be encrypted * - * If the directory is encrypted, set up its ->i_crypt_info in preparation for + * If the directory is encrypted, set up its encryption key in preparation for * encrypting the name of the new file. Also, if the new inode will be - * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true. + * encrypted, set up its encryption key too and set *encrypt_ret=true. * * This isn't %GFP_NOFS-safe, and therefore it should be called before starting * any filesystem transaction to create the inode. For this reason, ->i_ino @@ -752,8 +728,8 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * This doesn't persist the new inode's encryption context. That still needs to * be done later by calling fscrypt_set_context(). * - * Return: 0 on success, -ENOKEY if the encryption key is missing, or another - * -errno code + * Return: 0 on success, -ENOKEY if a key needs to be set up for @dir or @inode + * but the needed master key is absent, or another -errno code */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) @@ -800,8 +776,16 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); */ void fscrypt_put_encryption_info(struct inode *inode) { - put_crypt_info(inode->i_crypt_info); - inode->i_crypt_info = NULL; + /* + * Ideally we'd start with a lightweight IS_ENCRYPTED() check here + * before proceeding to retrieve and check the pointer. However, during + * inode creation, the fscrypt_inode_info is set before S_ENCRYPTED. If + * an error occurs, it needs to be cleaned up regardless. 
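The cmpxchg_release()/smp_load_acquire() pairing described in the comment above is the standard pointer-publication pattern. A minimal userspace analogue using C11 atomics, with illustrative names (a sketch of the idiom, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct crypt_info { int key_ready; };

static _Atomic(struct crypt_info *) info_slot;

/* Publisher: fully initialize the object, then publish it with a
 * RELEASE compare-and-swap; losing the race means another task
 * already installed its own copy. */
static bool publish_info(struct crypt_info *ci)
{
        struct crypt_info *expected = NULL;

        ci->key_ready = 1;
        return atomic_compare_exchange_strong_explicit(&info_slot,
                        &expected, ci, memory_order_release,
                        memory_order_relaxed);
}

/* Reader: the ACQUIRE load pairs with the RELEASE above, so a
 * non-NULL pointer implies key_ready == 1 is visible too. */
static struct crypt_info *get_info(void)
{
        return atomic_load_explicit(&info_slot, memory_order_acquire);
}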
+ */ + struct fscrypt_inode_info **ci_addr = fscrypt_inode_info_addr(inode); + + put_crypt_info(*ci_addr); + *ci_addr = NULL; } EXPORT_SYMBOL(fscrypt_put_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6ad30ae07c06..bbb2f5ced988 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -727,7 +727,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) err = fscrypt_require_key(dir); if (err) return ERR_PTR(err); - return &dir->i_crypt_info->ci_policy; + return &fscrypt_get_inode_info_raw(dir)->ci_policy; } return fscrypt_get_dummy_policy(dir->i_sb); @@ -746,7 +746,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) */ int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); BUILD_BUG_ON(sizeof(union fscrypt_context) != FSCRYPT_SET_CONTEXT_MAX_SIZE); @@ -771,7 +771,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); */ int fscrypt_set_context(struct inode *inode, void *fs_data) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci; union fscrypt_context ctx; int ctxsize; @@ -783,6 +783,7 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) * This may be the first time the inode number is available, so do any * delayed key setup that requires the inode number. */ + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) fscrypt_hash_inode_number(ci, ci->ci_master_key); @@ -826,10 +827,8 @@ int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, policy->version = FSCRYPT_POLICY_V2; policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - err = fscrypt_get_test_dummy_key_identifier( + fscrypt_get_test_dummy_key_identifier( policy->v2.master_key_identifier); - if (err) - goto out; } else { err = -EINVAL; goto out; diff --git a/fs/dcache.c b/fs/dcache.c index 60046ae23d51..65cc11939654 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2509,8 +2509,8 @@ static inline unsigned start_dir_add(struct inode *dir) { preempt_disable_nested(); for (;;) { - unsigned n = dir->i_dir_seq; - if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) + unsigned n = READ_ONCE(dir->i_dir_seq); + if (!(n & 1) && try_cmpxchg(&dir->i_dir_seq, &n, n + 1)) return n; cpu_relax(); } @@ -2922,6 +2922,7 @@ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) write_sequnlock(&rename_lock); } +EXPORT_SYMBOL(d_exchange); /** * d_ancestor - search for an ancestor diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index c12d649df6a5..661a99a7dfbe 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -362,7 +362,8 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) } EXPORT_SYMBOL_GPL(debugfs_lookup); -static struct dentry *start_creating(const char *name, struct dentry *parent) +static struct dentry *debugfs_start_creating(const char *name, + struct dentry *parent) { struct dentry *dentry; int error; @@ -428,7 +429,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); - dentry = start_creating(name, parent); + dentry = debugfs_start_creating(name, parent); if (IS_ERR(dentry)) return dentry; @@ -577,7 +578,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size); */ 
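A side note on the fs/dcache.c hunk above: try_cmpxchg() writes the value it actually observed back into its expected argument, which is why the explicit "== n" comparison could be dropped. A rough userspace rendering of the updated start_dir_add() loop (illustrative only):

#include <stdatomic.h>

static _Atomic unsigned int dir_seq;

/* Re-read the sequence each pass and only attempt the increment while
 * it is even (no other add in progress). An odd value means a
 * concurrent add holds the sequence. */
static unsigned int start_dir_add_sketch(void)
{
        for (;;) {
                unsigned int n = atomic_load_explicit(&dir_seq,
                                        memory_order_relaxed);

                if (!(n & 1) &&
                    atomic_compare_exchange_strong_explicit(&dir_seq,
                                &n, n + 1, memory_order_acquire,
                                memory_order_relaxed))
                        return n;
                /* odd, or lost the race: retry */
        }
}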
struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { - struct dentry *dentry = start_creating(name, parent); + struct dentry *dentry = debugfs_start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) @@ -624,7 +625,7 @@ struct dentry *debugfs_create_automount(const char *name, debugfs_automount_t f, void *data) { - struct dentry *dentry = start_creating(name, parent); + struct dentry *dentry = debugfs_start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) @@ -687,7 +688,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, if (!link) return ERR_PTR(-ENOMEM); - dentry = start_creating(name, parent); + dentry = debugfs_start_creating(name, parent); if (IS_ERR(dentry)) { kfree(link); return dentry; diff --git a/fs/dlm/config.c b/fs/dlm/config.c index a23fd524a6ee..a0d75b5c83c6 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -26,6 +26,7 @@ /* * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid (refers to <node>) * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight + * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/release_recover * /config/dlm/<cluster>/comms/<comm>/nodeid (refers to <comm>) * /config/dlm/<cluster>/comms/<comm>/local * /config/dlm/<cluster>/comms/<comm>/addr (write only) @@ -267,6 +268,7 @@ enum { enum { NODE_ATTR_NODEID = 0, NODE_ATTR_WEIGHT, + NODE_ATTR_RELEASE_RECOVER, }; struct dlm_clusters { @@ -280,6 +282,8 @@ struct dlm_spaces { struct dlm_space { struct config_group group; struct list_head members; + struct list_head members_gone; + int members_gone_count; struct mutex members_lock; int members_count; struct dlm_nodes *nds; @@ -310,6 +314,14 @@ struct dlm_node { int weight; int new; int comm_seq; /* copy of cm->seq when nd->nodeid is set */ + unsigned int release_recover; +}; + +struct dlm_member_gone { + int nodeid; + unsigned int release_recover; + + struct list_head list; /* space->members_gone */ }; static struct configfs_group_operations clusters_ops = { @@ -480,6 +492,7 @@ static struct config_group *make_space(struct config_group *g, const char *name) configfs_add_default_group(&nds->ns_group, &sp->group); INIT_LIST_HEAD(&sp->members); + INIT_LIST_HEAD(&sp->members_gone); mutex_init(&sp->members_lock); sp->members_count = 0; sp->nds = nds; @@ -587,10 +600,20 @@ static void drop_node(struct config_group *g, struct config_item *i) { struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); struct dlm_node *nd = config_item_to_node(i); + struct dlm_member_gone *mb_gone; + + mb_gone = kzalloc(sizeof(*mb_gone), GFP_KERNEL); + if (!mb_gone) + return; mutex_lock(&sp->members_lock); list_del(&nd->list); sp->members_count--; + + mb_gone->nodeid = nd->nodeid; + mb_gone->release_recover = nd->release_recover; + list_add(&mb_gone->list, &sp->members_gone); + sp->members_gone_count++; mutex_unlock(&sp->members_lock); config_item_put(i); @@ -815,12 +838,34 @@ static ssize_t node_weight_store(struct config_item *item, const char *buf, return len; } +static ssize_t node_release_recover_show(struct config_item *item, char *buf) +{ + struct dlm_node *n = config_item_to_node(item); + + return sprintf(buf, "%u\n", n->release_recover); +} + +static ssize_t node_release_recover_store(struct config_item *item, + const char *buf, size_t len) +{ + struct dlm_node *n = config_item_to_node(item); + int rc; + + rc = kstrtouint(buf, 0, &n->release_recover); + if (rc) + return rc; + + return len; +} + CONFIGFS_ATTR(node_, nodeid); CONFIGFS_ATTR(node_, weight); 
+CONFIGFS_ATTR(node_, release_recover); static struct configfs_attribute *node_attrs[] = { [NODE_ATTR_NODEID] = &node_attr_nodeid, [NODE_ATTR_WEIGHT] = &node_attr_weight, + [NODE_ATTR_RELEASE_RECOVER] = &node_attr_release_recover, NULL, }; @@ -882,9 +927,10 @@ static void put_comm(struct dlm_comm *cm) int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out) { + struct dlm_member_gone *mb_gone, *mb_safe; + struct dlm_config_node *nodes, *node; struct dlm_space *sp; struct dlm_node *nd; - struct dlm_config_node *nodes, *node; int rv, count; sp = get_space(lsname); @@ -898,7 +944,7 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, goto out; } - count = sp->members_count; + count = sp->members_count + sp->members_gone_count; nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS); if (!nodes) { @@ -917,6 +963,20 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, nd->new = 0; } + /* we delay the remove on nodes until here as configfs does + * not support additional attributes for rmdir(). + */ + list_for_each_entry_safe(mb_gone, mb_safe, &sp->members_gone, list) { + node->nodeid = mb_gone->nodeid; + node->release_recover = mb_gone->release_recover; + node->gone = true; + node++; + + list_del(&mb_gone->list); + sp->members_gone_count--; + kfree(mb_gone); + } + *count_out = count; *nodes_out = nodes; rv = 0; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 13a3d0b26194..4ebd45f75276 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -17,8 +17,10 @@ struct dlm_config_node { int nodeid; int weight; + bool gone; int new; uint32_t comm_seq; + unsigned int release_recover; }; extern const struct rhashtable_params dlm_rhash_rsb_params; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 6dd3a524cd35..be938fdf17d9 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -5576,7 +5576,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, if (rl->rl_status == DLM_LKSTS_CONVERT && middle_conversion(lkb)) { /* We may need to adjust grmode depending on other granted locks. */ - log_limit(ls, "%s %x middle convert gr %d rq %d remote %d %x", + log_rinfo(ls, "%s %x middle convert gr %d rq %d remote %d %x", __func__, lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode, lkb->lkb_nodeid, lkb->lkb_remid); rsb_set_flag(r, RSB_RECOVER_CONVERT); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 1929327ffbe1..ddaa76558706 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -186,12 +186,17 @@ static struct kobj_type dlm_ktype = { static struct kset *dlm_kset; -static int do_uevent(struct dlm_ls *ls, int in) +static int do_uevent(struct dlm_ls *ls, int in, unsigned int release_recover) { - if (in) + char message[512] = {}; + char *envp[] = { message, NULL }; + + if (in) { kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE); - else - kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); + } else { + snprintf(message, 511, "RELEASE_RECOVER=%u", release_recover); + kobject_uevent_env(&ls->ls_kobj, KOBJ_OFFLINE, envp); + } log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving"); @@ -575,7 +580,7 @@ static int new_lockspace(const char *name, const char *cluster, current lockspace members are (via configfs) and then tells the lockspace to start running (via sysfs) in dlm_ls_start().
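For orientation, the new per-node attribute is driven from userspace by writing to configfs before the node directory is removed. The path below is hypothetical; real cluster managers such as dlm_controld derive it from their own cluster, space and node configuration:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical node directory under the dlm configfs hierarchy. */
#define NODE_DIR "/sys/kernel/config/dlm/cluster/spaces/ls1/nodes/2"

int main(void)
{
        int fd = open(NODE_DIR "/release_recover", O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Mark this member so its later removal is treated as needing
         * recovery rather than a clean leave. */
        if (write(fd, "1", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}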
*/ - error = do_uevent(ls, 1); + error = do_uevent(ls, 1, 0); if (error < 0) goto out_recoverd; @@ -592,7 +597,7 @@ static int new_lockspace(const char *name, const char *cluster, return 0; out_members: - do_uevent(ls, 0); + do_uevent(ls, 0, 0); dlm_clear_members(ls); kfree(ls->ls_node_array); out_recoverd: @@ -671,19 +676,20 @@ int dlm_new_user_lockspace(const char *name, const char *cluster, This is because there may be LKBs queued as ASTs that have been unlinked from their RSBs and are pending deletion once the AST has been delivered */ -static int lockspace_busy(struct dlm_ls *ls, int force) +static int lockspace_busy(struct dlm_ls *ls, unsigned int release_option) { struct dlm_lkb *lkb; unsigned long id; int rv = 0; read_lock_bh(&ls->ls_lkbxa_lock); - if (force == 0) { + if (release_option == DLM_RELEASE_NO_LOCKS) { xa_for_each(&ls->ls_lkbxa, id, lkb) { rv = 1; break; } - } else if (force == 1) { + } else if (release_option == DLM_RELEASE_UNUSED) { + /* TODO: handle this UNUSED option as NO_LOCKS in later patch */ xa_for_each(&ls->ls_lkbxa, id, lkb) { if (lkb->lkb_nodeid == 0 && lkb->lkb_grmode != DLM_LOCK_IV) { @@ -698,11 +704,11 @@ static int lockspace_busy(struct dlm_ls *ls, int force) return rv; } -static int release_lockspace(struct dlm_ls *ls, int force) +static int release_lockspace(struct dlm_ls *ls, unsigned int release_option) { int busy, rv; - busy = lockspace_busy(ls, force); + busy = lockspace_busy(ls, release_option); spin_lock_bh(&lslist_lock); if (ls->ls_create_count == 1) { @@ -730,8 +736,9 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_device_deregister(ls); - if (force < 3 && dlm_user_daemon_available()) - do_uevent(ls, 0); + if (release_option != DLM_RELEASE_NO_EVENT && + dlm_user_daemon_available()) + do_uevent(ls, 0, (release_option == DLM_RELEASE_RECOVER)); dlm_recoverd_stop(ls); @@ -782,25 +789,24 @@ static int release_lockspace(struct dlm_ls *ls, int force) * lockspace must continue to function as usual, participating in recoveries, * until this returns. * - * Force has 4 possible values: - * 0 - don't destroy lockspace if it has any LKBs - * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs - * 2 - destroy lockspace regardless of LKBs - * 3 - destroy lockspace as part of a forced shutdown + * See DLM_RELEASE defines for release_option values and their meaning. 
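The DLM_RELEASE defines themselves live in the dlm header, which this diff does not include; the sketch below is an assumed reconstruction from the old 0..3 "force" semantics and the call sites in this patch, not a quote of the real header:

/* ASSUMED mapping, mirroring the old numeric force levels plus the
 * new recover variant; verify against the actual dlm header. */
#define DLM_RELEASE_NO_LOCKS    0       /* old force=0: fail if any LKBs */
#define DLM_RELEASE_UNUSED      1       /* old force=1: remote LKBs only */
#define DLM_RELEASE_NORMAL      2       /* old force=2: drop regardless */
#define DLM_RELEASE_NO_EVENT    3       /* old force=3: forced shutdown */
#define DLM_RELEASE_RECOVER     4       /* leave and request recovery */
#define __DLM_RELEASE_MAX       DLM_RELEASE_RECOVER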
*/ -int dlm_release_lockspace(void *lockspace, int force) +int dlm_release_lockspace(void *lockspace, unsigned int release_option) { struct dlm_ls *ls; int error; + if (release_option > __DLM_RELEASE_MAX) + return -EINVAL; + ls = dlm_find_lockspace_local(lockspace); if (!ls) return -EINVAL; dlm_put_lockspace(ls); mutex_lock(&ls_lock); - error = release_lockspace(ls, force); + error = release_lockspace(ls, release_option); if (!error) ls_count--; if (!ls_count) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index e4373bce1bc2..9a0b6c2b6b01 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1703,7 +1703,7 @@ static int work_start(void) return -ENOMEM; } - process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH, 0); + process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH | WQ_PERCPU, 0); if (!process_workqueue) { log_print("can't start dlm_process"); destroy_workqueue(io_workqueue); diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 4887c8a05318..a44d16da7187 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -52,7 +52,7 @@ static int __init init_dlm(void) if (error) goto out_user; - dlm_wq = alloc_workqueue("dlm_wq", 0, 0); + dlm_wq = alloc_workqueue("dlm_wq", WQ_PERCPU, 0); if (!dlm_wq) { error = -ENOMEM; goto out_plock; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index b0864c93230f..c0f557a80a75 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -478,7 +478,8 @@ static void dlm_lsop_recover_prep(struct dlm_ls *ls) ls->ls_ops->recover_prep(ls->ls_ops_arg); } -static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) +static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb, + unsigned int release_recover) { struct dlm_slot slot; uint32_t seq; @@ -495,7 +496,7 @@ static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) error = dlm_comm_seq(memb->nodeid, &seq, false); - if (!error && seq == memb->comm_seq) + if (!release_recover && !error && seq == memb->comm_seq) return; slot.nodeid = memb->nodeid; @@ -552,6 +553,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) struct dlm_member *memb, *safe; struct dlm_config_node *node; int i, error, neg = 0, low = -1; + unsigned int release_recover; /* previously removed members that we've not finished removing need to * count as a negative change so the "neg" recovery steps will happen @@ -569,11 +571,21 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { node = find_config_node(rv, memb->nodeid); - if (node && !node->new) + if (!node) { + log_error(ls, "remove member %d invalid", + memb->nodeid); + return -EFAULT; + } + + if (!node->new && !node->gone) continue; - if (!node) { - log_rinfo(ls, "remove member %d", memb->nodeid); + release_recover = 0; + + if (node->gone) { + release_recover = node->release_recover; + log_rinfo(ls, "remove member %d%s", memb->nodeid, + release_recover ? 
" (release_recover)" : ""); } else { /* removed and re-added */ log_rinfo(ls, "remove member %d comm_seq %u %u", @@ -584,13 +596,16 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) list_move(&memb->list, &ls->ls_nodes_gone); remove_remote_member(memb->nodeid); ls->ls_num_nodes--; - dlm_lsop_recover_slot(ls, memb); + dlm_lsop_recover_slot(ls, memb, release_recover); } /* add new members to ls_nodes */ for (i = 0; i < rv->nodes_count; i++) { node = &rv->nodes[i]; + if (node->gone) + continue; + if (dlm_is_member(ls, node->nodeid)) continue; error = dlm_add_member(ls, node); diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index be4240f09abd..3ac020fb8139 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -842,7 +842,7 @@ static void recover_conversion(struct dlm_rsb *r) */ if (((lkb->lkb_grmode == DLM_LOCK_PR) && (other_grmode == DLM_LOCK_CW)) || ((lkb->lkb_grmode == DLM_LOCK_CW) && (other_grmode == DLM_LOCK_PR))) { - log_limit(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL", + log_rinfo(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL", __func__, lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode, lkb->lkb_nodeid, lkb->lkb_remid, other_lkid, other_grmode); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 5cb3896be826..51daf4acbe31 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -425,7 +425,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params) dlm_put_lockspace(ls); if (error) - dlm_release_lockspace(lockspace, 0); + dlm_release_lockspace(lockspace, DLM_RELEASE_NO_LOCKS); else error = ls->ls_device.minor; @@ -436,7 +436,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params) { dlm_lockspace_t *lockspace; struct dlm_ls *ls; - int error, force = 0; + int error, force = DLM_RELEASE_NO_LOCKS; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -446,7 +446,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params) return -ENOENT; if (params->flags & DLM_USER_LSFLG_FORCEFREE) - force = 2; + force = DLM_RELEASE_NORMAL; lockspace = ls; dlm_put_lockspace(ls); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 72fbe1316ab8..abd954c6a14e 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -634,10 +634,9 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out_lock; } - rd.old_mnt_idmap = &nop_mnt_idmap; + rd.mnt_idmap = &nop_mnt_idmap; rd.old_parent = lower_old_dir_dentry; rd.old_dentry = lower_old_dentry; - rd.new_mnt_idmap = &nop_mnt_idmap; rd.new_parent = lower_new_dir_dentry; rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 4bb4002e3cdf..1f4d8ce56667 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -127,7 +127,7 @@ static int efivarfs_unfreeze_fs(struct super_block *sb); static const struct super_operations efivarfs_ops = { .statfs = efivarfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .alloc_inode = efivarfs_alloc_inode, .free_inode = efivarfs_free_inode, .show_options = efivarfs_show_options, diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 3b1ba571c728..8ca29962a3dd 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -475,6 +475,10 @@ static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence) const struct file_operations erofs_file_fops = { .llseek = erofs_file_llseek, .read_iter = erofs_file_read_iter, + .unlocked_ioctl = erofs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = 
erofs_compat_ioctl, +#endif .mmap_prepare = erofs_file_mmap_prepare, .get_unmapped_area = thp_get_unmapped_area, .splice_read = filemap_splice_read, diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index debf469ad6bd..32b4f5aa60c9 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -123,4 +123,8 @@ const struct file_operations erofs_dir_fops = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = erofs_readdir, + .unlocked_ioctl = erofs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = erofs_compat_ioctl, +#endif }; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 377ee12b8b96..3d5738f80072 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -12,10 +12,12 @@ /* to allow for x86 boot sectors and other oddities. */ #define EROFS_SUPER_OFFSET 1024 -#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 -#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 -#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 +#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 +#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 +#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 #define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008 +#define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010 + /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 9a2f59721522..cb780c095d28 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -5,6 +5,7 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" +#include <linux/compat.h> #include <trace/events/erofs.h> static int erofs_fill_symlink(struct inode *inode, void *kaddr, @@ -213,10 +214,7 @@ static int erofs_fill_inode(struct inode *inode) switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &erofs_generic_iops; - if (erofs_inode_is_data_compressed(vi->datalayout)) - inode->i_fop = &generic_ro_fops; - else - inode->i_fop = &erofs_file_fops; + inode->i_fop = &erofs_file_fops; break; case S_IFDIR: inode->i_op = &erofs_dir_iops; @@ -341,6 +339,40 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, return 0; } +static int erofs_ioctl_get_volume_label(struct inode *inode, void __user *arg) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + int ret; + + if (!sbi->volume_name) + ret = clear_user(arg, 1); + else + ret = copy_to_user(arg, sbi->volume_name, + strlen(sbi->volume_name)); + return ret ? 
-EFAULT : 0; +} + +long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + void __user *argp = (void __user *)arg; + + switch (cmd) { + case FS_IOC_GETFSLABEL: + return erofs_ioctl_get_volume_label(inode, argp); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long erofs_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + return erofs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + const struct inode_operations erofs_generic_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 4ccc5f0ee8df..f7f622836198 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -153,6 +153,7 @@ struct erofs_sb_info { /* used for statfs, f_files - f_favail */ u64 inos; + char *volume_name; u32 feature_compat; u32 feature_incompat; @@ -234,6 +235,7 @@ EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX) +EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX) static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid) { @@ -535,6 +537,10 @@ static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { static inline void erofs_fscache_submit_bio(struct bio *bio) {} #endif +long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long erofs_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg); + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 1b529ace4db0..f3f8d8c066e4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -343,6 +343,13 @@ static int erofs_read_superblock(struct super_block *sb) sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec); super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); + if (dsb->volume_name[0]) { + sbi->volume_name = kstrndup(dsb->volume_name, + sizeof(dsb->volume_name), GFP_KERNEL); + if (!sbi->volume_name) + return -ENOMEM; + } + /* parse on-disk compression configurations */ ret = z_erofs_parse_cfgs(sb, dsb); if (ret < 0) @@ -822,6 +829,7 @@ static void erofs_sb_free(struct erofs_sb_info *sbi) kfree(sbi->domain_id); if (sbi->dif0.file) fput(sbi->dif0.file); + kfree(sbi->volume_name); kfree(sbi); } @@ -1018,10 +1026,22 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) return 0; } +static void erofs_evict_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + dax_break_layout_final(inode); +#endif + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); +} + const struct super_operations erofs_sops = { .put_super = erofs_put_super, .alloc_inode = erofs_alloc_inode, .free_inode = erofs_free_inode, + .evict_inode = erofs_evict_inode, .statfs = erofs_statfs, .show_options = erofs_show_options, }; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index eaa9efd766ee..396536d9a862 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -482,6 +482,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb) erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2; struct erofs_xattr_prefix_item *pfs; int ret = 0, i, len; + bool plain = erofs_sb_has_plain_xattr_pfx(sbi); if (!sbi->xattr_prefix_count) return 0; @@ -490,9 +491,15 @@ int 
erofs_xattr_prefixes_init(struct super_block *sb) if (!pfs) return -ENOMEM; - if (sbi->packed_inode) - buf.mapping = sbi->packed_inode->i_mapping; - else + if (!plain) { + if (erofs_sb_has_metabox(sbi)) + (void)erofs_init_metabuf(&buf, sb, true); + else if (sbi->packed_inode) + buf.mapping = sbi->packed_inode->i_mapping; + else + plain = true; + } + if (plain) (void)erofs_init_metabuf(&buf, sb, false); for (i = 0; i < sbi->xattr_prefix_count; i++) { diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 2d73297003d2..bc80cfe482f7 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -823,9 +823,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) } rcu_read_unlock(); } - } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { - DBG_BUGON(1); - return -EFSCORRUPTED; } if (pcl) { @@ -1835,7 +1832,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); - if (err) + if (err || !(map->m_flags & EROFS_MAP_ENCODED)) return; /* expand ra for the trailing edge if readahead */ @@ -1847,7 +1844,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, end = round_up(end, PAGE_SIZE); } else { end = round_up(map->m_la, PAGE_SIZE); - if (!map->m_llen) + if (!(map->m_flags & EROFS_MAP_ENCODED) || !map->m_llen) return; } diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index a93efd95c555..e5581dbeb4c2 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -394,10 +394,10 @@ static int z_erofs_map_blocks_fo(struct inode *inode, .map = map, .in_mbox = erofs_inode_in_metabox(inode), }; - int err = 0; - unsigned int endoff, afmt; + unsigned int endoff; unsigned long initial_lcn; unsigned long long ofs, end; + int err; ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la; if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) && @@ -462,8 +462,8 @@ static int z_erofs_map_blocks_fo(struct inode *inode, map->m_pa = vi->z_fragmentoff; map->m_plen = vi->z_idata_size; if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { - erofs_err(sb, "invalid tail-packing pclustersize %llu", - map->m_plen); + erofs_err(sb, "ztailpacking inline data across blocks @ nid %llu", + vi->nid); err = -EFSCORRUPTED; goto unmap_out; } @@ -482,20 +482,15 @@ static int z_erofs_map_blocks_fo(struct inode *inode, err = -EFSCORRUPTED; goto unmap_out; } - afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ? - Z_EROFS_COMPRESSION_INTERLACED : - Z_EROFS_COMPRESSION_SHIFTED; + if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) + map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) { + map->m_algorithmformat = vi->z_algorithmtype[1]; } else { - afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? 
- vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; - if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { - erofs_err(sb, "inconsistent algorithmtype %u for nid %llu", - afmt, vi->nid); - err = -EFSCORRUPTED; - goto unmap_out; - } + map->m_algorithmformat = vi->z_algorithmtype[0]; } - map->m_algorithmformat = afmt; if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && @@ -626,9 +621,9 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; - int err, headnr; - erofs_off_t pos; struct z_erofs_map_header *h; + erofs_off_t pos; + int err = 0; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { /* @@ -642,7 +637,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) return -ERESTARTSYS; - err = 0; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; @@ -679,15 +673,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) vi->z_idata_size = le16_to_cpu(h->h_idata_size); - headnr = 0; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || - vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", - headnr + 1, vi->z_algorithmtype[headnr], vi->nid); - err = -EOPNOTSUPP; - goto out_unlock; - } - if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { @@ -726,6 +711,30 @@ out_unlock: return err; } +static int z_erofs_map_sanity_check(struct inode *inode, + struct erofs_map_blocks *map) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + + if (!(map->m_flags & EROFS_MAP_ENCODED)) + return 0; + if (unlikely(map->m_algorithmformat >= Z_EROFS_COMPRESSION_RUNTIME_MAX)) { + erofs_err(inode->i_sb, "unknown algorithm %d @ pos %llu for nid %llu, please upgrade kernel", + map->m_algorithmformat, map->m_la, EROFS_I(inode)->nid); + return -EOPNOTSUPP; + } + if (unlikely(map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX && + !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))) { + erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + map->m_algorithmformat, EROFS_I(inode)->nid); + return -EFSCORRUPTED; + } + if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || + map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) + return -EOPNOTSUPP; + return 0; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { @@ -746,10 +755,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, else err = z_erofs_map_blocks_fo(inode, map, flags); } - if (!err && (map->m_flags & EROFS_MAP_ENCODED) && - unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || - map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) - err = -EOPNOTSUPP; + if (!err) + err = z_erofs_map_sanity_check(inode, map); if (err) map->m_llen = 0; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b22d6f819f78..ee7c4b683ec3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -46,10 +46,10 @@ * * 1) epnested_mutex (mutex) * 2) ep->mtx (mutex) - * 3) ep->lock (rwlock) + * 3) ep->lock (spinlock) * * The acquire order is the one listed above, from 1 to 3. 
- * We need a rwlock (ep->lock) because we manipulate objects + * We need a spinlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need @@ -195,7 +195,7 @@ struct eventpoll { struct list_head rdllist; /* Lock which protects rdllist and ovflist */ - rwlock_t lock; + spinlock_t lock; /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; @@ -741,10 +741,10 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) * in a lockless way. */ lockdep_assert_irqs_enabled(); - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, @@ -752,7 +752,7 @@ static void ep_done_scan(struct eventpoll *ep, { struct epitem *epi, *nepi; - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. @@ -793,7 +793,7 @@ static void ep_done_scan(struct eventpoll *ep, wake_up(&ep->wq); } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } static void ep_get(struct eventpoll *ep) @@ -868,10 +868,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) rb_erase_cached(&epi->rbn, &ep->rbr); - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* @@ -1152,7 +1152,7 @@ static int ep_alloc(struct eventpoll **pep) return -ENOMEM; mutex_init(&ep->mtx); - rwlock_init(&ep->lock); + spin_lock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); @@ -1240,99 +1240,9 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, #endif /* CONFIG_KCMP */ /* - * Adds a new entry to the tail of the list in a lockless way, i.e. - * multiple CPUs are allowed to call this function concurrently. - * - * Beware: it is necessary to prevent any other modifications of the - * existing list until all changes are completed, in other words - * concurrent list_add_tail_lockless() calls should be protected - * with a read lock, where write lock acts as a barrier which - * makes sure all list_add_tail_lockless() calls are fully - * completed. - * - * Also an element can be locklessly added to the list only in one - * direction i.e. either to the tail or to the head, otherwise - * concurrent access will corrupt the list. - * - * Return: %false if element has been already added to the list, %true - * otherwise. - */ -static inline bool list_add_tail_lockless(struct list_head *new, - struct list_head *head) -{ - struct list_head *prev; - - /* - * This is simple 'new->next = head' operation, but cmpxchg() - * is used in order to detect that same element has been just - * added to the list from another CPU: the winner observes - * new->next == new. - */ - if (!try_cmpxchg(&new->next, &new, head)) - return false; - - /* - * Initially ->next of a new element must be updated with the head - * (we are inserting to the tail) and only then pointers are atomically - * exchanged. 
XCHG guarantees memory ordering, thus ->next should be - * updated before pointers are actually swapped and pointers are - * swapped before prev->next is updated. - */ - - prev = xchg(&head->prev, new); - - /* - * It is safe to modify prev->next and new->prev, because a new element - * is added only to the tail and new->next is updated before XCHG. - */ - - prev->next = new; - new->prev = prev; - - return true; -} - -/* - * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, - * i.e. multiple CPUs are allowed to call this function concurrently. - * - * Return: %false if epi element has been already chained, %true otherwise. - */ -static inline bool chain_epi_lockless(struct epitem *epi) -{ - struct eventpoll *ep = epi->ep; - - /* Fast preliminary check */ - if (epi->next != EP_UNACTIVE_PTR) - return false; - - /* Check that the same epi has not been just chained from another CPU */ - if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) - return false; - - /* Atomically exchange tail */ - epi->next = xchg(&ep->ovflist, epi); - - return true; -} - -/* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. - * - * This callback takes a read lock in order not to contend with concurrent - * events from another file descriptor, thus all modifications to ->rdllist - * or ->ovflist are lockless. Read lock is paired with the write lock from - * ep_start/done_scan(), which stops all list modifications and guarantees - * that lists state is seen correctly. - * - * Another thing worth to mention is that ep_poll_callback() can be called - * concurrently for the same @epi from different CPUs if poll table was inited - * with several wait queues entries. Plural wakeup from different CPUs of a - * single wait queue is serialized by wq.lock, but the case when multiple wait - * queues are used should be detected accordingly. This is detected using - * cmpxchg() operation. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { @@ -1343,7 +1253,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v unsigned long flags; int ewake = 0; - read_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1372,12 +1282,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { - if (chain_epi_lockless(epi)) + if (epi->next == EP_UNACTIVE_PTR) { + epi->next = READ_ONCE(ep->ovflist); + WRITE_ONCE(ep->ovflist, epi); ep_pm_stay_awake_rcu(epi); + } } else if (!ep_is_linked(epi)) { /* In the usual case, add event to ready list. 
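With ep->lock demoted to an ordinary spinlock held in the wakeup callback, the ready-list insert in the hunk just below returns to textbook doubly-linked list handling. A hedged userspace analogue (illustrative names; "head" is assumed initialized to point at itself):

#include <pthread.h>

struct list_node { struct list_node *prev, *next; };

static pthread_mutex_t ready_lock = PTHREAD_MUTEX_INITIALIZER;

/* With a single lock serializing all writers (the role ep->lock plays
 * here), a tail insert needs no cmpxchg/xchg choreography. */
static void add_tail_locked(struct list_node *n, struct list_node *head)
{
        pthread_mutex_lock(&ready_lock);
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
        pthread_mutex_unlock(&ready_lock);
}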
*/ - if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) - ep_pm_stay_awake_rcu(epi); + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake_rcu(epi); } /* @@ -1410,7 +1323,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v pwake++; out_unlock: - read_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) @@ -1745,7 +1658,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, } /* We have to drop the new item inside our item list to keep track of it */ - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); @@ -1762,7 +1675,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, pwake++; } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); /* We have to call this outside the lock */ if (pwake) @@ -1826,7 +1739,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); @@ -1837,7 +1750,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, if (waitqueue_active(&ep->poll_wait)) pwake++; } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ @@ -2089,7 +2002,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, init_wait(&wait); wait.func = ep_autoremove_wake_function; - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * Barrierless variant, waitqueue_active() is called under * the same lock on wakeup ep_poll_callback() side, so it @@ -2108,7 +2021,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (!eavail) __add_wait_queue_exclusive(&ep->wq, &wait); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); if (!eavail) timed_out = !ep_schedule_timeout(to) || @@ -2124,7 +2037,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, eavail = 1; if (!list_empty_careful(&wait.entry)) { - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * If the thread timed out and is not on the wait queue, * it means that the thread was woken up after its @@ -2135,7 +2048,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (timed_out) eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } } } diff --git a/fs/exec.c b/fs/exec.c index 2a1e5e4042a1..4a89918b761f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -599,7 +599,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { - unsigned long ret; + int ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; @@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error) + if (!error && !write) validate_coredump_safety(); return error; } diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 0a056d97e640..cf0a0970c095 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -227,6 +227,8 @@ static bool ext4_has_stable_inodes(struct super_block *sb) } const struct fscrypt_operations ext4_cryptops = { 
+ .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), .needs_bounce_pages = 1, .has_32bit_inodes = 1, .supports_subblock_data_units = 1, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 01a6e2de7fc3..6cb784a56b3b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1182,6 +1182,14 @@ struct ext4_inode_info { __u32 i_csum_seed; kprojid_t i_projid; + +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; +#endif + +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; +#endif }; /* diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df4051613b29..ba4fd9aba1c1 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) "nonexistent device\n", __func__, __LINE__); return; } - if (atomic_read(&inode->i_count) > 1) { + if (icount_read(inode) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", __func__, __LINE__, inode->i_ino, - atomic_read(&inode->i_count)); + icount_read(inode)); return; } if (inode->i_nlink) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5898d92ba19f..8b18802e83eb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3995,7 +3995,7 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) list_splice_tail(&freed_data_list, &sbi->s_discard_list); spin_unlock(&sbi->s_md_lock); if (wake) - queue_work(system_unbound_wq, &sbi->s_discard_work); + queue_work(system_dfl_wq, &sbi->s_discard_work); } else { list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) kmem_cache_free(ext4_free_data_cachep, entry); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 699c15db28a8..7f2d4014d128 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1417,7 +1417,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) static int ext4_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); @@ -1470,6 +1470,12 @@ static void init_once(void *foo) init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); +#ifdef CONFIG_FS_ENCRYPTION + ei->i_crypt_info = NULL; +#endif +#ifdef CONFIG_FS_VERITY + ei->i_verity_info = NULL; +#endif } static int __init init_inodecache(void) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index d9203228ce97..b0acb0c50313 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -389,6 +389,8 @@ static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations ext4_verityops = { + .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_verity_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), .begin_enable_verity = ext4_begin_enable_verity, .end_enable_verity = ext4_end_enable_verity, .get_verity_descriptor = ext4_get_verity_descriptor, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 46be7560548c..6e465bbc85ee 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -907,6 +907,12 @@ struct f2fs_inode_info { unsigned int atomic_write_cnt; loff_t original_i_size; /* original i_size before atomic write */ +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */ +#endif +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; /* filesystem verity info */ +#endif }; static inline void get_read_extent_info(struct extent_info *ext, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 
e16c4e2830c2..2619cbbd7d2d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -480,6 +480,12 @@ static void init_once(void *foo) struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; inode_init_once(&fi->vfs_inode); +#ifdef CONFIG_FS_ENCRYPTION + fi->i_crypt_info = NULL; +#endif +#ifdef CONFIG_FS_VERITY + fi->i_verity_info = NULL; +#endif } #ifdef CONFIG_QUOTA @@ -1744,7 +1750,7 @@ static int f2fs_drop_inode(struct inode *inode) if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ - atomic_inc(&inode->i_count); + __iget(inode); spin_unlock(&inode->i_lock); /* should remain fi->extent_tree for writepage */ @@ -1763,12 +1769,12 @@ static int f2fs_drop_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); spin_lock(&inode->i_lock); - atomic_dec(&inode->i_count); + iput(inode); } trace_f2fs_drop_inode(inode, 0); return 0; } - ret = generic_drop_inode(inode); + ret = inode_generic_drop(inode); if (!ret) ret = fscrypt_drop_inode(inode); trace_f2fs_drop_inode(inode, ret); @@ -3570,6 +3576,8 @@ static struct block_device **f2fs_get_devices(struct super_block *sb, } static const struct fscrypt_operations f2fs_cryptops = { + .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_crypt_info) - + (int)offsetof(struct f2fs_inode_info, vfs_inode), .needs_bounce_pages = 1, .has_32bit_inodes = 1, .supports_subblock_data_units = 1, @@ -3581,7 +3589,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .has_stable_inodes = f2fs_has_stable_inodes, .get_devices = f2fs_get_devices, }; -#endif +#endif /* CONFIG_FS_ENCRYPTION */ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 2287f238ae09..f0ab9a3c7a82 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -287,6 +287,8 @@ static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations f2fs_verityops = { + .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_verity_info) - + (int)offsetof(struct f2fs_inode_info, vfs_inode), .begin_enable_verity = f2fs_begin_enable_verity, .end_enable_verity = f2fs_end_enable_verity, .get_verity_descriptor = f2fs_get_verity_descriptor, diff --git a/fs/fcntl.c b/fs/fcntl.c index 5598e4d57422..72f8433d9109 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -355,8 +355,7 @@ static bool rw_hint_valid(u64 hint) } } -static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_get_rw_hint(struct file *file, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; @@ -367,8 +366,7 @@ static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, return 0; } -static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_set_rw_hint(struct file *file, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; @@ -547,10 +545,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: - err = fcntl_get_rw_hint(filp, cmd, arg); + err = fcntl_get_rw_hint(filp, arg); break; case F_SET_RW_HINT: - err = fcntl_set_rw_hint(filp, cmd, arg); + err = fcntl_set_rw_hint(filp, arg); break; default: break; diff --git a/fs/fhandle.c b/fs/fhandle.c index 68a7d2861c58..052f9c9368fb 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -11,6 +11,7 @@ 
#include <linux/personality.h> #include <linux/uaccess.h> #include <linux/compat.h> +#include <linux/nsfs.h> #include "internal.h" #include "mount.h" @@ -189,6 +190,11 @@ static int get_path_anchor(int fd, struct path *root) return 0; } + if (fd == FD_NSFS_ROOT) { + nsfs_get_root(root); + return 0; + } + return -EBADF; } @@ -208,6 +214,14 @@ static int vfs_dentry_acceptable(void *context, struct dentry *dentry) return 1; /* + * Verify that the decoded dentry itself has a valid id mapping. + * In case the decoded dentry is the mountfd root itself, this + * verifies that the mountfd inode itself has a valid id mapping. + */ + if (!privileged_wrt_inode_uidgid(user_ns, idmap, d_inode(dentry))) + return 0; + + /* * It's racy as we're not taking rename_lock but we're able to ignore * permissions and we just need an approximation whether we were able * to follow a path to the file. diff --git a/fs/file.c b/fs/file.c index 6d2275c3be9c..28743b742e3c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1330,7 +1330,10 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; - return do_dup2(files, file, fd, flags); + err = do_dup2(files, file, fd, flags); + if (err < 0) + return err; + return 0; out_unlock: spin_unlock(&files->file_lock); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a07b8cf73ae2..2b35e80037fe 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) } struct inode_switch_wbs_context { - struct rcu_work work; + /* List of queued switching contexts for the wb */ + struct llist_node list; /* * Multiple inodes can be switched at once. The switching procedure @@ -378,7 +379,6 @@ struct inode_switch_wbs_context { * array embedded into struct inode_switch_wbs_context. Otherwise * an inode could be left in a non-consistent state. */ - struct bdi_writeback *new_wb; struct inode *inodes[]; }; @@ -445,22 +445,23 @@ static bool inode_do_switch_wbs(struct inode *inode, * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, * the specific list @inode was on is ignored and the @inode is put on * ->b_dirty which is always correct including from ->b_dirty_time. - * The transfer preserves @inode->dirtied_when ordering. If the @inode - * was clean, it means it was on the b_attached list, so move it onto - * the b_attached list of @new_wb. + * If the @inode was clean, it means it was on the b_attached list, so + * move it onto the b_attached list of @new_wb. */ if (!list_empty(&inode->i_io_list)) { inode->i_wb = new_wb; if (inode->i_state & I_DIRTY_ALL) { - struct inode *pos; - - list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) - if (time_after_eq(inode->dirtied_when, - pos->dirtied_when)) - break; + /* + * We need to keep b_dirty list sorted by + * dirtied_time_when. However properly sorting the + * inode in the list gets too expensive when switching + * many inodes. So just attach inode at the end of the + * dirty list and clobber the dirtied_time_when. 
+ */ + inode->dirtied_time_when = jiffies; inode_io_list_move_locked(inode, new_wb, - pos->i_io_list.prev); + &new_wb->b_dirty); } else { inode_cgwb_move_to_attached(inode, new_wb); } @@ -486,13 +487,11 @@ skip_switch: return switched; } -static void inode_switch_wbs_work_fn(struct work_struct *work) +static void process_inode_switch_wbs(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw) { - struct inode_switch_wbs_context *isw = - container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; - struct bdi_writeback *new_wb = isw->new_wb; unsigned long nr_switched = 0; struct inode **inodep; @@ -502,6 +501,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) */ down_read(&bdi->wb_switch_rwsem); + inodep = isw->inodes; /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions @@ -512,6 +512,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ +relock: if (old_wb < new_wb) { spin_lock(&old_wb->list_lock); spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); @@ -520,10 +521,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } - for (inodep = isw->inodes; *inodep; inodep++) { + while (*inodep) { WARN_ON_ONCE((*inodep)->i_wb != old_wb); if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) nr_switched++; + inodep++; + if (*inodep && need_resched()) { + spin_unlock(&new_wb->list_lock); + spin_unlock(&old_wb->list_lock); + cond_resched(); + goto relock; + } } spin_unlock(&new_wb->list_lock); @@ -543,6 +551,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) atomic_dec(&isw_nr_in_flight); } +void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback, + switch_work); + struct inode_switch_wbs_context *isw, *next_isw; + struct llist_node *list; + + /* + * Grab out reference to wb so that it cannot get freed under us + * after we process all the isw items. + */ + wb_get(new_wb); + while (1) { + list = llist_del_all(&new_wb->switch_wbs_ctxs); + /* Nothing to do? */ + if (!list) + break; + /* + * In addition to synchronizing among switchers, I_WB_SWITCH + * tells the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be + * visible. 
+ */ + synchronize_rcu(); + + llist_for_each_entry_safe(isw, next_isw, list, list) + process_inode_switch_wbs(new_wb, isw); + } + wb_put(new_wb); +} + static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { @@ -572,6 +612,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode, return true; } +static void wb_queue_isw(struct bdi_writeback *wb, + struct inode_switch_wbs_context *isw) +{ + if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) + queue_work(isw_wq, &wb->switch_work); +} + /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode @@ -585,6 +632,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) struct backing_dev_info *bdi = inode_to_bdi(inode); struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb = NULL; /* noop if seems to be already in progress */ if (inode->i_state & I_WB_SWITCH) @@ -609,40 +657,35 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (!memcg_css) goto out_free; - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); - if (!isw->new_wb) + if (!new_wb) goto out_free; - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) goto out_free; isw->inodes[0] = inode; - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1); + wb_queue_isw(new_wb, isw); return; out_free: atomic_dec(&isw_nr_in_flight); - if (isw->new_wb) - wb_put(isw->new_wb); + if (new_wb) + wb_put(new_wb); kfree(isw); } -static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, +static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw, struct list_head *list, int *nr) { struct inode *inode; list_for_each_entry(inode, list, i_io_list) { - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) continue; isw->inodes[*nr] = inode; @@ -666,6 +709,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb; int nr; bool restart = false; @@ -678,12 +722,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) for (memcg_css = wb->memcg_css->parent; memcg_css; memcg_css = memcg_css->parent) { - isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); - if (isw->new_wb) + new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); + if (new_wb) break; } - if (unlikely(!isw->new_wb)) - isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ + if (unlikely(!new_wb)) + new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ nr = 0; spin_lock(&wb->list_lock); @@ -695,27 +739,22 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * bandwidth restrictions, as writeback of inode metadata is not * accounted for. 
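 * Note that isw_prepare_wbs_switch() fills isw->inodes[] until either
 * the list is exhausted or the context array is full, and returns true
 * in the latter case so that this function can report that another
 * pass is needed. An illustrative caller loop (a sketch only, not part
 * of this patch) would be:
 *
 *	while (cleanup_offline_cgwb(wb))
 *		cond_resched();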
*/ - restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr); if (!restart) - restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time, + &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ if (nr == 0) { atomic_dec(&isw_nr_in_flight); - wb_put(isw->new_wb); + wb_put(new_wb); kfree(isw); return restart; } - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + trace_inode_switch_wbs_queue(wb, new_wb, nr); + wb_queue_isw(new_wb, isw); return restart; } @@ -1123,7 +1162,7 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, dirty = dirty * 10 / 8; /* issue the writeback work */ - work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); + work = kzalloc(sizeof(*work), GFP_NOWAIT); if (work) { work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; @@ -1180,7 +1219,7 @@ void cgroup_writeback_umount(struct super_block *sb) static int __init cgroup_writeback_init(void) { - isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); + isw_wq = alloc_workqueue("inode_switch_wbs", WQ_PERCPU, 0); if (!isw_wq) return -ENOMEM; return 0; @@ -1767,7 +1806,7 @@ static int writeback_single_inode(struct inode *inode, int ret = 0; spin_lock(&inode->i_lock); - if (!atomic_read(&inode->i_count)) + if (!icount_read(inode)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else WARN_ON(inode->i_state & I_WILL_FREE); @@ -2442,7 +2481,7 @@ static int dirtytime_interval_handler(const struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) - mod_delayed_work(system_wq, &dirtytime_work, 0); + mod_delayed_work(system_percpu_wq, &dirtytime_work, 0); return ret; } diff --git a/fs/fsopen.c b/fs/fsopen.c index 1aaf4cb2afb2..f645c99204eb 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -18,50 +18,56 @@ #include "internal.h" #include "mount.h" +static inline const char *fetch_message_locked(struct fc_log *log, size_t len, + bool *need_free) +{ + const char *p; + int index; + + if (unlikely(log->head == log->tail)) + return ERR_PTR(-ENODATA); + + index = log->tail & (ARRAY_SIZE(log->buffer) - 1); + p = log->buffer[index]; + if (unlikely(strlen(p) > len)) + return ERR_PTR(-EMSGSIZE); + + log->buffer[index] = NULL; + *need_free = log->need_free & (1 << index); + log->need_free &= ~(1 << index); + log->tail++; + + return p; +} + /* * Allow the user to read back any error, warning or informational messages. + * Only one message is returned for each read(2) call. 
*/ static ssize_t fscontext_read(struct file *file, char __user *_buf, size_t len, loff_t *pos) { struct fs_context *fc = file->private_data; - struct fc_log *log = fc->log.log; - unsigned int logsize = ARRAY_SIZE(log->buffer); - ssize_t ret; - char *p; + ssize_t err; + const char *p __free(kfree) = NULL, *message; bool need_free; - int index, n; + int n; - ret = mutex_lock_interruptible(&fc->uapi_mutex); - if (ret < 0) - return ret; - - if (log->head == log->tail) { - mutex_unlock(&fc->uapi_mutex); - return -ENODATA; - } - - index = log->tail & (logsize - 1); - p = log->buffer[index]; - need_free = log->need_free & (1 << index); - log->buffer[index] = NULL; - log->need_free &= ~(1 << index); - log->tail++; + err = mutex_lock_interruptible(&fc->uapi_mutex); + if (err < 0) + return err; + message = fetch_message_locked(fc->log.log, len, &need_free); mutex_unlock(&fc->uapi_mutex); + if (IS_ERR(message)) + return PTR_ERR(message); - ret = -EMSGSIZE; - n = strlen(p); - if (n > len) - goto err_free; - ret = -EFAULT; - if (copy_to_user(_buf, p, n) != 0) - goto err_free; - ret = n; - -err_free: if (need_free) - kfree(p); - return ret; + p = message; + + n = strlen(message); + if (copy_to_user(_buf, message, n)) + return -EFAULT; + return n; } static int fscontext_release(struct inode *inode, struct file *file) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e80cd8f2c049..66a1ba8c56b5 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -119,7 +119,7 @@ void fuse_check_timeout(struct work_struct *work) goto abort_conn; out: - queue_delayed_work(system_wq, &fc->timeout.work, + queue_delayed_work(system_percpu_wq, &fc->timeout.work, fuse_timeout_timer_freq); return; @@ -1893,7 +1893,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, index = outarg->offset >> PAGE_SHIFT; - while (num) { + while (num && ap->num_folios < num_pages) { struct folio *folio; unsigned int folio_offset; unsigned int nr_bytes; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2d817d7cab26..5c569c3cb53f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1199,7 +1199,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode, if (attr->blksize != 0) blkbits = ilog2(attr->blksize); else - blkbits = inode->i_sb->s_blocksize_bits; + blkbits = fc->blkbits; stat->blksize = 1 << blkbits; } @@ -1377,6 +1377,7 @@ retry: generic_fillattr(idmap, request_mask, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; + stat->blksize = 1 << fi->cached_i_blkbits; if (test_bit(FUSE_I_BTIME, &fi->state)) { stat->btime = fi->i_btime; stat->result_mask |= STATX_BTIME; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5525a4520b0f..4adcf09d4b01 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2960,7 +2960,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, .nodeid_out = ff_out->nodeid, .fh_out = ff_out->fh, .off_out = pos_out, - .len = len, + .len = min_t(size_t, len, UINT_MAX & PAGE_MASK), .flags = flags }; struct fuse_write_out outarg; @@ -3026,6 +3026,9 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, fc->no_copy_file_range = 1; err = -EOPNOTSUPP; } + if (!err && outarg.size > len) + err = -EIO; + if (err) goto out; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ec248d13c8bf..cc428d04be3e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -210,6 +210,12 @@ struct fuse_inode { /** Reference to backing file in passthrough mode */ struct fuse_backing *fb; #endif + + /* + * The underlying inode->i_blkbits value will not be 
modified, + * so preserve the blocksize specified by the server. + */ + u8 cached_i_blkbits; }; /** FUSE inode state bits */ @@ -969,6 +975,14 @@ struct fuse_conn { /* Request timeout (in jiffies). 0 = no timeout */ unsigned int req_timeout; } timeout; + + /* + * This is a workaround until fuse uses iomap for reads. + * For fuseblk servers, this represents the blocksize passed in at + * mount time and for regular fuse servers, this is equivalent to + * inode->i_blkbits. + */ + u8 blkbits; }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 67c2318bfc42..7485a41af892 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -289,6 +289,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, } } + if (attr->blksize) + fi->cached_i_blkbits = ilog2(attr->blksize); + else + fi->cached_i_blkbits = fc->blkbits; + /* * Don't set the sticky bit in i_mode, unless we want the VFS * to check permissions. This prevents failures due to the @@ -1204,7 +1209,7 @@ static const struct super_operations fuse_super_operations = { .free_inode = fuse_free_inode, .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, .sync_fs = fuse_sync_fs, @@ -1268,7 +1273,7 @@ static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout) { fc->timeout.req_timeout = secs_to_jiffies(timeout); INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout); - queue_delayed_work(system_wq, &fc->timeout.work, + queue_delayed_work(system_percpu_wq, &fc->timeout.work, fuse_timeout_timer_freq); } @@ -1805,10 +1810,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) err = -EINVAL; if (!sb_set_blocksize(sb, ctx->blksize)) goto err; + /* + * This is a workaround until fuse hooks into iomap for reads. + * Use PAGE_SIZE for the blocksize else if the writeback cache + * is enabled, buffered writes go through iomap and a read may + * overwrite partially written data if blocksize < PAGE_SIZE + */ + fc->blkbits = sb->s_blocksize_bits; + if (ctx->blksize != PAGE_SIZE && + !sb_set_blocksize(sb, PAGE_SIZE)) + goto err; #endif } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; + fc->blkbits = sb->s_blocksize_bits; } sb->s_subtype = ctx->subtype; diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 607ef735ad4a..eb97ac009e75 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -237,6 +237,11 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) if (!file) goto out; + /* read/write/splice/mmap passthrough only relevant for regular files */ + res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL; + if (!d_is_reg(file->f_path.dentry)) + goto out_fput; + backing_sb = file_inode(file)->i_sb; res = -ELOOP; if (backing_sb->s_stack_depth >= fc->max_stack_depth) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index c826e7ca49f5..76c8fd0bfc75 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1016,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (kaddr) *kaddr = fs->window_kaddr + offset; if (pfn) - *pfn = fs->window_phys_addr + offset; + *pfn = PHYS_PFN(fs->window_phys_addr + offset); return nr_pages > max_nr_pages ? 
max_nr_pages : nr_pages; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 72d95185a39f..bc67fa058c84 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1442,6 +1442,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int ret; if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; @@ -1450,14 +1451,20 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) locks_lock_file_wait(file, fl); return -EIO; } - if (cmd == F_CANCELLK) - return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); - else if (IS_GETLK(cmd)) - return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); - else if (lock_is_unlock(fl)) - return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); - else - return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); + down_read(&ls->ls_sem); + ret = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + if (cmd == F_CANCELLK) + ret = dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (IS_GETLK(cmd)) + ret = dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (lock_is_unlock(fl)) + ret = dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); + else + ret = dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); + } + up_read(&ls->ls_sem); + return ret; } static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b6fd1cb17de7..b677c0e6b9ab 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -481,11 +481,9 @@ done: /** * do_promote - promote as many requests as possible on the current queue * @gl: The glock - * - * Returns true on success (i.e., progress was made or there are no waiters). */ -static bool do_promote(struct gfs2_glock *gl) +static void do_promote(struct gfs2_glock *gl) { struct gfs2_holder *gh, *current_gh; @@ -496,13 +494,10 @@ static bool do_promote(struct gfs2_glock *gl) if (!may_grant(gl, current_gh, gh)) { /* * If we get here, it means we may not grant this - * holder for some reason. If this holder is at the - * head of the list, it means we have a blocked holder - * at the head, so return false. + * holder for some reason. */ - if (list_is_first(&gh->gh_list, &gl->gl_holders)) - return false; - do_error(gl, 0); + if (current_gh) + do_error(gl, 0); /* Fail queued try locks */ break; } set_bit(HIF_HOLDER, &gh->gh_iflags); @@ -511,7 +506,6 @@ static bool do_promote(struct gfs2_glock *gl) if (!current_gh) current_gh = gh; } - return true; } /** @@ -646,8 +640,10 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) } /* Fast path - we got what we asked for */ - if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) + if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { + clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); gfs2_demote_wake(gl); + } if (gl->gl_state != LM_ST_UNLOCKED) { if (glops->go_xmote_bh) { int rv; @@ -693,54 +689,33 @@ __acquires(&gl->gl_lockref.lock) const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; - unsigned int lck_flags = (unsigned int)(gh ? 
gh->gh_flags : 0); int ret; if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) && gh && !(gh->gh_flags & LM_FLAG_NOEXP)) goto skip_inval; - lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP); GLOCK_BUG_ON(gl, gl->gl_state == target); GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); - if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && - glops->go_inval) { - /* - * If another process is already doing the invalidate, let that - * finish first. The glock state machine will get back to this - * holder again later. - */ - if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS, - &gl->gl_flags)) - return; - do_error(gl, 0); /* Fail queued try locks */ - } - gl->gl_req = target; - set_bit(GLF_BLOCKING, &gl->gl_flags); - if ((gl->gl_req == LM_ST_UNLOCKED) || - (gl->gl_state == LM_ST_EXCLUSIVE) || - (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) - clear_bit(GLF_BLOCKING, &gl->gl_flags); - if (!glops->go_inval && !glops->go_sync) + if (!glops->go_inval || !glops->go_sync) goto skip_inval; spin_unlock(&gl->gl_lockref.lock); - if (glops->go_sync) { - ret = glops->go_sync(gl); - /* If we had a problem syncing (due to io errors or whatever, - * we should not invalidate the metadata or tell dlm to - * release the glock to other nodes. - */ - if (ret) { - if (cmpxchg(&sdp->sd_log_error, 0, ret)) { - fs_err(sdp, "Error %d syncing glock \n", ret); - gfs2_dump_glock(NULL, gl, true); - } - spin_lock(&gl->gl_lockref.lock); - goto skip_inval; + ret = glops->go_sync(gl); + /* If we had a problem syncing (due to io errors or whatever, + * we should not invalidate the metadata or tell dlm to + * release the glock to other nodes. + */ + if (ret) { + if (cmpxchg(&sdp->sd_log_error, 0, ret)) { + fs_err(sdp, "Error %d syncing glock\n", ret); + gfs2_dump_glock(NULL, gl, true); } + spin_lock(&gl->gl_lockref.lock); + goto skip_inval; } - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) { + + if (target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) { /* * The call to go_sync should have cleared out the ail list. * If there are still items, we have a problem. We ought to @@ -755,12 +730,10 @@ __acquires(&gl->gl_lockref.lock) gfs2_dump_glock(NULL, gl, true); } glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); - clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } spin_lock(&gl->gl_lockref.lock); skip_inval: - gl->gl_lockref.count++; /* * Check for an error encountered since we called go_sync and go_inval. * If so, we can't withdraw from the glock code because the withdraw @@ -803,38 +776,41 @@ skip_inval: if (!test_bit(GLF_CANCELING, &gl->gl_flags)) clear_bit(GLF_LOCK, &gl->gl_flags); clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); + gl->gl_lockref.count++; gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); return; - } else { - clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } } if (ls->ls_ops->lm_lock) { set_bit(GLF_PENDING_REPLY, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); - ret = ls->ls_ops->lm_lock(gl, target, lck_flags); + ret = ls->ls_ops->lm_lock(gl, target, gh ? gh->gh_flags : 0); spin_lock(&gl->gl_lockref.lock); - if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED && - target == LM_ST_UNLOCKED && - test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { + if (!ret) { + /* The operation will be completed asynchronously. 
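+		 * In that case the DLM reply will arrive via gdlm_ast(); the
+		 * gl_lockref.count++ just below keeps the glock pinned until
+		 * that reply has been processed, and GLF_PENDING_REPLY, set
+		 * right before the lm_lock() call, stays set across the wait
+		 * (it is only cleared here on a synchronous failure). In
+		 * sketch form, with the reply side assumed to drop the extra
+		 * reference:
+		 *
+		 *	set_bit(GLF_PENDING_REPLY, &gl->gl_flags);
+		 *	ret = ls->ls_ops->lm_lock(gl, target, gh ? gh->gh_flags : 0);
+		 *	if (!ret) {
+		 *		gl->gl_lockref.count++;	/* reply side drops this ref */
+		 *		return;
+		 *	}
+		 *	clear_bit(GLF_PENDING_REPLY, &gl->gl_flags);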
*/ + gl->gl_lockref.count++; + return; + } + clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); + + if (ret == -ENODEV && gl->gl_target == LM_ST_UNLOCKED && + target == LM_ST_UNLOCKED) { /* * The lockspace has been released and the lock has * been unlocked implicitly. */ - } else if (ret) { - fs_err(sdp, "lm_lock ret %d\n", ret); - target = gl->gl_state | LM_OUT_ERROR; } else { - /* The operation will be completed asynchronously. */ + fs_err(sdp, "lm_lock ret %d\n", ret); + GLOCK_BUG_ON(gl, !gfs2_withdrawing_or_withdrawn(sdp)); return; } - clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); } /* Complete the operation now. */ finish_xmote(gl, target); + gl->gl_lockref.count++; gfs2_glock_queue_work(gl, 0); } @@ -855,11 +831,20 @@ __acquires(&gl->gl_lockref.lock) return; set_bit(GLF_LOCK, &gl->gl_flags); - /* While a demote is in progress, the GLF_LOCK flag must be set. */ + /* + * The GLF_DEMOTE_IN_PROGRESS flag is only set intermittently during + * locking operations. We have just started a locking operation by + * setting the GLF_LOCK flag, so the GLF_DEMOTE_IN_PROGRESS flag must + * be cleared. + */ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); - if (test_bit(GLF_DEMOTE, &gl->gl_flags) && - gl->gl_demote_state != gl->gl_state) { + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { + if (gl->gl_demote_state == gl->gl_state) { + gfs2_demote_wake(gl); + goto promote; + } + if (find_first_holder(gl)) goto out_unlock; if (nonblock) @@ -869,31 +854,31 @@ __acquires(&gl->gl_lockref.lock) gl->gl_target = gl->gl_demote_state; do_xmote(gl, NULL, gl->gl_target); return; - } else { - if (test_bit(GLF_DEMOTE, &gl->gl_flags)) - gfs2_demote_wake(gl); - if (do_promote(gl)) - goto out_unlock; - gh = find_first_waiter(gl); - if (!gh) - goto out_unlock; - gl->gl_target = gh->gh_state; - if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) - do_error(gl, 0); /* Fail queued try locks */ - do_xmote(gl, gh, gl->gl_target); - return; } +promote: + do_promote(gl); + if (find_first_holder(gl)) + goto out_unlock; + gh = find_first_waiter(gl); + if (!gh) + goto out_unlock; + if (nonblock) + goto out_sched; + gl->gl_target = gh->gh_state; + if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + do_error(gl, 0); /* Fail queued try locks */ + do_xmote(gl, gh, gl->gl_target); + return; + out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_atomic(); gl->gl_lockref.count++; gfs2_glock_queue_work(gl, 0); return; out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_atomic(); } /** @@ -1462,6 +1447,24 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 
va_end(args); } +static bool gfs2_should_queue_trylock(struct gfs2_glock *gl, + struct gfs2_holder *gh) +{ + struct gfs2_holder *current_gh, *gh2; + + current_gh = find_first_holder(gl); + if (current_gh && !may_grant(gl, current_gh, gh)) + return false; + + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) + continue; + if (!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + return false; + } + return true; +} + static inline bool pid_is_meaningful(const struct gfs2_holder *gh) { if (!(gh->gh_flags & GL_NOPID)) @@ -1480,27 +1483,20 @@ static inline bool pid_is_meaningful(const struct gfs2_holder *gh) */ static inline void add_to_queue(struct gfs2_holder *gh) -__releases(&gl->gl_lockref.lock) -__acquires(&gl->gl_lockref.lock) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_holder *gh2; - int try_futile = 0; GLOCK_BUG_ON(gl, gh->gh_owner_pid == NULL); if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) GLOCK_BUG_ON(gl, true); - if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { - if (test_bit(GLF_LOCK, &gl->gl_flags)) { - struct gfs2_holder *current_gh; - - current_gh = find_first_holder(gl); - try_futile = !may_grant(gl, current_gh, gh); - } - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) - goto fail; + if ((gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) && + !gfs2_should_queue_trylock(gl, gh)) { + gh->gh_error = GLR_TRYFAILED; + gfs2_holder_wake(gh); + return; } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { @@ -1512,15 +1508,6 @@ __acquires(&gl->gl_lockref.lock) continue; goto trap_recursive; } - list_for_each_entry(gh2, &gl->gl_holders, gh_list) { - if (try_futile && - !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { -fail: - gh->gh_error = GLR_TRYFAILED; - gfs2_holder_wake(gh); - return; - } - } trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); @@ -2321,8 +2308,6 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'y'; if (test_bit(GLF_LFLUSH, gflags)) *p++ = 'f'; - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) - *p++ = 'i'; if (test_bit(GLF_PENDING_REPLY, gflags)) *p++ = 'R'; if (test_bit(GLF_HAVE_REPLY, gflags)) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 9339a3bff6ee..d041b922b45e 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -68,6 +68,10 @@ enum { * also be granted in SHARED. The preferred state is whichever is compatible * with other granted locks, or the specified state if no other locks exist. * + * In addition, when a lock is already held in EX mode locally, a SHARED or + * DEFERRED mode request with the LM_FLAG_ANY flag set will be granted. + * (The LM_FLAG_ANY flag is only used for SHARED mode requests currently.) + * * LM_FLAG_NODE_SCOPE * This holder agrees to share the lock within this node.
In other words, * the glock is held in EX mode according to DLM, but local holders on the diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index d4ad82f47eee..5a0ea416cfda 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -319,7 +319,6 @@ enum { GLF_DEMOTE_IN_PROGRESS = 5, GLF_DIRTY = 6, GLF_LFLUSH = 7, - GLF_INVALIDATE_IN_PROGRESS = 8, GLF_HAVE_REPLY = 9, GLF_INITIAL = 10, GLF_HAVE_FROZEN_REPLY = 11, @@ -376,7 +375,6 @@ struct gfs2_glock { enum { GIF_QD_LOCKED = 1, GIF_SW_PAGED = 3, - GIF_FREE_VFS_INODE = 5, GIF_GLOP_PENDING = 6, }; @@ -658,6 +656,8 @@ struct lm_lockstruct { struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ char *ls_lvb_bits; + struct rw_semaphore ls_sem; + spinlock_t ls_recover_spin; /* protects following fields */ unsigned long ls_recover_flags; /* DFL_ */ uint32_t ls_recover_mount; /* gen in first recover_done cb */ @@ -823,7 +823,6 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; wait_queue_head_t sd_log_flush_wait; int sd_log_error; /* First log error */ - wait_queue_head_t sd_withdraw_wait; unsigned int sd_log_tail; unsigned int sd_log_flush_tail; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index cee5d199d2d8..4f00af7dd256 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -58,6 +58,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, /** * gfs2_update_reply_times - Update locking statistics * @gl: The glock to update + * @blocking: The operation may have been blocking * * This assumes that gl->gl_dstamp has been set earlier. * @@ -72,12 +73,12 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, * TRY_1CB flags are set are classified as non-blocking. All * other DLM requests are counted as (potentially) blocking. */ -static inline void gfs2_update_reply_times(struct gfs2_glock *gl) +static inline void gfs2_update_reply_times(struct gfs2_glock *gl, + bool blocking) { struct gfs2_pcpu_lkstats *lks; const unsigned gltype = gl->gl_name.ln_type; - unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ? - GFS2_LKS_SRTTB : GFS2_LKS_SRTT; + unsigned index = blocking ? GFS2_LKS_SRTTB : GFS2_LKS_SRTT; s64 rtt; preempt_disable(); @@ -119,14 +120,18 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl) static void gdlm_ast(void *arg) { struct gfs2_glock *gl = arg; + bool blocking; unsigned ret; + blocking = test_bit(GLF_BLOCKING, &gl->gl_flags); + gfs2_update_reply_times(gl, blocking); + clear_bit(GLF_BLOCKING, &gl->gl_flags); + /* If the glock is dead, we only react to a dlm_unlock() reply. */ if (__lockref_is_dead(&gl->gl_lockref) && gl->gl_lksb.sb_status != -DLM_EUNLOCK) return; - gfs2_update_reply_times(gl); BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr) @@ -157,14 +162,6 @@ static void gdlm_ast(void *arg) } ret = gl->gl_req; - if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) { - if (gl->gl_req == LM_ST_SHARED) - ret = LM_ST_DEFERRED; - else if (gl->gl_req == LM_ST_DEFERRED) - ret = LM_ST_SHARED; - else - BUG(); - } /* * The GLF_INITIAL flag is initially set for new glocks. 
Upon the @@ -241,7 +238,7 @@ static bool down_conversion(int cur, int req) } static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, - const int cur, const int req) + const int req, bool blocking) { u32 lkf = 0; @@ -256,15 +253,6 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, lkf |= DLM_LKF_NOQUEUEBAST; } - if (gfs_flags & LM_FLAG_ANY) { - if (req == DLM_LOCK_PR) - lkf |= DLM_LKF_ALTCW; - else if (req == DLM_LOCK_CW) - lkf |= DLM_LKF_ALTPR; - else - BUG(); - } - if (!test_bit(GLF_INITIAL, &gl->gl_flags)) { lkf |= DLM_LKF_CONVERT; @@ -274,7 +262,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, * "upward" lock conversions or else DLM will reject the * request as invalid. */ - if (!down_conversion(cur, req)) + if (blocking) lkf |= DLM_LKF_QUECVT; } @@ -294,14 +282,20 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, unsigned int flags) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; + bool blocking; int cur, req; u32 lkf; char strname[GDLM_STRNAME_BYTES] = ""; int error; + gl->gl_req = req_state; cur = make_mode(gl->gl_name.ln_sbd, gl->gl_state); req = make_mode(gl->gl_name.ln_sbd, req_state); - lkf = make_flags(gl, flags, cur, req); + blocking = !down_conversion(cur, req) && + !(flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)); + lkf = make_flags(gl, flags, req, blocking); + if (blocking) + set_bit(GLF_BLOCKING, &gl->gl_flags); gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); if (test_bit(GLF_INITIAL, &gl->gl_flags)) { @@ -318,8 +312,13 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, */ again: - error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, - GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + } + up_read(&ls->ls_sem); if (error == -EBUSY) { msleep(20); goto again; @@ -341,17 +340,10 @@ static void gdlm_put_lock(struct gfs2_glock *gl) return; } - clear_bit(GLF_BLOCKING, &gl->gl_flags); gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_update_request_times(gl); - /* don't want to call dlm if we've unmounted the lock protocol */ - if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { - gfs2_glock_free(gl); - return; - } - /* * When the lockspace is released, all remaining glocks will be * unlocked automatically. 
This is more efficient than unlocking them @@ -369,13 +361,23 @@ static void gdlm_put_lock(struct gfs2_glock *gl) flags |= DLM_LKF_VALBLK; again: - error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags, - NULL, gl); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags, + NULL, gl); + } + up_read(&ls->ls_sem); if (error == -EBUSY) { msleep(20); goto again; } + if (error == -ENODEV) { + gfs2_glock_free(gl); + return; + } + if (error) { fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n", gl->gl_name.ln_type, @@ -386,7 +388,12 @@ again: static void gdlm_cancel(struct gfs2_glock *gl) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; - dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); + + down_read(&ls->ls_sem); + if (likely(ls->ls_dlm != NULL)) { + dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); + } + up_read(&ls->ls_sem); } /* @@ -567,7 +574,11 @@ static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name) struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; - error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) + error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + up_read(&ls->ls_sem); if (error) { fs_err(sdp, "%s lkid %x error %d\n", name, lksb->sb_lkid, error); @@ -594,9 +605,14 @@ static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags, memset(strname, 0, GDLM_STRNAME_BYTES); snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num); - error = dlm_lock(ls->ls_dlm, mode, lksb, flags, - strname, GDLM_STRNAME_BYTES - 1, - 0, sync_wait_cb, ls, NULL); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_lock(ls->ls_dlm, mode, lksb, flags, + strname, GDLM_STRNAME_BYTES - 1, + 0, sync_wait_cb, ls, NULL); + } + up_read(&ls->ls_sem); if (error) { fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n", name, lksb->sb_lkid, flags, mode, error); @@ -1323,6 +1339,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) */ INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func); + ls->ls_dlm = NULL; spin_lock_init(&ls->ls_recover_spin); ls->ls_recover_flags = 0; ls->ls_recover_mount = 0; @@ -1357,6 +1374,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) * create/join lockspace */ + init_rwsem(&ls->ls_sem); error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE, &gdlm_lockspace_ops, sdp, &ops_result, &ls->ls_dlm); @@ -1400,7 +1418,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) return 0; fail_release: - dlm_release_lockspace(ls->ls_dlm, 2); + dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL); fail_free: free_recover_size(ls); fail: @@ -1436,10 +1454,12 @@ static void gdlm_unmount(struct gfs2_sbd *sdp) /* mounted_lock and control_lock will be purged in dlm recovery */ release: + down_write(&ls->ls_sem); if (ls->ls_dlm) { - dlm_release_lockspace(ls->ls_dlm, 2); + dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL); ls->ls_dlm = NULL; } + up_write(&ls->ls_sem); free_recover_size(ls); } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 0727f60ad028..9d65719353fa 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -151,7 +151,8 @@ static int __init init_gfs2_fs(void) error = -ENOMEM; gfs2_recovery_wq = alloc_workqueue("gfs2_recovery", - WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU, + 
0); if (!gfs2_recovery_wq) goto fail_wq1; @@ -160,7 +161,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_control_wq) goto fail_wq2; - gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", 0, 0); + gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", WQ_PERCPU, 0); if (!gfs2_freeze_wq) goto fail_wq3; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index efe99b732551..aa15183f9a16 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1193,13 +1193,15 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) error = -ENOMEM; sdp->sd_glock_wq = alloc_workqueue("gfs2-glock/%s", - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0, + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE | WQ_PERCPU, + 0, sdp->sd_fsname); if (!sdp->sd_glock_wq) goto fail_iput; sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s", - WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, sdp->sd_fsname); + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU, 0, + sdp->sd_fsname); if (!sdp->sd_delete_wq) goto fail_glock_wq; @@ -1754,7 +1756,7 @@ static void gfs2_evict_inodes(struct super_block *sb) spin_unlock(&inode->i_lock); continue; } - atomic_inc(&inode->i_count); + __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b42e2110084b..644b2d1e7276 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1050,7 +1050,7 @@ static int gfs2_drop_inode(struct inode *inode) if (test_bit(SDF_EVICTING, &sdp->sd_flags)) return 1; - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /** diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 26036ffc3f33..1c2507a27318 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -52,7 +52,6 @@ {(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \ {(1UL << GLF_DIRTY), "y" }, \ {(1UL << GLF_LFLUSH), "f" }, \ - {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ {(1UL << GLF_PENDING_REPLY), "R" }, \ {(1UL << GLF_HAVE_REPLY), "r" }, \ {(1UL << GLF_INITIAL), "a" }, \ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 24864a66074b..56412f63f3bb 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -309,7 +309,7 @@ void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...) 
va_end(args); } -int gfs2_withdraw(struct gfs2_sbd *sdp) +void gfs2_withdraw(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; const struct lm_lockops *lm = ls->ls_ops; @@ -322,7 +322,7 @@ int gfs2_withdraw(struct gfs2_sbd *sdp) wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_IN_PROG, TASK_UNINTERRUPTIBLE); - return -1; + return; } new = old | BIT(SDF_WITHDRAWN) | BIT(SDF_WITHDRAW_IN_PROG); } while (unlikely(!try_cmpxchg(&sdp->sd_flags, &old, new))); @@ -350,8 +350,6 @@ int gfs2_withdraw(struct gfs2_sbd *sdp) if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname); - - return -1; } /* @@ -473,46 +471,36 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, /* * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw - * Returns: -1 if this call withdrew the machine, - * -2 if it was already withdrawn */ -int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, - const char *function, char *file, - unsigned int line) +void gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, + unsigned int line) { - int me; - gfs2_lm(sdp, "fatal: invalid metadata block - " "bh = %llu (bad magic number), " "function = %s, file = %s, line = %u\n", (unsigned long long)bh->b_blocknr, function, file, line); - me = gfs2_withdraw(sdp); - return (me) ? -1 : -2; + gfs2_withdraw(sdp); } /* * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw - * Returns: -1 if this call withdrew the machine, - * -2 if it was already withdrawn */ -int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, - u16 type, u16 t, const char *function, - char *file, unsigned int line) +void gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, const char *function, + char *file, unsigned int line) { - int me; - gfs2_lm(sdp, "fatal: invalid metadata block - " "bh = %llu (type: exp=%u, found=%u), " "function = %s, file = %s, line = %u\n", (unsigned long long)bh->b_blocknr, type, t, function, file, line); - me = gfs2_withdraw(sdp); - return (me) ? 
-1 : -2; + gfs2_withdraw(sdp); } /* @@ -521,14 +509,14 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, * 0 if it was already withdrawn */ -int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, - unsigned int line) +void gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, + unsigned int line) { gfs2_lm(sdp, "fatal: I/O error - " "function = %s, file = %s, line = %u\n", function, file, line); - return gfs2_withdraw(sdp); + gfs2_withdraw(sdp); } /* diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 27d03b641024..da0373b1e82b 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -91,9 +91,9 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, gfs2_consist_rgrpd_i((rgd), __func__, __FILE__, __LINE__) -int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, - const char *function, - char *file, unsigned int line); +void gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, + char *file, unsigned int line); static inline int gfs2_meta_check(struct gfs2_sbd *sdp, struct buffer_head *bh) @@ -108,10 +108,10 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp, return 0; } -int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, - u16 type, u16 t, - const char *function, - char *file, unsigned int line); +void gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, + const char *function, + char *file, unsigned int line); static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, struct buffer_head *bh, @@ -122,12 +122,16 @@ static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; u32 magic = be32_to_cpu(mh->mh_magic); u16 t = be32_to_cpu(mh->mh_type); - if (unlikely(magic != GFS2_MAGIC)) - return gfs2_meta_check_ii(sdp, bh, function, - file, line); - if (unlikely(t != type)) - return gfs2_metatype_check_ii(sdp, bh, type, t, function, - file, line); + if (unlikely(magic != GFS2_MAGIC)) { + gfs2_meta_check_ii(sdp, bh, function, + file, line); + return -EIO; + } + if (unlikely(t != type)) { + gfs2_metatype_check_ii(sdp, bh, type, t, function, + file, line); + return -EIO; + } return 0; } @@ -144,8 +148,8 @@ static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type, } -int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, - char *file, unsigned int line); +void gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, + char *file, unsigned int line); int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, bool verbose); @@ -228,6 +232,6 @@ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) __printf(2, 3) void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...); -int gfs2_withdraw(struct gfs2_sbd *sdp); +void gfs2_withdraw(struct gfs2_sbd *sdp); #endif /* __UTIL_DOT_H__ */ diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index 34e9804e0f36..c2f840c49e60 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -21,12 +21,12 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->tree = tree; fd->bnode = NULL; - ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); + ptr = kzalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); if (!ptr) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", tree->cnid, __builtin_return_address(0)); switch (tree->cnid) { case HFS_CAT_CNID: @@ -48,7 +48,7 @@ void hfs_find_exit(struct 
hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; @@ -115,6 +115,12 @@ int hfs_brec_find(struct hfs_find_data *fd) __be32 data; int height, res; + fd->record = -1; + fd->keyoffset = -1; + fd->keylength = -1; + fd->entryoffset = -1; + fd->entrylength = -1; + tree = fd->tree; if (fd->bnode) hfs_bnode_put(fd->bnode); diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c index 28307bc9ec1e..5e84833a4743 100644 --- a/fs/hfs/bitmap.c +++ b/fs/hfs/bitmap.c @@ -158,7 +158,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) } } - hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); + hfs_dbg("pos %u, num_bits %u\n", pos, *num_bits); HFS_SB(sb)->free_ablocks -= *num_bits; hfs_bitmap_dirty(sb); out: @@ -200,7 +200,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) if (!count) return 0; - hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count); + hfs_dbg("start %u, count %u\n", start, count); /* are all of the bits in range? */ if ((start + count) > HFS_SB(sb)->fs_ablocks) return -2; diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index e8cd1a31f247..fcfffe75d84e 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -200,7 +200,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, { struct page *src_page, *dst_page; - hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -221,7 +221,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) struct page *page; void *ptr; - hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -243,16 +243,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg("node %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - hfs_dbg_cont(BNODE_MOD, " %d", key_off); + hfs_dbg(" key_off %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -260,18 +260,18 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1; else tmp = node->tree->max_key_len + 1; - hfs_dbg_cont(BNODE_MOD, " (%d,%d", - tmp, hfs_bnode_read_u8(node, key_off)); + hfs_dbg(" (%d,%d", + tmp, hfs_bnode_read_u8(node, key_off)); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg(", cnid %d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u8(node, key_off); - hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); + hfs_dbg(" (%d)", tmp); } } - hfs_dbg_cont(BNODE_MOD, "\n"); + hfs_dbg("\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -361,7 +361,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + hfs_dbg("cnid %d, node %d, refcnt 1\n", 
node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); @@ -401,7 +401,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -546,7 +546,7 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } @@ -559,7 +559,7 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 896396554bcc..e49a141c87e5 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -94,7 +94,7 @@ again: end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) @@ -179,6 +179,7 @@ int hfs_brec_remove(struct hfs_find_data *fd) struct hfs_btree *tree; struct hfs_bnode *node, *parent; int end_off, rec_off, data_off, size; + int src, dst, len; tree = fd->tree; node = fd->bnode; @@ -191,7 +192,7 @@ again: mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + hfs_dbg("rec %d, len %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); @@ -208,10 +209,14 @@ again: } hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); - if (rec_off == end_off) - goto skip; size = fd->keylength + fd->entrylength; + if (rec_off == end_off) { + src = fd->keyoffset; + hfs_bnode_clear(node, src, size); + goto skip; + } + do { data_off = hfs_bnode_read_u16(node, rec_off); hfs_bnode_write_u16(node, rec_off + 2, data_off - size); @@ -219,9 +224,23 @@ again: } while (rec_off >= end_off); /* fill hole */ - hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size, - data_off - fd->keyoffset - size); + dst = fd->keyoffset; + src = fd->keyoffset + size; + len = data_off - src; + + hfs_bnode_move(node, dst, src, len); + + src = dst + len; + len = data_off - src; + + hfs_bnode_clear(node, src, len); + skip: + /* + * Remove the obsolete offset to free space. 
+ */ + hfs_bnode_write_u16(node, end_off, 0); + hfs_bnode_dump(node); if (!fd->record) hfs_brec_update_parent(fd); @@ -242,7 +261,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg("this %d, new %d, next %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -378,7 +397,7 @@ again: newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1; else fd->keylength = newkeylen = tree->max_key_len + 1; - hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + hfs_dbg("rec %d, keylength %d, newkeylen %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index e86e1e235658..22e62fe7448b 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -364,7 +364,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg("node %u\n", node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index d63880e7d9d6..caebabb6642f 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -87,7 +87,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i int entry_size; int err; - hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + hfs_dbg("name %s, cnid %u, i_nlink %d\n", str->name, cnid, inode->i_nlink); if (dir->i_size >= HFS_MAX_VALENCE) return -ENOSPC; @@ -211,6 +211,124 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid, return hfs_brec_find(fd); } +static inline +void hfs_set_next_unused_CNID(struct super_block *sb, + u32 deleted_cnid, u32 found_cnid) +{ + if (found_cnid < HFS_FIRSTUSER_CNID) { + atomic64_cmpxchg(&HFS_SB(sb)->next_id, + deleted_cnid + 1, HFS_FIRSTUSER_CNID); + } else { + atomic64_cmpxchg(&HFS_SB(sb)->next_id, + deleted_cnid + 1, found_cnid + 1); + } +} + +/* + * hfs_correct_next_unused_CNID() + * + * Correct the next unused CNID of Catalog Tree. 
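+ * The actual update goes through hfs_set_next_unused_CNID() above,
+ * which uses atomic64_cmpxchg() so that next_id only moves if it still
+ * holds the value the deleter expects; in sketch form:
+ *
+ *	atomic64_cmpxchg(&HFS_SB(sb)->next_id,
+ *			 deleted_cnid + 1, found_cnid + 1);
+ *
+ * (clamped up to HFS_FIRSTUSER_CNID when the CNID found lies in the
+ * reserved range). Allocations in hfs_new_inode() advance next_id with
+ * a plain atomic64_inc_return(), so losing the race here is harmless:
+ * it just means the counter already moved on.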
+ */ +static +int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) +{ + struct hfs_btree *cat_tree; + struct hfs_bnode *node; + s64 leaf_head; + s64 leaf_tail; + s64 node_id; + + hfs_dbg("cnid %u, next_id %lld\n", + cnid, atomic64_read(&HFS_SB(sb)->next_id)); + + if ((cnid + 1) < atomic64_read(&HFS_SB(sb)->next_id)) { + /* next ID should be unchanged */ + return 0; + } + + cat_tree = HFS_SB(sb)->cat_tree; + leaf_head = cat_tree->leaf_head; + leaf_tail = cat_tree->leaf_tail; + + if (leaf_head > leaf_tail) { + pr_err("node is corrupted: leaf_head %lld, leaf_tail %lld\n", + leaf_head, leaf_tail); + return -ERANGE; + } + + node = hfs_bnode_find(cat_tree, leaf_tail); + if (IS_ERR(node)) { + pr_err("fail to find leaf node: node ID %lld\n", + leaf_tail); + return -ENOENT; + } + + node_id = leaf_tail; + + do { + int i; + + if (node_id != leaf_tail) { + node = hfs_bnode_find(cat_tree, node_id); + if (IS_ERR(node)) + return -ENOENT; + } + + hfs_dbg("node %lld, leaf_tail %lld, leaf_head %lld\n", + node_id, leaf_tail, leaf_head); + + hfs_bnode_dump(node); + + for (i = node->num_recs - 1; i >= 0; i--) { + hfs_cat_rec rec; + u16 off, len, keylen; + int entryoffset; + int entrylength; + u32 found_cnid; + + len = hfs_brec_lenoff(node, i, &off); + keylen = hfs_brec_keylen(node, i); + if (keylen == 0) { + pr_err("fail to get the keylen: " + "node_id %lld, record index %d\n", + node_id, i); + return -EINVAL; + } + + entryoffset = off + keylen; + entrylength = len - keylen; + + if (entrylength > sizeof(rec)) { + pr_err("unexpected record length: " + "entrylength %d\n", + entrylength); + return -EINVAL; + } + + hfs_bnode_read(node, &rec, entryoffset, entrylength); + + if (rec.type == HFS_CDR_DIR) { + found_cnid = be32_to_cpu(rec.dir.DirID); + hfs_dbg("found_cnid %u\n", found_cnid); + hfs_set_next_unused_CNID(sb, cnid, found_cnid); + hfs_bnode_put(node); + return 0; + } else if (rec.type == HFS_CDR_FIL) { + found_cnid = be32_to_cpu(rec.file.FlNum); + hfs_dbg("found_cnid %u\n", found_cnid); + hfs_set_next_unused_CNID(sb, cnid, found_cnid); + hfs_bnode_put(node); + return 0; + } + } + + hfs_bnode_put(node); + + node_id = node->prev; + } while (node_id >= leaf_head); + + return -ENOENT; +} /* * hfs_cat_delete() @@ -225,7 +343,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) struct hfs_readdir_data *rd; int res, type; - hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + hfs_dbg("name %s, cnid %u\n", str ? 
str->name : NULL, cnid); sb = dir->i_sb; res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (res) @@ -271,6 +389,11 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) dir->i_size--; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); + + res = hfs_correct_next_unused_CNID(sb, cnid); + if (res) + goto out; + res = 0; out: hfs_find_exit(&fd); @@ -294,7 +417,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, int entry_size, type; int err; - hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + hfs_dbg("cnid %u - (ino %lu, name %s) - (ino %lu, name %s)\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); sb = src_dir->i_sb; diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 580c62981dbd..a097908b269d 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -209,12 +209,12 @@ static void hfs_dump_extent(struct hfs_extent *extent) { int i; - hfs_dbg(EXTENT, " "); + hfs_dbg("extent: "); for (i = 0; i < 3; i++) - hfs_dbg_cont(EXTENT, " %u:%u", - be16_to_cpu(extent[i].block), - be16_to_cpu(extent[i].count)); - hfs_dbg_cont(EXTENT, "\n"); + hfs_dbg(" block %u, count %u", + be16_to_cpu(extent[i].block), + be16_to_cpu(extent[i].count)); + hfs_dbg("\n"); } static int hfs_add_extent(struct hfs_extent *extent, u16 offset, @@ -411,10 +411,11 @@ int hfs_extend_file(struct inode *inode) goto out; } - hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { if (!HFS_I(inode)->first_blocks) { - hfs_dbg(EXTENT, "first extents\n"); + hfs_dbg("first_extent: start %u, len %u\n", + start, len); /* no extents yet */ HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); @@ -456,7 +457,7 @@ out: return res; insert_extent: - hfs_dbg(EXTENT, "insert new extent\n"); + hfs_dbg("insert new extent\n"); res = hfs_ext_write_extent(inode); if (res) goto out; @@ -481,7 +482,7 @@ void hfs_file_truncate(struct inode *inode) u32 size; int res; - hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n", + hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)HFS_I(inode)->phys_size, inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 7c5a7ecfa246..fff149af89da 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -9,12 +9,6 @@ #ifndef _LINUX_HFS_FS_H #define _LINUX_HFS_FS_H -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include <linux/slab.h> #include <linux/types.h> #include <linux/mutex.h> @@ -24,35 +18,10 @@ #include <asm/byteorder.h> #include <linux/uaccess.h> +#include <linux/hfs_common.h> #include "hfs.h" -#define DBG_BNODE_REFS 0x00000001 -#define DBG_BNODE_MOD 0x00000002 -#define DBG_CAT_MOD 0x00000004 -#define DBG_INODE 0x00000008 -#define DBG_SUPER 0x00000010 -#define DBG_EXTENT 0x00000020 -#define DBG_BITMAP 0x00000040 - -//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD|DBG_CAT_MOD|DBG_BITMAP) -//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) -//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) -#define DBG_MASK (0) - -#define hfs_dbg(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -} while (0) - -#define hfs_dbg_cont(flg, fmt, ...) 
\ -do { \ - if (DBG_##flg & DBG_MASK) \ - pr_cont(fmt, ##__VA_ARGS__); \ -} while (0) - - /* * struct hfs_inode_info * @@ -112,13 +81,13 @@ struct hfs_sb_info { the extents b-tree */ struct hfs_btree *cat_tree; /* Information about the catalog b-tree */ - u32 file_count; /* The number of + atomic64_t file_count; /* The number of regular files in the filesystem */ - u32 folder_count; /* The number of + atomic64_t folder_count; /* The number of directories in the filesystem */ - u32 next_id; /* The next available + atomic64_t next_id; /* The next available file id number */ u32 clumpablks; /* The number of allocation blocks to try to add when diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index bf4cb7e78396..9cd449913dc8 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -183,6 +183,10 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t { struct super_block *sb = dir->i_sb; struct inode *inode = new_inode(sb); + s64 next_id; + s64 file_count; + s64 folder_count; + if (!inode) return NULL; @@ -190,7 +194,9 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); spin_lock_init(&HFS_I(inode)->open_dir_lock); hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); - inode->i_ino = HFS_SB(sb)->next_id++; + next_id = atomic64_inc_return(&HFS_SB(sb)->next_id); + BUG_ON(next_id > U32_MAX); + inode->i_ino = (u32)next_id; inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); @@ -202,7 +208,8 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; if (S_ISDIR(mode)) { inode->i_size = 2; - HFS_SB(sb)->folder_count++; + folder_count = atomic64_inc_return(&HFS_SB(sb)->folder_count); + BUG_ON(folder_count > U32_MAX); if (dir->i_ino == HFS_ROOT_CNID) HFS_SB(sb)->root_dirs++; inode->i_op = &hfs_dir_inode_operations; @@ -211,7 +218,8 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask; } else if (S_ISREG(mode)) { HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; - HFS_SB(sb)->file_count++; + file_count = atomic64_inc_return(&HFS_SB(sb)->file_count); + BUG_ON(file_count > U32_MAX); if (dir->i_ino == HFS_ROOT_CNID) HFS_SB(sb)->root_files++; inode->i_op = &hfs_file_inode_operations; @@ -241,16 +249,19 @@ void hfs_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; - hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); if (S_ISDIR(inode->i_mode)) { - HFS_SB(sb)->folder_count--; + BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX); + atomic64_dec(&HFS_SB(sb)->folder_count); if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) HFS_SB(sb)->root_dirs--; set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); return; } - HFS_SB(sb)->file_count--; + + BUG_ON(atomic64_read(&HFS_SB(sb)->file_count) > U32_MAX); + atomic64_dec(&HFS_SB(sb)->file_count); if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) HFS_SB(sb)->root_files--; if (S_ISREG(inode->i_mode)) { @@ -425,7 +436,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_cat_rec rec; int res; - hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); res = hfs_ext_write_extent(inode); if (res) return res; diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 8082eb01127c..53f3fae60217 
100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -150,11 +150,11 @@ int hfs_mdb_get(struct super_block *sb) /* These parameters are read from and written to the MDB */ HFS_SB(sb)->free_ablocks = be16_to_cpu(mdb->drFreeBks); - HFS_SB(sb)->next_id = be32_to_cpu(mdb->drNxtCNID); + atomic64_set(&HFS_SB(sb)->next_id, be32_to_cpu(mdb->drNxtCNID)); HFS_SB(sb)->root_files = be16_to_cpu(mdb->drNmFls); HFS_SB(sb)->root_dirs = be16_to_cpu(mdb->drNmRtDirs); - HFS_SB(sb)->file_count = be32_to_cpu(mdb->drFilCnt); - HFS_SB(sb)->folder_count = be32_to_cpu(mdb->drDirCnt); + atomic64_set(&HFS_SB(sb)->file_count, be32_to_cpu(mdb->drFilCnt)); + atomic64_set(&HFS_SB(sb)->folder_count, be32_to_cpu(mdb->drDirCnt)); /* TRY to get the alternate (backup) MDB. */ sect = part_start + part_size - 2; @@ -172,7 +172,7 @@ int hfs_mdb_get(struct super_block *sb) pr_warn("continuing without an alternate MDB\n"); } - HFS_SB(sb)->bitmap = kmalloc(8192, GFP_KERNEL); + HFS_SB(sb)->bitmap = kzalloc(8192, GFP_KERNEL); if (!HFS_SB(sb)->bitmap) goto out; @@ -273,11 +273,17 @@ void hfs_mdb_commit(struct super_block *sb) /* These parameters may have been modified, so write them back */ mdb->drLsMod = hfs_mtime(); mdb->drFreeBks = cpu_to_be16(HFS_SB(sb)->free_ablocks); - mdb->drNxtCNID = cpu_to_be32(HFS_SB(sb)->next_id); + BUG_ON(atomic64_read(&HFS_SB(sb)->next_id) > U32_MAX); + mdb->drNxtCNID = + cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->next_id)); mdb->drNmFls = cpu_to_be16(HFS_SB(sb)->root_files); mdb->drNmRtDirs = cpu_to_be16(HFS_SB(sb)->root_dirs); - mdb->drFilCnt = cpu_to_be32(HFS_SB(sb)->file_count); - mdb->drDirCnt = cpu_to_be32(HFS_SB(sb)->folder_count); + BUG_ON(atomic64_read(&HFS_SB(sb)->file_count) > U32_MAX); + mdb->drFilCnt = + cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->file_count)); + BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX); + mdb->drDirCnt = + cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->folder_count)); /* write MDB to disk */ mark_buffer_dirty(HFS_SB(sb)->mdb_bh); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 388a318297ec..47f50fa555a4 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -319,6 +319,10 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc) int silent = fc->sb_flags & SB_SILENT; int res; + atomic64_set(&sbi->file_count, 0); + atomic64_set(&sbi->folder_count, 0); + atomic64_set(&sbi->next_id, 0); + /* load_nls_default does not fail */ if (sbi->nls_disk && !sbi->nls_io) sbi->nls_io = load_nls_default(); diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index eeebe80c6be4..ba26980cc503 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -139,7 +139,7 @@ int hfsplus_find_attr(struct super_block *sb, u32 cnid, { int err = 0; - hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); + hfs_dbg("name %s, cnid %d\n", name ? name : NULL, cnid); if (!HFSPLUS_SB(sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); @@ -201,7 +201,7 @@ int hfsplus_create_attr(struct inode *inode, int entry_size; int err; - hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n", + hfs_dbg("name %s, ino %ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -310,7 +310,7 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) struct super_block *sb = inode->i_sb; struct hfs_find_data fd; - hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n", + hfs_dbg("name %s, ino %ld\n", name ? 
name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -356,7 +356,7 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) int err = 0; struct hfs_find_data fd; - hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid); + hfs_dbg("cnid %d\n", cnid); if (!HFSPLUS_SB(dir->i_sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 901e83d65d20..afc9c89e8c6a 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -18,12 +18,12 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->tree = tree; fd->bnode = NULL; - ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); + ptr = kzalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); if (!ptr) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", tree->cnid, __builtin_return_address(0)); mutex_lock_nested(&tree->tree_lock, hfsplus_btree_lock_class(tree)); @@ -34,7 +34,7 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; @@ -158,6 +158,12 @@ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) __be32 data; int height, res; + fd->record = -1; + fd->keyoffset = -1; + fd->keylength = -1; + fd->entryoffset = -1; + fd->entrylength = -1; + tree = fd->tree; if (fd->bnode) hfs_bnode_put(fd->bnode); diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index bd8dcea85588..1b3af8c87cad 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -31,7 +31,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, if (!len) return size; - hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); + hfs_dbg("size %u, offset %u, len %u\n", size, offset, len); mutex_lock(&sbi->alloc_mutex); mapping = sbi->alloc_file->i_mapping; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -90,14 +90,14 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, else end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32; } - hfs_dbg(BITMAP, "bitmap full\n"); + hfs_dbg("bitmap full\n"); start = size; goto out; found: start = offset + (curr - pptr) * 32 + i; if (start >= size) { - hfs_dbg(BITMAP, "bitmap full\n"); + hfs_dbg("bitmap full\n"); goto out; } /* do any partial u32 at the start */ @@ -155,7 +155,7 @@ done: *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; hfsplus_mark_mdb_dirty(sb); - hfs_dbg(BITMAP, "-> %u,%u\n", start, *max); + hfs_dbg("start %u, max %u\n", start, *max); out: mutex_unlock(&sbi->alloc_mutex); return start; @@ -174,7 +174,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) return 0; - hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count); + hfs_dbg("offset %u, count %u\n", offset, count); /* are all of the bits in range? 
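
hfs_find_init() switching from kmalloc() to kzalloc() and hfs_brec_find() pre-setting every result field to -1 apply the same defensive idea: a failed or partial search must not leave a previous search's offsets behind for a caller that forgets to check the error. A reduced sketch of the pattern (hypothetical names, not the kernel structures):

	/* Poison the lookup descriptor before searching, so stale fields
	 * read after a failed search are obviously invalid (-1) rather
	 * than plausible leftovers from an earlier hit. */
	struct find_data {
		int record;
		int keyoffset, keylength;
		int entryoffset, entrylength;
	};

	static void find_data_poison(struct find_data *fd)
	{
		fd->record = -1;
		fd->keyoffset = -1;
		fd->keylength = -1;
		fd->entryoffset = -1;
		fd->entrylength = -1;
	}

	static int brec_find(struct find_data *fd)
	{
		find_data_poison(fd);	/* invalidate results up front */
		/* ... walk the tree; fill the fields only on success ... */
		return -1;		/* "not found" in this sketch */
	}
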
*/ if ((offset + count) > sbi->total_blocks) return -ENOENT; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 14f4995588ff..63e652ad1e0d 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -18,47 +18,6 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" -static inline -bool is_bnode_offset_valid(struct hfs_bnode *node, int off) -{ - bool is_valid = off < node->tree->node_size; - - if (!is_valid) { - pr_err("requested invalid offset: " - "NODE: id %u, type %#x, height %u, " - "node_size %u, offset %d\n", - node->this, node->type, node->height, - node->tree->node_size, off); - } - - return is_valid; -} - -static inline -int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len) -{ - unsigned int node_size; - - if (!is_bnode_offset_valid(node, off)) - return 0; - - node_size = node->tree->node_size; - - if ((off + len) > node_size) { - int new_len = (int)node_size - off; - - pr_err("requested length has been corrected: " - "NODE: id %u, type %#x, height %u, " - "node_size %u, offset %d, " - "requested_len %d, corrected_len %d\n", - node->this, node->type, node->height, - node->tree->node_size, off, len, new_len); - - return new_len; - } - - return len; -} /* Copy a specified range of bytes from the raw data of a node */ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) @@ -214,7 +173,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct page **src_page, **dst_page; int l; - hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -272,7 +231,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) void *src_ptr, *dst_ptr; int l; - hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -392,16 +351,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg("node %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - hfs_dbg(BNODE_MOD, " %d", key_off); + hfs_dbg(" key_off %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -410,17 +369,17 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = hfs_bnode_read_u16(node, key_off) + 2; else tmp = node->tree->max_key_len + 2; - hfs_dbg_cont(BNODE_MOD, " (%d", tmp); + hfs_dbg(" (%d", tmp); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg(", cnid %d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u16(node, key_off); - hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); + hfs_dbg(" (%d)", tmp); } } - hfs_dbg_cont(BNODE_MOD, "\n"); + hfs_dbg("\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -456,7 +415,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node) /* move down? 
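
is_bnode_offset_valid() and check_and_correct_requested_length() leave bnode.c here and reappear as inlines in hfsplus_fs.h, so that btree.c (see the hfs_bmap_alloc() hunk further down) can clamp on-disk offsets before touching node pages. The clamp itself, reduced to standalone C, with node_size standing in for tree->node_size:

	#include <stdio.h>

	/* Reject offsets outside the node and shorten lengths that would
	 * run past its end, logging instead of trusting on-disk metadata. */
	static int clamp_request(int node_size, int off, int len)
	{
		if (off < 0 || off >= node_size) {
			fprintf(stderr, "invalid offset %d (node size %d)\n",
				off, node_size);
			return 0;	/* nothing at this offset is safe */
		}
		if (off + len > node_size) {
			int corrected = node_size - off;

			fprintf(stderr, "len %d corrected to %d\n",
				len, corrected);
			return corrected;
		}
		return len;
	}
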
*/ if (!node->prev && !node->next) - hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n"); + hfs_dbg("btree delete level\n"); if (!node->parent) { tree->root = 0; tree->depth = 0; @@ -511,7 +470,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + hfs_dbg("cnid %d, node %d, refcnt 1\n", node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); @@ -551,7 +510,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -697,7 +656,7 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } @@ -710,7 +669,7 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 1918544a7871..b4645102feec 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -92,7 +92,7 @@ again: end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) @@ -193,7 +193,7 @@ again: mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + hfs_dbg("rec %d, len %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); @@ -246,7 +246,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg("this %d - new %d - next %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -383,7 +383,7 @@ again: newkeylen = hfs_bnode_read_u16(node, 14) + 2; else fd->keylength = newkeylen = tree->max_key_len + 2; - hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + hfs_dbg("rec %d, keylength %d, newkeylen %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; @@ -395,7 +395,7 @@ again: end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { - hfs_dbg(BNODE_MOD, "splitting index node\n"); + hfs_dbg("splitting index node\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 9e1732a2b92a..7cc5aea14572 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -393,6 +393,12 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) len = hfs_brec_lenoff(node, 2, &off16); off = off16; + if (!is_bnode_offset_valid(node, off)) { + hfs_bnode_put(node); + return ERR_PTR(-EIO); + } + len = check_and_correct_requested_length(node, off, len); + off += node->page_offset; pagep = 
node->page + (off >> PAGE_SHIFT); data = kmap_local_page(*pagep); @@ -428,7 +434,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) kunmap_local(data); nidx = node->next; if (!nidx) { - hfs_dbg(BNODE_MOD, "create new bmap node\n"); + hfs_dbg("create new bmap node\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); @@ -454,7 +460,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg("node %u\n", node->this); BUG_ON(!node->this); tree = node->tree; nidx = node->this; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 1995bafee839..02c1eee4a4b8 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -259,7 +259,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, int entry_size; int err; - hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + hfs_dbg("name %s, cnid %u, i_nlink %d\n", str->name, cnid, inode->i_nlink); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) @@ -336,7 +336,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str) int err, off; u16 type; - hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + hfs_dbg("name %s, cnid %u\n", str ? str->name : NULL, cnid); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return err; @@ -441,7 +441,7 @@ int hfsplus_rename_cat(u32 cnid, int entry_size, type; int err; - hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + hfs_dbg("cnid %u - ino %lu, name %s - ino %lu, name %s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 876bbb80fb4d..1b3e27a0d5e0 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -204,7 +204,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) fd.entrylength); type = be16_to_cpu(entry.type); len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN; - err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); + err = hfsplus_uni2asc_str(sb, &fd.key->cat.name, strbuf, &len); if (err) goto out; if (type == HFSPLUS_FOLDER) { diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index b1699b3c246a..8e886514d27f 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -275,7 +275,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, mutex_unlock(&hip->extents_lock); done: - hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n", + hfs_dbg("ino %lu, iblock %llu - dblock %u\n", inode->i_ino, (long long)iblock, dblock); mask = (1 << sbi->fs_shift) - 1; @@ -298,12 +298,12 @@ static void hfsplus_dump_extent(struct hfsplus_extent *extent) { int i; - hfs_dbg(EXTENT, " "); + hfs_dbg("extent "); for (i = 0; i < 8; i++) - hfs_dbg_cont(EXTENT, " %u:%u", - be32_to_cpu(extent[i].start_block), - be32_to_cpu(extent[i].block_count)); - hfs_dbg_cont(EXTENT, "\n"); + hfs_dbg(" start_block %u, block_count %u", + be32_to_cpu(extent[i].start_block), + be32_to_cpu(extent[i].block_count)); + hfs_dbg("\n"); } static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset, @@ -359,8 +359,7 @@ found: if (count <= block_nr) { err = hfsplus_block_free(sb, start, count); if (err) { - pr_err("can't free extent\n"); - hfs_dbg(EXTENT, " start: %u count: %u\n", + pr_err("can't free extent: start %u, count %u\n", start, count); } extent->block_count = 0; @@ -370,8 +369,7 @@ found: count -= block_nr; err = hfsplus_block_free(sb, start + count, block_nr); if 
(err) { - pr_err("can't free extent\n"); - hfs_dbg(EXTENT, " start: %u count: %u\n", + pr_err("can't free extent: start %u, count %u\n", start, count); } extent->block_count = cpu_to_be32(count); @@ -478,11 +476,12 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout) goto out; } - hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); if (hip->alloc_blocks <= hip->first_blocks) { if (!hip->first_blocks) { - hfs_dbg(EXTENT, "first extents\n"); + hfs_dbg("first_extent: start %u, len %u\n", + start, len); /* no extents yet */ hip->first_extents[0].start_block = cpu_to_be32(start); hip->first_extents[0].block_count = cpu_to_be32(len); @@ -521,7 +520,7 @@ out: return res; insert_extent: - hfs_dbg(EXTENT, "insert new extent\n"); + hfs_dbg("insert new extent\n"); res = hfsplus_ext_write_extent_locked(inode); if (res) goto out; @@ -546,7 +545,7 @@ void hfsplus_file_truncate(struct inode *inode) u32 alloc_cnt, blk_cnt, start; int res; - hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n", + hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)hip->phys_size, inode->i_size); if (inode->i_size > hip->phys_size) { diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 96a5c24813dd..89e8b19c127b 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -11,47 +11,14 @@ #ifndef _LINUX_HFSPLUS_FS_H #define _LINUX_HFSPLUS_FS_H -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include <linux/fs.h> #include <linux/mutex.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/fs_context.h> +#include <linux/hfs_common.h> #include "hfsplus_raw.h" -#define DBG_BNODE_REFS 0x00000001 -#define DBG_BNODE_MOD 0x00000002 -#define DBG_CAT_MOD 0x00000004 -#define DBG_INODE 0x00000008 -#define DBG_SUPER 0x00000010 -#define DBG_EXTENT 0x00000020 -#define DBG_BITMAP 0x00000040 -#define DBG_ATTR_MOD 0x00000080 - -#if 0 -#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) -#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) -#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) -#endif -#define DBG_MASK (0) - -#define hfs_dbg(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -} while (0) - -#define hfs_dbg_cont(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - pr_cont(fmt, ##__VA_ARGS__); \ -} while (0) - /* Runtime config options */ #define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' 
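
With the per-subsystem DBG_* bits gone from both hfs_fs.h and hfsplus_fs.h, every call site now uses a single hfs_dbg() shared via <linux/hfs_common.h>; the diff does not show that header, so the following is only a plausible userspace analogue of the direction the call sites suggest: one unconditional macro that always compiles its arguments (catching format-string rot) instead of eight flag bits behind a mask that defaults to 0.

	#include <stdio.h>

	/* GNU C (as the kernel itself uses) for the ##__VA_ARGS__ form. */
	#define hfs_dbg(fmt, ...) \
		fprintf(stderr, "hfs: %s(): " fmt, __func__, ##__VA_ARGS__)

	int main(void)
	{
		hfs_dbg("ino %lu\n", 42UL);
		return 0;
	}
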
*/ @@ -521,8 +488,12 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, const struct hfsplus_unistr *s2); int hfsplus_strcmp(const struct hfsplus_unistr *s1, const struct hfsplus_unistr *s2); -int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, - char *astr, int *len_p); +int hfsplus_uni2asc_str(struct super_block *sb, + const struct hfsplus_unistr *ustr, char *astr, + int *len_p); +int hfsplus_uni2asc_xattr_str(struct super_block *sb, + const struct hfsplus_attr_unistr *ustr, + char *astr, int *len_p); int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, int max_unistr_len, const char *astr, int len); int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str); @@ -577,6 +548,48 @@ hfsplus_btree_lock_class(struct hfs_btree *tree) return class; } +static inline +bool is_bnode_offset_valid(struct hfs_bnode *node, int off) +{ + bool is_valid = off < node->tree->node_size; + + if (!is_valid) { + pr_err("requested invalid offset: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d\n", + node->this, node->type, node->height, + node->tree->node_size, off); + } + + return is_valid; +} + +static inline +int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len) +{ + unsigned int node_size; + + if (!is_bnode_offset_valid(node, off)) + return 0; + + node_size = node->tree->node_size; + + if ((off + len) > node_size) { + int new_len = (int)node_size - off; + + pr_err("requested length has been corrected: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d, " + "requested_len %d, corrected_len %d\n", + node->this, node->type, node->height, + node->tree->node_size, off, len, new_len); + + return new_len; + } + + return len; +} + /* compatibility */ #define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) } #define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 86351bdc8985..16bc4abc67e0 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -68,13 +68,26 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) if (!(inode->i_state & I_NEW)) return inode; - INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); - spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock); - mutex_init(&HFSPLUS_I(inode)->extents_lock); - HFSPLUS_I(inode)->flags = 0; + atomic_set(&HFSPLUS_I(inode)->opencnt, 0); + HFSPLUS_I(inode)->first_blocks = 0; + HFSPLUS_I(inode)->clump_blocks = 0; + HFSPLUS_I(inode)->alloc_blocks = 0; + HFSPLUS_I(inode)->cached_start = U32_MAX; + HFSPLUS_I(inode)->cached_blocks = 0; + memset(HFSPLUS_I(inode)->first_extents, 0, sizeof(hfsplus_extent_rec)); + memset(HFSPLUS_I(inode)->cached_extents, 0, sizeof(hfsplus_extent_rec)); HFSPLUS_I(inode)->extent_state = 0; + mutex_init(&HFSPLUS_I(inode)->extents_lock); HFSPLUS_I(inode)->rsrc_inode = NULL; - atomic_set(&HFSPLUS_I(inode)->opencnt, 0); + HFSPLUS_I(inode)->create_date = 0; + HFSPLUS_I(inode)->linkid = 0; + HFSPLUS_I(inode)->flags = 0; + HFSPLUS_I(inode)->fs_blocks = 0; + HFSPLUS_I(inode)->userflags = 0; + HFSPLUS_I(inode)->subfolders = 0; + INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); + spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock); + HFSPLUS_I(inode)->phys_size = 0; if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || inode->i_ino == HFSPLUS_ROOT_CNID) { @@ -150,7 +163,7 @@ static int hfsplus_write_inode(struct inode *inode, { int err; - hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); err = 
hfsplus_ext_write_extent(inode); if (err) @@ -165,7 +178,7 @@ static int hfsplus_write_inode(struct inode *inode, static void hfsplus_evict_inode(struct inode *inode) { - hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { @@ -184,7 +197,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) if (!wait) return 0; - hfs_dbg(SUPER, "hfsplus_sync_fs\n"); + hfs_dbg("starting...\n"); /* * Explicitly write out the special metadata inodes. @@ -215,6 +228,10 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) vhdr->folder_count = cpu_to_be32(sbi->folder_count); vhdr->file_count = cpu_to_be32(sbi->file_count); + hfs_dbg("free_blocks %u, next_cnid %u, folder_count %u, file_count %u\n", + sbi->free_blocks, sbi->next_cnid, + sbi->folder_count, sbi->file_count); + if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr)); write_backup = 1; @@ -240,6 +257,8 @@ out: if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(sb->s_bdev); + hfs_dbg("finished: err %d\n", error); + return error; } @@ -288,7 +307,7 @@ static void hfsplus_put_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); - hfs_dbg(SUPER, "hfsplus_put_super\n"); + hfs_dbg("starting...\n"); cancel_delayed_work_sync(&sbi->sync_work); @@ -310,6 +329,8 @@ static void hfsplus_put_super(struct super_block *sb) kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); call_rcu(&sbi->rcu, delayed_free); + + hfs_dbg("finished\n"); } static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -524,7 +545,7 @@ static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc) if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) { - err = -EINVAL; + err = -EIO; goto out_put_root; } inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 36b6cf2a3abb..11e08a4a18b2 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -40,6 +40,18 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, p1 = s1->unicode; p2 = s2->unicode; + if (len1 > HFSPLUS_MAX_STRLEN) { + len1 = HFSPLUS_MAX_STRLEN; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(s1->length), len1); + } + + if (len2 > HFSPLUS_MAX_STRLEN) { + len2 = HFSPLUS_MAX_STRLEN; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(s2->length), len2); + } + while (1) { c1 = c2 = 0; @@ -74,6 +86,18 @@ int hfsplus_strcmp(const struct hfsplus_unistr *s1, p1 = s1->unicode; p2 = s2->unicode; + if (len1 > HFSPLUS_MAX_STRLEN) { + len1 = HFSPLUS_MAX_STRLEN; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(s1->length), len1); + } + + if (len2 > HFSPLUS_MAX_STRLEN) { + len2 = HFSPLUS_MAX_STRLEN; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(s2->length), len2); + } + for (len = min(len1, len2); len > 0; len--) { c1 = be16_to_cpu(*p1); c2 = be16_to_cpu(*p2); @@ -119,9 +143,8 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc) return NULL; } -int hfsplus_uni2asc(struct super_block *sb, - const struct hfsplus_unistr *ustr, - char *astr, int *len_p) +static int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, + int max_len, char *astr, int *len_p) { const hfsplus_unichr *ip; struct nls_table *nls 
= HFSPLUS_SB(sb)->nls; @@ -134,8 +157,8 @@ int hfsplus_uni2asc(struct super_block *sb, ip = ustr->unicode; ustrlen = be16_to_cpu(ustr->length); - if (ustrlen > HFSPLUS_MAX_STRLEN) { - ustrlen = HFSPLUS_MAX_STRLEN; + if (ustrlen > max_len) { + ustrlen = max_len; pr_err("invalid length %u has been corrected to %d\n", be16_to_cpu(ustr->length), ustrlen); } @@ -256,6 +279,21 @@ out: return res; } +inline int hfsplus_uni2asc_str(struct super_block *sb, + const struct hfsplus_unistr *ustr, char *astr, + int *len_p) +{ + return hfsplus_uni2asc(sb, ustr, HFSPLUS_MAX_STRLEN, astr, len_p); +} + +inline int hfsplus_uni2asc_xattr_str(struct super_block *sb, + const struct hfsplus_attr_unistr *ustr, + char *astr, int *len_p) +{ + return hfsplus_uni2asc(sb, (const struct hfsplus_unistr *)ustr, + HFSPLUS_ATTR_MAX_STRLEN, astr, len_p); +} + /* * Convert one or more ASCII characters into a single unicode character. * Returns the number of ASCII characters corresponding to the unicode char. diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 18dc3d254d21..ece4d29c0ab9 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -64,7 +64,7 @@ static void hfsplus_init_header_node(struct inode *attr_file, u32 used_bmp_bytes; u64 tmp; - hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n", + hfs_dbg("clump %u, node_size %u\n", clump_size, node_size); /* The end of the node contains list of record offsets */ @@ -132,7 +132,7 @@ static int hfsplus_create_attributes_file(struct super_block *sb) struct page *page; int old_state = HFSPLUS_EMPTY_ATTR_TREE; - hfs_dbg(ATTR_MOD, "create_attr_file: ino %d\n", HFSPLUS_ATTR_CNID); + hfs_dbg("ino %d\n", HFSPLUS_ATTR_CNID); check_attr_tree_state_again: switch (atomic_read(&sbi->attr_tree_state)) { @@ -735,9 +735,9 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) goto end_listxattr; xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN; - if (hfsplus_uni2asc(inode->i_sb, - (const struct hfsplus_unistr *)&fd.key->attr.key_name, - strbuf, &xattr_name_len)) { + if (hfsplus_uni2asc_xattr_str(inode->i_sb, + &fd.key->attr.key_name, strbuf, + &xattr_name_len)) { pr_err("unicode conversion failed\n"); res = -EIO; goto end_listxattr; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 01e516175bcd..1e1acf5775ab 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -261,7 +261,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations hostfs_sbops = { .alloc_inode = hostfs_alloc_inode, .free_inode = hostfs_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = hostfs_evict_inode, .statfs = hostfs_statfs, .show_options = hostfs_show_options, diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index a59e8fa630db..34008442ee26 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i) struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct inode *parent; if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; - if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { + if (hpfs_inode->i_rddir_off && !icount_read(i)) { if (*hpfs_inode->i_rddir_off) pr_err("write_inode: some position still there\n"); kfree(hpfs_inode->i_rddir_off); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 09d4baef29cf..be4be99304bc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -517,14 +517,16 @@ static bool remove_inode_single_folio(struct hstate *h, 
struct inode *inode,
 
 	/*
 	 * If folio is mapped, it was faulted in after being
-	 * unmapped in caller. Unmap (again) while holding
-	 * the fault mutex. The mutex will prevent faults
-	 * until we finish removing the folio.
+	 * unmapped in caller or hugetlb_vmdelete_list() skips
+	 * unmapping it due to failing to grab the lock. Unmap (again)
+	 * while holding the fault mutex. The mutex will prevent
+	 * faults until we finish removing the folio. Hold folio
+	 * lock to guarantee no concurrent migration.
 	 */
+	folio_lock(folio);
 	if (unlikely(folio_mapped(folio)))
 		hugetlb_unmap_file_folio(h, mapping, folio, index);
 
-	folio_lock(folio);
 	/*
 	 * We must remove the folio from page cache before removing
 	 * the region/ reserve map (hugetlb_unreserve_pages). In
diff --git a/fs/init.c b/fs/init.c
index eef5124885e3..07f592ccdba8 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -149,7 +149,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
 	else if (!(S_ISBLK(mode) || S_ISCHR(mode)))
 		return -EINVAL;
 
-	dentry = kern_path_create(AT_FDCWD, filename, &path, 0);
+	dentry = start_creating_path(AT_FDCWD, filename, &path, 0);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
@@ -158,7 +158,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
 	if (!error)
 		error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode,
 				  dentry, mode, new_decode_dev(dev));
-	done_path_create(&path, dentry);
+	end_creating_path(&path, dentry);
 	return error;
 }
 
@@ -173,7 +173,7 @@ int __init init_link(const char *oldname, const char *newname)
 	if (error)
 		return error;
 
-	new_dentry = kern_path_create(AT_FDCWD, newname, &new_path, 0);
+	new_dentry = start_creating_path(AT_FDCWD, newname, &new_path, 0);
 	error = PTR_ERR(new_dentry);
 	if (IS_ERR(new_dentry))
 		goto out;
@@ -191,7 +191,7 @@ int __init init_link(const char *oldname, const char *newname)
 	error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
 			 new_dentry, NULL);
 out_dput:
-	done_path_create(&new_path, new_dentry);
+	end_creating_path(&new_path, new_dentry);
 out:
 	path_put(&old_path);
 	return error;
@@ -203,14 +203,14 @@ int __init init_symlink(const char *oldname, const char *newname)
 	struct path path;
 	int error;
 
-	dentry = kern_path_create(AT_FDCWD, newname, &path, 0);
+	dentry = start_creating_path(AT_FDCWD, newname, &path, 0);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 	error = security_path_symlink(&path, dentry, oldname);
 	if (!error)
 		error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
 				    dentry, oldname);
-	done_path_create(&path, dentry);
+	end_creating_path(&path, dentry);
 	return error;
 }
 
@@ -225,7 +225,8 @@ int __init init_mkdir(const char *pathname, umode_t mode)
 	struct path path;
 	int error;
 
-	dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
+	dentry = start_creating_path(AT_FDCWD, pathname, &path,
+				     LOOKUP_DIRECTORY);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 	mode = mode_strip_umask(d_inode(path.dentry), mode);
@@ -236,7 +237,7 @@ int __init init_mkdir(const char *pathname, umode_t mode)
 		if (IS_ERR(dentry))
 			error = PTR_ERR(dentry);
 	}
-	done_path_create(&path, dentry);
+	end_creating_path(&path, dentry);
 	return error;
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index 01ebdc40021e..ec9339024ac3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -534,7 +534,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
 {
 	if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
 		return;
-	if (atomic_read(&inode->i_count))
+	if (icount_read(inode))
 		return;
 	if (!(inode->i_sb->s_flags & SB_ACTIVE))
 		return;
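
The hugetlbfs hunk above moves folio_lock() ahead of the folio_mapped() test, so the test runs under the lock a migrator holds while changing the mapping state. A userspace reduction of the lock-then-test ordering, with a pthread mutex standing in for the folio lock:

	#include <pthread.h>
	#include <stdbool.h>

	struct folio_state {
		pthread_mutex_t lock;
		bool mapped;
	};

	static void remove_one(struct folio_state *f)
	{
		pthread_mutex_lock(&f->lock);	/* lock first ... */
		if (f->mapped)			/* ... then test, race-free */
			f->mapped = false;	/* stand-in for unmap + removal */
		pthread_mutex_unlock(&f->lock);
	}
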
@@ -550,11 +550,11 @@ static void __inode_add_lru(struct inode *inode, bool rotate) struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit) { - void *bit_address; + void *bit_address; - bit_address = inode_state_wait_address(inode, bit); - init_wait_var_entry(wqe, bit_address, 0); - return __var_waitqueue(bit_address); + bit_address = inode_state_wait_address(inode, bit); + init_wait_var_entry(wqe, bit_address, 0); + return __var_waitqueue(bit_address); } EXPORT_SYMBOL(inode_bit_waitqueue); @@ -871,11 +871,11 @@ void evict_inodes(struct super_block *sb) again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (atomic_read(&inode->i_count)) + if (icount_read(inode)) continue; spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_count)) { + if (icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } @@ -937,7 +937,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * unreclaimable for a while. Remove them lazily here; iput, * sync, or the last page cache deletion will requeue them. */ - if (atomic_read(&inode->i_count) || + if (icount_read(inode) || (inode->i_state & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); @@ -1279,6 +1279,8 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; + might_sleep(); + again: spin_lock(&inode_hash_lock); old = find_inode(inode->i_sb, head, test, data, true); @@ -1382,6 +1384,8 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *new; + might_sleep(); + again: inode = find_inode(sb, head, test, data, false); if (inode) { @@ -1422,6 +1426,9 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + + might_sleep(); + again: inode = find_inode_fast(sb, head, ino, false); if (inode) { @@ -1605,6 +1612,9 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; + + might_sleep(); + again: inode = ilookup5_nowait(sb, hashval, test, data); if (inode) { @@ -1630,6 +1640,9 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + + might_sleep(); + again: inode = find_inode_fast(sb, head, ino, false); @@ -1780,6 +1793,8 @@ int insert_inode_locked(struct inode *inode) ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); + might_sleep(); + while (1) { struct inode *old = NULL; spin_lock(&inode_hash_lock); @@ -1826,6 +1841,8 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, { struct inode *old; + might_sleep(); + inode->i_state |= I_CREATING; old = inode_insert5(inode, hashval, test, NULL, data); @@ -1838,11 +1855,11 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, EXPORT_SYMBOL(insert_inode_locked4); -int generic_delete_inode(struct inode *inode) +int inode_just_drop(struct inode *inode) { return 1; } -EXPORT_SYMBOL(generic_delete_inode); +EXPORT_SYMBOL(inode_just_drop); /* * Called when we're dropping the last reference @@ -1866,7 +1883,7 @@ static void iput_final(struct inode *inode) if (op->drop_inode) drop = 
op->drop_inode(inode);
 	else
-		drop = generic_drop_inode(inode);
+		drop = inode_generic_drop(inode);
 
 	if (!drop &&
 	    !(inode->i_state & I_DONTCACHE) &&
@@ -1908,20 +1925,45 @@
  */
 void iput(struct inode *inode)
 {
-	if (!inode)
+	might_sleep();
+
+	if (unlikely(!inode))
 		return;
-	BUG_ON(inode->i_state & I_CLEAR);
+
 retry:
-	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
-		if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
-			atomic_inc(&inode->i_count);
-			spin_unlock(&inode->i_lock);
-			trace_writeback_lazytime_iput(inode);
-			mark_inode_dirty_sync(inode);
-			goto retry;
-		}
-		iput_final(inode);
+	lockdep_assert_not_held(&inode->i_lock);
+	VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode);
+	/*
+	 * Note this assert is technically racy as if the count is bogusly
+	 * equal to one, then two CPUs racing to further drop it can both
+	 * conclude it's fine.
+	 */
+	VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode);
+
+	if (atomic_add_unless(&inode->i_count, -1, 1))
+		return;
+
+	if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) {
+		trace_writeback_lazytime_iput(inode);
+		mark_inode_dirty_sync(inode);
+		goto retry;
+	}
+
+	spin_lock(&inode->i_lock);
+	if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) {
+		spin_unlock(&inode->i_lock);
+		goto retry;
 	}
+
+	if (!atomic_dec_and_test(&inode->i_count)) {
+		spin_unlock(&inode->i_lock);
+		return;
+	}
+
+	/*
+	 * iput_final() drops ->i_lock, we can't assert on it as the inode may
+	 * be deallocated by the time the call returns.
+	 */
+	iput_final(inode);
 }
 EXPORT_SYMBOL(iput);
 
@@ -2189,7 +2231,7 @@ static int __remove_privs(struct mnt_idmap *idmap,
 	return notify_change(idmap, dentry, &newattrs, NULL);
 }
 
-int file_remove_privs_flags(struct file *file, unsigned int flags)
+static int file_remove_privs_flags(struct file *file, unsigned int flags)
 {
 	struct dentry *dentry = file_dentry(file);
 	struct inode *inode = file_inode(file);
@@ -2214,7 +2256,6 @@ int file_remove_privs_flags(struct file *file, unsigned int flags)
 	inode_has_no_xattr(inode);
 	return error;
 }
-EXPORT_SYMBOL_GPL(file_remove_privs_flags);
 
 /**
  * file_remove_privs - remove special file privileges (suid, capabilities)
@@ -2519,21 +2560,28 @@ void __init inode_init(void)
 void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
 {
 	inode->i_mode = mode;
-	if (S_ISCHR(mode)) {
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFCHR:
 		inode->i_fop = &def_chr_fops;
 		inode->i_rdev = rdev;
-	} else if (S_ISBLK(mode)) {
+		break;
+	case S_IFBLK:
 		if (IS_ENABLED(CONFIG_BLOCK))
 			inode->i_fop = &def_blk_fops;
 		inode->i_rdev = rdev;
-	} else if (S_ISFIFO(mode))
+		break;
+	case S_IFIFO:
 		inode->i_fop = &pipefifo_fops;
-	else if (S_ISSOCK(mode))
-		; /* leave it no_open_fops */
-	else
+		break;
+	case S_IFSOCK:
+		/* leave it no_open_fops */
+		break;
+	default:
 		printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
 				  " inode %s:%lu\n", mode, inode->i_sb->s_id,
 				  inode->i_ino);
+		break;
+	}
 }
 EXPORT_SYMBOL(init_special_inode);
 
@@ -2911,10 +2959,18 @@ EXPORT_SYMBOL(mode_strip_sgid);
  *
  * TODO: add a proper inode dumping routine, this is a stub to get debug off the
  * ground.
+ *
+ * TODO: handle getting to fs type with get_kernel_nofault()?
+ * See dump_mapping() above.
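
The reworked iput() above takes a lockless fast path: atomic_add_unless(&inode->i_count, -1, 1) drops a reference without touching i_lock unless the count might reach zero. In C11 atomics the same primitive is a compare-exchange loop; a userspace sketch:

	#include <stdatomic.h>
	#include <stdbool.h>

	/* Drop one reference unless the count is 1; returns false exactly
	 * when the caller may be the final dropper and must take the lock
	 * and re-check, as iput() does before calling iput_final(). */
	static bool put_ref_unless_last(_Atomic int *count)
	{
		int old = atomic_load(count);

		while (old != 1) {
			/* on failure, old reloads with the current value */
			if (atomic_compare_exchange_weak(count, &old, old - 1))
				return true;	/* fast path taken */
		}
		return false;
	}
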
*/ void dump_inode(struct inode *inode, const char *reason) { - pr_warn("%s encountered for inode %px", reason, inode); + struct super_block *sb = inode->i_sb; + + pr_warn("%s encountered for inode %px\n" + "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n", + reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, + inode->i_flags, inode->i_state, atomic_read(&inode->i_count)); } EXPORT_SYMBOL(dump_inode); diff --git a/fs/internal.h b/fs/internal.h index 38e8aab27bbd..a33d18ee5b74 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -355,3 +355,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); void pidfs_get_root(struct path *path); +void nsfs_get_root(struct path *path); diff --git a/fs/ioctl.c b/fs/ioctl.c index 0248cb8db2d3..1c152c2b1b67 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -41,7 +41,7 @@ * * Returns 0 on success, -errno on error. */ -int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; @@ -54,7 +54,6 @@ int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) out: return error; } -EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { @@ -426,7 +425,7 @@ static int ioctl_file_dedupe_range(struct file *file, goto out; } - size = offsetof(struct file_dedupe_range, info[count]); + size = struct_size(same, info, count); if (size > PAGE_SIZE) { ret = -ENOMEM; goto out; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index fd827398afd2..8b847a1e27f1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -304,6 +304,9 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, size_t size = i_size_read(iter->inode) - iomap->offset; size_t offset = offset_in_folio(folio, iomap->offset); + if (WARN_ON_ONCE(!iomap->inline_data)) + return -EIO; + if (folio_test_uptodate(folio)) return 0; @@ -894,7 +897,7 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, return true; } -static void iomap_write_end_inline(const struct iomap_iter *iter, +static bool iomap_write_end_inline(const struct iomap_iter *iter, struct folio *folio, loff_t pos, size_t copied) { const struct iomap *iomap = &iter->iomap; @@ -903,12 +906,16 @@ static void iomap_write_end_inline(const struct iomap_iter *iter, WARN_ON_ONCE(!folio_test_uptodate(folio)); BUG_ON(!iomap_inline_data_valid(iomap)); + if (WARN_ON_ONCE(!iomap->inline_data)) + return false; + flush_dcache_folio(folio); addr = kmap_local_folio(folio, pos); memcpy(iomap_inline_data(iomap, pos), addr, copied); kunmap_local(addr); mark_inode_dirty(iter->inode); + return true; } /* @@ -921,10 +928,8 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied, const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; - if (srcmap->type == IOMAP_INLINE) { - iomap_write_end_inline(iter, folio, pos, copied); - return true; - } + if (srcmap->type == IOMAP_INLINE) + return iomap_write_end_inline(iter, folio, pos, copied); if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { size_t bh_written; @@ -1396,6 +1401,9 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, /* warn about zeroing folios beyond eof that won't write back */ WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); + trace_iomap_zero_iter(iter->inode, folio_pos(folio) + offset, + bytes); + 
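
The iomap buffered-io hunks above add NULL-guards on inline_data (the direct-io path below gets the same check), and iomap_write_end_inline() now returns a result so the failure can propagate instead of being swallowed by a void return. The pattern, reduced to standalone C with illustrative names:

	#include <stdbool.h>
	#include <stddef.h>
	#include <string.h>

	/* A copy-back helper that used to return void now reports failure,
	 * so a corrupt or absent inline mapping surfaces as a short write
	 * rather than a NULL dereference. */
	static bool write_end_inline(void *inline_data, const void *src, size_t n)
	{
		if (inline_data == NULL)
			return false;
		memcpy(inline_data, src, n);
		return true;
	}
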
folio_zero_range(folio, offset, bytes); folio_mark_accessed(folio); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b84f6af2eb4c..46aa85af13dc 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -519,6 +519,9 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) loff_t pos = iomi->pos; u64 copied; + if (WARN_ON_ONCE(!inline_data)) + return -EIO; + if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) return -EIO; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 6ad66e6ba653..a61c1dae4742 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -84,6 +84,7 @@ DEFINE_RANGE_EVENT(iomap_release_folio); DEFINE_RANGE_EVENT(iomap_invalidate_folio); DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); DEFINE_RANGE_EVENT(iomap_dio_rw_queued); +DEFINE_RANGE_EVENT(iomap_zero_iter); #define IOMAP_TYPE_STRINGS \ { IOMAP_HOLE, "HOLE" }, \ diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index a6c692cac616..9adf36e6364b 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -70,6 +70,24 @@ static struct kernfs_open_node *of_on(struct kernfs_open_file *of) !list_empty(&of->list)); } +/* Get active reference to kernfs node for an open file */ +static struct kernfs_open_file *kernfs_get_active_of(struct kernfs_open_file *of) +{ + /* Skip if file was already released */ + if (unlikely(of->released)) + return NULL; + + if (!kernfs_get_active(of->kn)) + return NULL; + + return of; +} + +static void kernfs_put_active_of(struct kernfs_open_file *of) +{ + return kernfs_put_active(of->kn); +} + /** * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * @@ -139,7 +157,7 @@ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) if (ops->seq_stop) ops->seq_stop(sf, v); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) @@ -152,7 +170,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); @@ -238,7 +256,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; @@ -252,7 +270,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) else len = -EINVAL; - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len < 0) @@ -323,7 +341,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) * the ops aren't called concurrently for the same open file. 
*/ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; @@ -335,7 +353,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) else len = -EINVAL; - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len > 0) @@ -357,13 +375,13 @@ static void kernfs_vma_open(struct vm_area_struct *vma) if (!of->vm_ops) return; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return; if (of->vm_ops->open) of->vm_ops->open(vma); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); } static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) @@ -375,14 +393,14 @@ static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -395,7 +413,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = 0; @@ -404,7 +422,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) else file_update_time(file); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -418,14 +436,14 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, if (!of->vm_ops) return -EINVAL; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -455,7 +473,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) mutex_lock(&of->mutex); rc = -ENODEV; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) goto out_unlock; ops = kernfs_ops(of->kn); @@ -490,7 +508,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) } vma->vm_ops = &kernfs_vm_ops; out_put: - kernfs_put_active(of->kn); + kernfs_put_active_of(of); out_unlock: mutex_unlock(&of->mutex); @@ -852,7 +870,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); __poll_t ret; - if (!kernfs_get_active(kn)) + if (!kernfs_get_active_of(of)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; if (kn->attr.ops->poll) @@ -860,7 +878,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) else ret = kernfs_generic_poll(of, wait); - kernfs_put_active(kn); + kernfs_put_active_of(of); return ret; } @@ -875,7 +893,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) * the ops aren't called concurrently for the same open file. 
*/ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); return -ENODEV; } @@ -886,7 +904,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) else ret = generic_file_llseek(file, offset, whence); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); return ret; } diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index e384a69fbece..76eaf64b9d9e 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -57,7 +57,7 @@ static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf) const struct super_operations kernfs_sops = { .statfs = kernfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = kernfs_evict_inode, .show_options = kernfs_sop_show_options, diff --git a/fs/locks.c b/fs/locks.c index 559f02aa4172..04a3f0e20724 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2328,8 +2328,8 @@ out: * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX * locks, the ->lock() interface may return asynchronously, before the lock has * been granted or denied by the underlying filesystem, if (and only if) - * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations - * flags need to be set. + * lm_grant is set. Additionally FOP_ASYNC_LOCK in file_operations fop_flags + * need to be set. * * Callers expecting ->lock() to return asynchronously will only use F_SETLK, * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a diff --git a/fs/minix/inode.c b/fs/minix/inode.c index df9d11479caf..32db676127a9 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -492,8 +492,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev) inode->i_op = &minix_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &minix_aops; - } else + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { init_special_inode(inode, inode->i_mode, rdev); + } else { + printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", + inode->i_mode, inode->i_ino); + make_bad_inode(inode); + } } /* diff --git a/fs/mount.h b/fs/mount.h index 97737051a8b9..79c85639a7ba 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -17,11 +17,7 @@ struct mnt_namespace { }; struct user_namespace *user_ns; struct ucounts *ucounts; - u64 seq; /* Sequence number to prevent loops */ - union { - wait_queue_head_t poll; - struct rcu_head mnt_ns_rcu; - }; + wait_queue_head_t poll; u64 seq_origin; /* Sequence number of origin mount namespace */ u64 event; #ifdef CONFIG_FSNOTIFY @@ -30,8 +26,6 @@ struct mnt_namespace { #endif unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; - struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ - struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ } __randomize_layout; @@ -149,7 +143,7 @@ static inline void detach_mounts(struct dentry *dentry) static inline void get_mnt_ns(struct mnt_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); } extern seqlock_t mount_lock; @@ -173,7 +167,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry) static inline bool is_anon_ns(struct mnt_namespace *ns) { - return ns->seq == 0; + return ns->ns.ns_id == 0; } static inline bool anon_ns_root(const struct mount *m) diff --git a/fs/namei.c b/fs/namei.c index 
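
The kernfs conversions above funnel every active-reference grab through kernfs_get_active_of(), which refuses once of->released is set; the poll path previously took the reference straight from the node and could race with ->release(). A userspace reduction of the guarded-acquire idea (illustrative, not the kernfs API):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct open_file {
		_Atomic bool released;
		_Atomic int active;	/* stand-in for the node's active count */
	};

	static struct open_file *get_active_of(struct open_file *of)
	{
		if (atomic_load(&of->released))
			return NULL;	/* too late: file already released */
		atomic_fetch_add(&of->active, 1);
		return of;
	}

	static void put_active_of(struct open_file *of)
	{
		atomic_fetch_sub(&of->active, 1);
	}
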
cd43ff89fbaa..507ca0d7878d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1449,6 +1449,10 @@ static int follow_automount(struct path *path, int *count, unsigned lookup_flags dentry->d_inode) return -EISDIR; + /* No need to trigger automounts if mountpoint crossing is disabled. */ + if (lookup_flags & LOOKUP_NO_XDEV) + return -EXDEV; + if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; @@ -1472,6 +1476,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, /* Allow the filesystem to manage the transit without i_rwsem * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { + if (lookup_flags & LOOKUP_NO_XDEV) { + ret = -EXDEV; + break; + } ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) @@ -1489,6 +1497,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; + if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) { + ret = -EXDEV; + break; + } continue; } } @@ -1630,12 +1642,8 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); - if (jumped) { - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - ret = -EXDEV; - else - nd->state |= ND_JUMPED; - } + if (jumped) + nd->state |= ND_JUMPED; if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) @@ -1827,6 +1835,20 @@ static struct dentry *lookup_slow(const struct qstr *name, return res; } +static struct dentry *lookup_slow_killable(const struct qstr *name, + struct dentry *dir, + unsigned int flags) +{ + struct inode *inode = dir->d_inode; + struct dentry *res; + + if (inode_lock_shared_killable(inode)) + return ERR_PTR(-EINTR); + res = __lookup_slow(name, dir, flags); + inode_unlock_shared(inode); + return res; +} + static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *restrict nd) { @@ -2744,7 +2766,8 @@ static int filename_parentat(int dfd, struct filename *name, } /* does lookup, returns the object with parent locked */ -static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) +static struct dentry *__start_removing_path(int dfd, struct filename *name, + struct path *path) { struct path parent_path __free(path_put) = {}; struct dentry *d; @@ -2756,18 +2779,42 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); + /* don't fail immediately if it's r/o, at least try to report other errors */ + error = mnt_want_write(parent_path.mnt); inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); - if (IS_ERR(d)) { - inode_unlock(parent_path.dentry->d_inode); - return d; - } + if (IS_ERR(d)) + goto unlock; + if (error) + goto fail; path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; + +fail: + dput(d); + d = ERR_PTR(error); +unlock: + inode_unlock(parent_path.dentry->d_inode); + if (!error) + mnt_drop_write(parent_path.mnt); + return d; } -struct dentry *kern_path_locked_negative(const char *name, struct path *path) +/** + * kern_path_parent: lookup path returning parent and target + * @name: path name + * @path: path to store parent in + * + * The path @name should end with a normal component, not "." or ".." or "/". 
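
In the namei.c hunks above, LOOKUP_NO_XDEV is rejected before an automount is triggered or a mountpoint is traversed, instead of being detected afterwards through the jumped flag; that way a forbidden walk can no longer cause an automount as a side effect. The shape of the early check, as standalone C with an illustrative flag:

	#include <errno.h>
	#include <stdbool.h>

	enum { WALK_NO_XDEV = 1 };

	static int cross_mount(unsigned int flags, bool is_mountpoint)
	{
		/* fail before acting, not after */
		if (is_mountpoint && (flags & WALK_NO_XDEV))
			return -EXDEV;
		/* ... follow the mount as usual ... */
		return 0;
	}
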
+ * A lookup is performed and if successful the parent information
+ * is stored in @path and the dentry is returned.
+ *
+ * The dentry may be negative, the parent will be positive.
+ *
+ * Returns: dentry or error.
+ */
+struct dentry *kern_path_parent(const char *name, struct path *path)
 {
 	struct path parent_path __free(path_put) = {};
 	struct filename *filename __free(putname) = getname_kernel(name);
@@ -2780,35 +2827,35 @@ struct dentry *kern_path_locked_negative(const char *name, struct path *path)
 		return ERR_PTR(error);
 	if (unlikely(type != LAST_NORM))
 		return ERR_PTR(-EINVAL);
-	inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
-	d = lookup_one_qstr_excl(&last, parent_path.dentry, LOOKUP_CREATE);
-	if (IS_ERR(d)) {
-		inode_unlock(parent_path.dentry->d_inode);
+
+	d = lookup_noperm_unlocked(&last, parent_path.dentry);
+	if (IS_ERR(d))
 		return d;
-	}
 	path->dentry = no_free_ptr(parent_path.dentry);
 	path->mnt = no_free_ptr(parent_path.mnt);
 	return d;
 }
 
-struct dentry *kern_path_locked(const char *name, struct path *path)
+struct dentry *start_removing_path(const char *name, struct path *path)
 {
 	struct filename *filename = getname_kernel(name);
-	struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path);
+	struct dentry *res = __start_removing_path(AT_FDCWD, filename, path);
 
 	putname(filename);
 	return res;
 }
 
-struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path)
+struct dentry *start_removing_user_path_at(int dfd,
+					   const char __user *name,
+					   struct path *path)
 {
 	struct filename *filename = getname(name);
-	struct dentry *res = __kern_path_locked(dfd, filename, path);
+	struct dentry *res = __start_removing_path(dfd, filename, path);
 
 	putname(filename);
 	return res;
 }
-EXPORT_SYMBOL(user_path_locked_at);
+EXPORT_SYMBOL(start_removing_user_path_at);
 
 int kern_path(const char *name, unsigned int flags, struct path *path)
 {
@@ -3011,6 +3058,47 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name,
 EXPORT_SYMBOL(lookup_one_unlocked);
 
 /**
+ * lookup_one_positive_killable - lookup single pathname component
+ * @idmap: idmap of the mount the lookup is performed from
+ * @name: qstr holding pathname component to lookup
+ * @base: base directory to lookup from
+ *
+ * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
+ * known positive or ERR_PTR(). This is what most of the users want.
+ *
+ * Note that pinned negative with unlocked parent _can_ become positive at any
+ * time, so callers of lookup_one_unlocked() need to be very careful; pinned
+ * positives have ->d_inode stable, so this one avoids such problems.
+ *
+ * This can be used for in-kernel filesystem clients such as file servers.
+ *
+ * It should be called without the parent i_rwsem held, and will take
+ * the i_rwsem itself if necessary. If a fatal signal is pending or
+ * delivered, it will return %-EINTR if the lock is needed.
+ */ +struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, + struct qstr *name, + struct dentry *base) +{ + int err; + struct dentry *ret; + + err = lookup_one_common(idmap, name, base); + if (err) + return ERR_PTR(err); + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow_killable(name, base, 0); + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; +} +EXPORT_SYMBOL(lookup_one_positive_killable); + +/** * lookup_one_positive_unlocked - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: qstr holding pathname component to lookup @@ -4114,7 +4202,6 @@ static struct dentry *filename_create(int dfd, struct filename *name, unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; - int err2; int error; error = filename_parentat(dfd, name, reval_flag, path, &last, &type); @@ -4129,7 +4216,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, goto out; /* don't fail immediately if it's r/o, at least try to report other errors */ - err2 = mnt_want_write(path->mnt); + error = mnt_want_write(path->mnt); /* * Do the final lookup. Suppress 'create' if there is a trailing * '/', and a directory wasn't requested. @@ -4142,25 +4229,24 @@ static struct dentry *filename_create(int dfd, struct filename *name, if (IS_ERR(dentry)) goto unlock; - if (unlikely(err2)) { - error = err2; + if (unlikely(error)) goto fail; - } + return dentry; fail: dput(dentry); dentry = ERR_PTR(error); unlock: inode_unlock(path->dentry->d_inode); - if (!err2) + if (!error) mnt_drop_write(path->mnt); out: path_put(path); return dentry; } -struct dentry *kern_path_create(int dfd, const char *pathname, - struct path *path, unsigned int lookup_flags) +struct dentry *start_creating_path(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4168,9 +4254,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname, putname(filename); return res; } -EXPORT_SYMBOL(kern_path_create); +EXPORT_SYMBOL(start_creating_path); -void done_path_create(struct path *path, struct dentry *dentry) +void end_creating_path(struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); @@ -4178,10 +4264,11 @@ void done_path_create(struct path *path, struct dentry *dentry) mnt_drop_write(path->mnt); path_put(path); } -EXPORT_SYMBOL(done_path_create); +EXPORT_SYMBOL(end_creating_path); -inline struct dentry *user_path_create(int dfd, const char __user *pathname, - struct path *path, unsigned int lookup_flags) +inline struct dentry *start_creating_user_path( + int dfd, const char __user *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4189,7 +4276,7 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname, putname(filename); return res; } -EXPORT_SYMBOL(user_path_create); +EXPORT_SYMBOL(start_creating_user_path); /** * vfs_mknod - create device node or file @@ -4297,7 +4384,7 @@ retry: break; } out2: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4401,7 +4488,7 @@ retry: if 
(IS_ERR(dentry)) error = PTR_ERR(dentry); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4755,7 +4842,7 @@ retry: if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4828,7 +4915,7 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to - * be writen back improperly if their true value is unknown to + * be written back improperly if their true value is unknown to * the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) @@ -4924,7 +5011,7 @@ retry: error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { @@ -5024,20 +5111,20 @@ int vfs_rename(struct renamedata *rd) if (source == target) return 0; - error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); + error = may_delete(rd->mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { - error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); + error = may_create(rd->mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) - error = may_delete(rd->new_mnt_idmap, new_dir, + error = may_delete(rd->mnt_idmap, new_dir, new_dentry, is_dir); else - error = may_delete(rd->new_mnt_idmap, new_dir, + error = may_delete(rd->mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) @@ -5052,13 +5139,13 @@ int vfs_rename(struct renamedata *rd) */ if (new_dir != old_dir) { if (is_dir) { - error = inode_permission(rd->old_mnt_idmap, source, + error = inode_permission(rd->mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { - error = inode_permission(rd->new_mnt_idmap, target, + error = inode_permission(rd->mnt_idmap, target, MAY_WRITE); if (error) return error; @@ -5126,7 +5213,7 @@ int vfs_rename(struct renamedata *rd) if (error) goto out; } - error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, + error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; @@ -5269,10 +5356,9 @@ retry_deleg: rd.old_parent = old_path.dentry; rd.old_dentry = old_dentry; - rd.old_mnt_idmap = mnt_idmap(old_path.mnt); + rd.mnt_idmap = mnt_idmap(old_path.mnt); rd.new_parent = new_path.dentry; rd.new_dentry = new_dentry; - rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); diff --git a/fs/namespace.c b/fs/namespace.c index ae6d1312b184..dc01b14c58cd 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -33,6 +33,7 @@ #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> #include <linux/pidfs.h> +#include <linux/nstree.h> #include "pnode.h" #include "internal.h" @@ -65,6 +66,15 @@ static int __init set_mphash_entries(char *str) } __setup("mphash_entries=", set_mphash_entries); +static char * __initdata initramfs_options; +static int __init initramfs_options_setup(char *str) +{ + initramfs_options = str; + return 1; +} + +__setup("initramfs_options=", initramfs_options_setup); + 
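/*
 * A minimal sketch (not part of the patch) of how an in-kernel caller pairs
 * the renamed creation helpers above: start_creating_path() resolves the
 * parent, takes its i_rwsem and mount write access, and returns the target
 * dentry; end_creating_path() undoes all of that. The helper name
 * example_mknod() and its error handling are illustrative assumptions;
 * vfs_mknod()'s argument order follows current in-tree callers.
 */
static int example_mknod(const char *pathname, umode_t mode, dev_t dev)
{
	struct path path;
	struct dentry *dentry;
	int err;

	dentry = start_creating_path(AT_FDCWD, pathname, &path, 0);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);
	err = vfs_mknod(mnt_idmap(path.mnt), d_inode(path.dentry),
			dentry, mode, dev);
	/* Drops the dentry ref, parent lock, write access and path refs. */
	end_creating_path(&path, dentry);
	return err;
}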
static u64 event; static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); @@ -80,13 +90,10 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ -static DEFINE_SEQLOCK(mnt_ns_tree_lock); #ifdef CONFIG_FSNOTIFY LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif -static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ -static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ enum mount_kattr_flags_t { MOUNT_KATTR_RECURSE = (1 << 0), @@ -119,59 +126,18 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) { + struct ns_common *ns; + if (!node) return NULL; - return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); -} - -static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b) -{ - struct mnt_namespace *ns_a = node_to_mnt_ns(a); - struct mnt_namespace *ns_b = node_to_mnt_ns(b); - u64 seq_a = ns_a->seq; - u64 seq_b = ns_b->seq; - - if (seq_a < seq_b) - return -1; - if (seq_a > seq_b) - return 1; - return 0; -} - -static inline void mnt_ns_tree_write_lock(void) -{ - write_seqlock(&mnt_ns_tree_lock); -} - -static inline void mnt_ns_tree_write_unlock(void) -{ - write_sequnlock(&mnt_ns_tree_lock); -} - -static void mnt_ns_tree_add(struct mnt_namespace *ns) -{ - struct rb_node *node, *prev; - - mnt_ns_tree_write_lock(); - node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp); - /* - * If there's no previous entry simply add it after the - * head and if there is add it after the previous entry. 
- */ - prev = rb_prev(&ns->mnt_ns_tree_node); - if (!prev) - list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list); - else - list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list); - mnt_ns_tree_write_unlock(); - - WARN_ON_ONCE(node); + ns = rb_entry(node, struct ns_common, ns_tree_node); + return container_of(ns, struct mnt_namespace, ns); } static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ - if (refcount_dec_and_test(&ns->passive)) { + if (ns && refcount_dec_and_test(&ns->passive)) { fsnotify_mntns_delete(ns); put_user_ns(ns->user_ns); kfree(ns); @@ -181,32 +147,16 @@ DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { - mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu)); + mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu)); } static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ - if (!is_anon_ns(ns)) { - mnt_ns_tree_write_lock(); - rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); - list_bidir_del_rcu(&ns->mnt_ns_list); - mnt_ns_tree_write_unlock(); - } + if (ns_tree_active(ns)) + ns_tree_remove(ns); - call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu); -} -static int mnt_ns_find(const void *key, const struct rb_node *node) -{ - const u64 mnt_ns_id = *(u64 *)key; - const struct mnt_namespace *ns = node_to_mnt_ns(node); - - if (mnt_ns_id < ns->seq) - return -1; - if (mnt_ns_id > ns->seq) - return 1; - return 0; + call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu); } /* @@ -225,28 +175,21 @@ static int mnt_ns_find(const void *key, const struct rb_node *node) */ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { - struct mnt_namespace *ns; - struct rb_node *node; - unsigned int seq; + struct mnt_namespace *mnt_ns; + struct ns_common *ns; guard(rcu)(); - do { - seq = read_seqbegin(&mnt_ns_tree_lock); - node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); - if (node) - break; - } while (read_seqretry(&mnt_ns_tree_lock, seq)); - - if (!node) + ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS); + if (!ns) return NULL; /* * The last reference count is put with RCU delay so we can * unconditionally acquire a reference here. 
*/ - ns = node_to_mnt_ns(node); - refcount_inc(&ns->passive); - return ns; + mnt_ns = container_of(ns, struct mnt_namespace, ns); + refcount_inc(&mnt_ns->passive); + return mnt_ns; } static inline void lock_mount_hash(void) @@ -1017,7 +960,7 @@ static inline bool check_anonymous_mnt(struct mount *mnt) return false; seq = mnt->mnt_ns->seq_origin; - return !seq || (seq == current->nsproxy->mnt_ns->seq); + return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id); } /* @@ -2152,19 +2095,16 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) { + struct ns_common *ns; + guard(rcu)(); for (;;) { - struct list_head *list; - - if (previous) - list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list)); - else - list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list)); - if (list_is_head(list, &mnt_ns_list)) - return ERR_PTR(-ENOENT); + ns = ns_tree_adjoined_rcu(mntns, previous); + if (IS_ERR(ns)) + return ERR_CAST(ns); - mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list); + mntns = to_mnt_ns(ns); /* * The last passive reference count is put with RCU @@ -2179,7 +2119,7 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr * the mount namespace and it might already be on its * deathbed. */ - if (!refcount_inc_not_zero(&mntns->ns.count)) + if (!ns_ref_get(mntns)) continue; return mntns; @@ -2204,7 +2144,7 @@ static bool mnt_ns_loop(struct dentry *dentry) if (!mnt_ns) return false; - return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; + return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id; } struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, @@ -2455,7 +2395,7 @@ struct vfsmount *clone_private_mount(const struct path *path) return ERR_PTR(-EINVAL); } - if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if (__has_locked_children(old_mnt, path->dentry)) @@ -3080,7 +3020,7 @@ static struct file *open_detached_copy(struct path *path, bool recursive) if (is_anon_ns(src_mnt_ns)) ns->seq_origin = src_mnt_ns->seq_origin; else - ns->seq_origin = src_mnt_ns->seq; + ns->seq_origin = src_mnt_ns->ns.ns_id; } mnt = __do_loopback(path, recursive); @@ -3289,7 +3229,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. 
*/ -static int do_remount(struct path *path, int ms_flags, int sb_flags, +static int do_remount(struct path *path, int sb_flags, int mnt_flags, void *data) { int err; @@ -3727,8 +3667,10 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, int error; error = security_sb_kern_mount(sb); - if (!error && mount_too_revealing(sb, &mnt_flags)) + if (!error && mount_too_revealing(sb, &mnt_flags)) { + errorfcp(fc, "VFS", "Mount too revealing"); error = -EPERM; + } if (unlikely(error)) { fc_drop_locked(fc); @@ -4112,7 +4054,7 @@ int path_mount(const char *dev_name, struct path *path, if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND)) return do_reconfigure_mnt(path, mnt_flags); if (flags & MS_REMOUNT) - return do_remount(path, flags, sb_flags, mnt_flags, data_page); + return do_remount(path, sb_flags, mnt_flags, data_page); if (flags & MS_BIND) return do_loopback(path, dev_name, flags & MS_REC); if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) @@ -4151,20 +4093,11 @@ static void dec_mnt_namespaces(struct ucounts *ucounts) static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) - ns_free_inum(&ns->ns); + ns_common_free(ns); dec_mnt_namespaces(ns->ucounts); mnt_ns_tree_remove(ns); } -/* - * Assign a sequence number so we can detect when we attempt to bind - * mount a reference to an older mount namespace into the current - * mount namespace, preventing reference counting loops. A 64bit - * number incrementing at 10Ghz will take 12,427 years to wrap which - * is effectively never, so we can ignore the possibility. - */ -static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); - static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; @@ -4180,22 +4113,20 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } - if (!anon) { - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - dec_mnt_namespaces(ucounts); - return ERR_PTR(ret); - } + + if (anon) + ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO); + else + ret = ns_common_init(new_ns); + if (ret) { + kfree(new_ns); + dec_mnt_namespaces(ucounts); + return ERR_PTR(ret); } - new_ns->ns.ops = &mntns_operations; if (!anon) - new_ns->seq = atomic64_inc_return(&mnt_ns_seq); - refcount_set(&new_ns->ns.count, 1); + ns_tree_gen_id(&new_ns->ns); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; - INIT_LIST_HEAD(&new_ns->mnt_ns_list); - RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; @@ -4203,7 +4134,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } __latent_entropy -struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, +struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; @@ -4234,7 +4165,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); - ns_free_inum(&new_ns->ns); + ns_common_free(ns); dec_mnt_namespaces(new_ns->ucounts); mnt_ns_release(new_ns); return ERR_CAST(new); @@ -4281,7 +4212,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (pwdmnt) mntput(pwdmnt); - mnt_ns_tree_add(new_ns); + 
ns_tree_add_raw(new_ns); return new_ns; } @@ -4444,7 +4375,7 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, ret = -EPERM; if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { - pr_warn("VFS: Mount too revealing\n"); + errorfcp(fc, "VFS", "Mount too revealing"); goto err_unlock; } @@ -5007,7 +4938,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; /* @@ -5400,7 +5331,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; - s->sm.mnt_ns_id = ns->seq; + s->sm.mnt_ns_id = ns->ns.ns_id; } static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) @@ -5711,7 +5642,6 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { - struct path root __free(path_put) = {}; struct mount *m; int err; @@ -5723,7 +5653,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!s->mnt) return -ENOENT; - err = grab_requested_root(ns, &root); + err = grab_requested_root(ns, &s->root); if (err) return err; @@ -5732,7 +5662,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, * mounts to show users. */ m = real_mount(s->mnt); - if (!is_path_reachable(m, m->mnt.mnt_root, &root) && + if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -5740,8 +5670,6 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (err) return err; - s->root = root; - /* * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap * can change concurrently as we only hold the read-side of the @@ -5910,7 +5838,7 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); @@ -5963,28 +5891,40 @@ retry: if (!ret) ret = copy_statmount_to_user(ks); kvfree(ks->seq.buf); + path_put(&ks->root); if (retry_statmount(ret, &seq_size)) goto retry; return ret; } -static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, - u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids, - bool reverse) +struct klistmount { + u64 last_mnt_id; + u64 mnt_parent_id; + u64 *kmnt_ids; + u32 nr_mnt_ids; + struct mnt_namespace *ns; + struct path root; +}; + +static ssize_t do_listmount(struct klistmount *kls, bool reverse) { - struct path root __free(path_put) = {}; + struct mnt_namespace *ns = kls->ns; + u64 mnt_parent_id = kls->mnt_parent_id; + u64 last_mnt_id = kls->last_mnt_id; + u64 *mnt_ids = kls->kmnt_ids; + size_t nr_mnt_ids = kls->nr_mnt_ids; struct path orig; struct mount *r, *first; ssize_t ret; rwsem_assert_held(&namespace_sem); - ret = grab_requested_root(ns, &root); + ret = grab_requested_root(ns, &kls->root); if (ret) return ret; if (mnt_parent_id == LSMT_ROOT) { - orig = root; + orig = kls->root; } else { orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); if (!orig.mnt) @@ -5996,7 +5936,7 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, * Don't trigger audit denials. 
We just want to determine what * mounts to show users. */ - if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) && + if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -6029,14 +5969,45 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, return ret; } +static void __free_klistmount_free(const struct klistmount *kls) +{ + path_put(&kls->root); + kvfree(kls->kmnt_ids); + mnt_ns_release(kls->ns); +} + +static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq, + size_t nr_mnt_ids) +{ + + u64 last_mnt_id = kreq->param; + + /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ + if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) + return -EINVAL; + + kls->last_mnt_id = last_mnt_id; + + kls->nr_mnt_ids = nr_mnt_ids; + kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids), + GFP_KERNEL_ACCOUNT); + if (!kls->kmnt_ids) + return -ENOMEM; + + kls->ns = grab_requested_mnt_ns(kreq); + if (!kls->ns) + return -ENOENT; + + kls->mnt_parent_id = kreq->mnt_id; + return 0; +} + SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { - u64 *kmnt_ids __free(kvfree) = NULL; + struct klistmount kls __free(klistmount_free) = {}; const size_t maxcount = 1000000; - struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct mnt_id_req kreq; - u64 last_mnt_id; ssize_t ret; if (flags & ~LISTMOUNT_REVERSE) @@ -6057,22 +6028,12 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, if (ret) return ret; - last_mnt_id = kreq.param; - /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ - if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) - return -EINVAL; - - kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids), - GFP_KERNEL_ACCOUNT); - if (!kmnt_ids) - return -ENOMEM; - - ns = grab_requested_mnt_ns(&kreq); - if (!ns) - return -ENOENT; + ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids); + if (ret) + return ret; - if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && - !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) && + !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; /* @@ -6080,39 +6041,43 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, * listmount() doesn't care about any mount properties. 
*/ scoped_guard(rwsem_read, &namespace_sem) - ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids, - nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); + ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE)); if (ret <= 0) return ret; - if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids))) + if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids))) return -EFAULT; return ret; } +struct mnt_namespace init_mnt_ns = { + .ns.inum = ns_init_inum(&init_mnt_ns), + .ns.ops = &mntns_operations, + .user_ns = &init_user_ns, + .ns.__ns_ref = REFCOUNT_INIT(1), + .ns.ns_type = ns_common_type(&init_mnt_ns), + .passive = REFCOUNT_INIT(1), + .mounts = RB_ROOT, + .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), +}; + static void __init init_mount_tree(void) { struct vfsmount *mnt; struct mount *m; - struct mnt_namespace *ns; struct path root; - mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); + mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = alloc_mnt_ns(&init_user_ns, true); - if (IS_ERR(ns)) - panic("Can't allocate initial namespace"); - ns->seq = atomic64_inc_return(&mnt_ns_seq); - ns->ns.inum = PROC_MNT_INIT_INO; m = real_mount(mnt); - ns->root = m; - ns->nr_mounts = 1; - mnt_add_to_ns(ns, m); - init_task.nsproxy->mnt_ns = ns; - get_mnt_ns(ns); + init_mnt_ns.root = m; + init_mnt_ns.nr_mounts = 1; + mnt_add_to_ns(&init_mnt_ns, m); + init_task.nsproxy->mnt_ns = &init_mnt_ns; + get_mnt_ns(&init_mnt_ns); root.mnt = mnt; root.dentry = mnt->mnt_root; @@ -6120,7 +6085,7 @@ static void __init init_mount_tree(void) set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); - mnt_ns_tree_add(ns); + ns_tree_add(&init_mnt_ns); } void __init mnt_init(void) @@ -6160,7 +6125,7 @@ void __init mnt_init(void) void put_mnt_ns(struct mnt_namespace *ns) { - if (!refcount_dec_and_test(&ns->ns.count)) + if (!ns_ref_put(ns)) return; namespace_lock(); emptied_ns = ns; @@ -6409,7 +6374,6 @@ static struct user_namespace *mntns_owner(struct ns_common *ns) const struct proc_ns_operations mntns_operations = { .name = "mnt", - .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 18b3dc74c70e..37ab6f28b5ad 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -369,7 +369,7 @@ void netfs_readahead(struct readahead_control *ractl) return netfs_put_request(rreq, netfs_rreq_trace_put_return); cleanup_free: - return netfs_put_request(rreq, netfs_rreq_trace_put_failed); + return netfs_put_failed_request(rreq); } EXPORT_SYMBOL(netfs_readahead); @@ -472,7 +472,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) return ret < 0 ? ret : 0; discard: - netfs_put_request(rreq, netfs_rreq_trace_put_discard); + netfs_put_failed_request(rreq); alloc_error: folio_unlock(folio); return ret; @@ -532,7 +532,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) return ret < 0 ? ret : 0; discard: - netfs_put_request(rreq, netfs_rreq_trace_put_discard); + netfs_put_failed_request(rreq); alloc_error: folio_unlock(folio); return ret; @@ -699,7 +699,7 @@ have_folio_no_wait: return 0; error_put: - netfs_put_request(rreq, netfs_rreq_trace_put_failed); + netfs_put_failed_request(rreq); error: if (folio) { folio_unlock(folio); @@ -754,7 +754,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, return ret < 0 ? 
ret : 0; error_put: - netfs_put_request(rreq, netfs_rreq_trace_put_discard); + netfs_put_failed_request(rreq); error: _leave(" = %d", ret); return ret; diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index f27ea5099a68..09394ac2c180 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -347,7 +347,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, folio_put(folio); ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1); if (ret < 0) - goto error_folio_unlock; + goto out; continue; copied: diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index a05e13472baf..a498ee8d6674 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -131,6 +131,7 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) if (rreq->len == 0) { pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); + netfs_put_request(rreq, netfs_rreq_trace_put_discard); return -EIO; } @@ -205,7 +206,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i if (user_backed_iter(iter)) { ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0); if (ret < 0) - goto out; + goto error_put; rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec; rreq->direct_bv_count = ret; rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); @@ -238,6 +239,10 @@ out: if (ret > 0) orig_count -= ret; return ret; + +error_put: + netfs_put_failed_request(rreq); + return ret; } EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked); diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index a16660ab7f83..a9d1c3b2c084 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -57,7 +57,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0); if (n < 0) { ret = n; - goto out; + goto error_put; } wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec; wreq->direct_bv_count = n; @@ -101,6 +101,10 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * out: netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; + +error_put: + netfs_put_failed_request(wreq); + return ret; } EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index d4f16fefd965..4319611f5354 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -87,6 +87,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); void netfs_clear_subrequests(struct netfs_io_request *rreq); void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); +void netfs_put_failed_request(struct netfs_io_request *rreq); struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq); static inline void netfs_see_request(struct netfs_io_request *rreq, diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 20748bcfbf59..486166460e17 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -321,7 +321,7 @@ void netfs_wake_collector(struct netfs_io_request *rreq) { if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { - queue_work(system_unbound_wq, &rreq->work); + queue_work(system_dfl_wq, &rreq->work); } else { trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue); wake_up(&rreq->waitq); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 
e8c99738b5bb..b8c4918d3dcd 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -116,10 +116,8 @@ static void netfs_free_request_rcu(struct rcu_head *rcu) netfs_stat_d(&netfs_n_rh_rreq); } -static void netfs_free_request(struct work_struct *work) +static void netfs_deinit_request(struct netfs_io_request *rreq) { - struct netfs_io_request *rreq = - container_of(work, struct netfs_io_request, cleanup_work); struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; @@ -149,6 +147,14 @@ static void netfs_free_request(struct work_struct *work) if (atomic_dec_and_test(&ictx->io_count)) wake_up_var(&ictx->io_count); +} + +static void netfs_free_request(struct work_struct *work) +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, cleanup_work); + + netfs_deinit_request(rreq); call_rcu(&rreq->rcu, netfs_free_request_rcu); } @@ -163,11 +169,29 @@ void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace dead = __refcount_dec_and_test(&rreq->ref, &r); trace_netfs_rreq_ref(debug_id, r - 1, what); if (dead) - WARN_ON(!queue_work(system_unbound_wq, &rreq->cleanup_work)); + WARN_ON(!queue_work(system_dfl_wq, &rreq->cleanup_work)); } } /* + * Free a request (synchronously) that was just allocated but has + * failed before it could be submitted. + */ +void netfs_put_failed_request(struct netfs_io_request *rreq) +{ + int r = refcount_read(&rreq->ref); + + /* New requests have two references (see + * netfs_alloc_request()); this function is only allowed on + * new request objects. + */ + WARN_ON_ONCE(r != 2); + + trace_netfs_rreq_ref(rreq->debug_id, r, netfs_rreq_trace_put_failed); + netfs_free_request(&rreq->cleanup_work); +} + +/* * Allocate and partially initialise an I/O request structure. */ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq) diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index 8097bc069c1d..a1489aa29f78 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -118,7 +118,7 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache( return creq; cancel_put: - netfs_put_request(creq, netfs_rreq_trace_put_return); + netfs_put_failed_request(creq); cancel: rreq->copy_to_cache = ERR_PTR(-ENOBUFS); clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags); diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index fa622a6cd56d..5c0dc4efc792 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -189,7 +189,7 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite return ret; cleanup_free: - netfs_put_request(rreq, netfs_rreq_trace_put_failed); + netfs_put_failed_request(rreq); return ret; } EXPORT_SYMBOL(netfs_read_single); diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 0584cba1a043..dd8743bc8d7f 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -133,8 +133,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, return wreq; nomem: - wreq->error = -ENOMEM; - netfs_put_request(wreq, netfs_rreq_trace_put_failed); + netfs_put_failed_request(wreq); return ERR_PTR(-ENOMEM); } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 8fb4a950dd55..4e3dcc157a83 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -888,6 +888,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, if (fsinfo->xattr_support) server->caps |= NFS_CAP_XATTR; + else + server->caps &= ~NFS_CAP_XATTR; #endif } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 
86e36c630f09..8059ece82468 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -28,6 +28,7 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/gfp.h> +#include <linux/rmap.h> #include <linux/swap.h> #include <linux/compaction.h> @@ -280,6 +281,37 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL_GPL(nfs_file_fsync); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to) +{ + struct folio *folio; + + if (from >= to) + return; + + folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + + if (folio_test_uptodate(folio)) { + loff_t fpos = folio_pos(folio); + size_t offset = from - fpos; + size_t end = folio_size(folio); + + if (to - fpos < end) + end = to - fpos; + folio_zero_segment(folio, offset, end); + trace_nfs_size_truncate_folio(mapping->host, to); + } + + folio_unlock(folio); + folio_put(folio); +} +EXPORT_SYMBOL_GPL(nfs_truncate_last_folio); + /* * Decide whether a read/modify/write cycle may be more efficient * than a modify/write/read cycle when writing to a page in the @@ -356,6 +388,7 @@ static int nfs_write_begin(const struct kiocb *iocb, dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); + nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos); fgp |= fgf_set_order(len); start: @@ -442,10 +475,11 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset, dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n", folio->index, offset, length); - if (offset != 0 || length < folio_size(folio)) - return; /* Cancel any unstarted writes on this page */ - nfs_wb_folio_cancel(inode, folio); + if (offset != 0 || length < folio_size(folio)) + nfs_wb_folio(inode, folio); + else + nfs_wb_folio_cancel(inode, folio); folio_wait_private_2(folio); /* [DEPRECATED] */ trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length); } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 8dc921d83538..9edb5f9b0c4e 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -293,7 +293,7 @@ ff_lseg_match_mirrors(struct pnfs_layout_segment *l1, struct pnfs_layout_segment *l2) { const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1); - const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1); + const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2); u32 i; if (fl1->mirror_array_cnt != fl2->mirror_array_cnt) @@ -773,8 +773,11 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, continue; if (check_device && - nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) + nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) { + // reinitialize the error state in case this is the last iteration + ds = ERR_PTR(-EINVAL); continue; + } *best_idx = idx; break; @@ -804,7 +807,7 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, struct nfs4_pnfs_ds *ds; ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx); - if (ds) + if (!IS_ERR(ds)) return ds; return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); } @@ -818,7 +821,7 @@ ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx, best_idx); - if (ds || !pgio->pg_mirror_idx) + if (!IS_ERR(ds) || !pgio->pg_mirror_idx) return ds; return 
ff_layout_choose_best_ds_for_read(lseg, 0, best_idx); } @@ -868,7 +871,7 @@ retry: req->wb_nio = 0; ds = ff_layout_get_ds_for_read(pgio, &ds_idx); - if (!ds) { + if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; pnfs_generic_pg_cleanup(pgio); @@ -1072,11 +1075,13 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) { u32 idx = hdr->pgio_mirror_idx + 1; u32 new_idx = 0; + struct nfs4_pnfs_ds *ds; - if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx)) - ff_layout_send_layouterror(hdr->lseg); - else + ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx); + if (IS_ERR(ds)) pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); + else + ff_layout_send_layouterror(hdr->lseg); pnfs_read_resend_pnfs(hdr, new_idx); } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 338ef77ae423..9bdaf7f38bed 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -108,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid) int nfs_drop_inode(struct inode *inode) { - return NFS_STALE(inode) || generic_drop_inode(inode); + return NFS_STALE(inode) || inode_generic_drop(inode); } EXPORT_SYMBOL_GPL(nfs_drop_inode); @@ -608,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), - atomic_read(&inode->i_count)); + icount_read(inode)); out: return inode; @@ -716,6 +716,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; + loff_t oldsize = i_size_read(inode); int error = 0; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -731,7 +732,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (error) return error; - if (attr->ia_size == i_size_read(inode)) + if (attr->ia_size == oldsize) attr->ia_valid &= ~ATTR_SIZE; } @@ -767,8 +768,10 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, trace_nfs_setattr_enter(inode); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode)) { + nfs_file_block_o_direct(NFS_I(inode)); nfs_sync_inode(inode); + } fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fattr == NULL) { @@ -777,8 +780,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); - if (error == 0) + if (error == 0) { + if (attr->ia_valid & ATTR_SIZE) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + attr->ia_size); error = nfs_refresh_inode(inode, fattr); + } nfs_free_fattr(fattr); out: trace_nfs_setattr_exit(inode, error); @@ -2229,7 +2236,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), - atomic_read(&inode->i_count), fattr->valid); + icount_read(inode), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? 
Just exit */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 74d712b58423..c0a44f389f8f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -437,6 +437,8 @@ int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; @@ -530,6 +532,16 @@ static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; } +/* Must be called with exclusively locked inode->i_rwsem */ +static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(&nfsi->vfs_inode); + } +} + + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 3388faf2acb9..d275b0a250bf 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -14,15 +14,6 @@ #include "internal.h" -/* Call with exclusively locked inode->i_rwsem */ -static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) -{ - if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { - clear_bit(NFS_INO_ODIRECT, &nfsi->flags); - inode_dio_wait(inode); - } -} - /** * nfs_start_io_read - declare the file is being used for buffered reads * @inode: file inode @@ -57,7 +48,7 @@ nfs_start_io_read(struct inode *inode) err = down_write_killable(&inode->i_rwsem); if (err) return err; - nfs_block_o_direct(nfsi, inode); + nfs_file_block_o_direct(nfsi); downgrade_write(&inode->i_rwsem); return 0; @@ -90,7 +81,7 @@ nfs_start_io_write(struct inode *inode) err = down_write_killable(&inode->i_rwsem); if (!err) - nfs_block_o_direct(NFS_I(inode), inode); + nfs_file_block_o_direct(NFS_I(inode)); return err; } diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index bd5fca285899..97abf62f109d 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -180,10 +180,8 @@ static void nfs_local_probe(struct nfs_client *clp) return; } - if (nfs_client_is_local(clp)) { - /* If already enabled, disable and re-enable */ - nfs_localio_disable_client(clp); - } + if (nfs_client_is_local(clp)) + return; if (!nfs_uuid_begin(&clp->cl_uuid)) return; @@ -244,7 +242,8 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, case -ENOMEM: case -ENXIO: case -ENOENT: - /* Revalidate localio, will disable if unsupported */ + /* Revalidate localio */ + nfs_localio_disable_client(clp); nfs_local_probe(clp); } } @@ -453,12 +452,13 @@ static void nfs_local_call_read(struct work_struct *work) nfs_local_iter_init(&iter, iocb, READ); status = filp->f_op->read_iter(&iocb->kiocb, &iter); + + revert_creds(save_cred); + if (status != -EIOCBQUEUED) { nfs_local_read_done(iocb, status); nfs_local_pgio_release(iocb); } - - revert_creds(save_cred); } static int @@ -648,14 +648,15 @@ static void nfs_local_call_write(struct work_struct *work) file_start_write(filp); status = filp->f_op->write_iter(&iocb->kiocb, &iter); file_end_write(filp); + + revert_creds(save_cred); + current->flags = old_flags; + if (status != -EIOCBQUEUED) { nfs_local_write_done(iocb, status); nfs_local_vfs_getattr(iocb); nfs_local_pgio_release(iocb); } - - revert_creds(save_cred); - current->flags = old_flags; } static int diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 7f1ec9c67ff2..f9a3a1fbf44c 
100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -335,7 +335,7 @@ static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp) num *= HZ; *((int *)kp->arg) = num; if (!list_empty(&nfs_automount_list)) - mod_delayed_work(system_wq, &nfs_automount_task, num); + mod_delayed_work(system_percpu_wq, &nfs_automount_task, num); } else { *((int *)kp->arg) = -1*HZ; cancel_delayed_work(&nfs_automount_task); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 01c01f45358b..6a0b5871ba3b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -114,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, exception.inode = inode; exception.state = lock->open_context->state; + nfs_file_block_o_direct(NFS_I(inode)); err = nfs_sync_inode(inode); if (err) goto out; @@ -137,6 +138,7 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE], }; struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); int err; if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) @@ -145,7 +147,11 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); - if (err == -EOPNOTSUPP) + + if (err == 0) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); + else if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE | NFS_CAP_ZERO_RANGE); @@ -183,6 +189,7 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE], }; struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); int err; if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE)) @@ -191,9 +198,11 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); - if (err == 0) + if (err == 0) { + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); truncate_pagecache_range(inode, offset, (offset + len) -1); - if (err == -EOPNOTSUPP) + } else if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE; inode_unlock(inode); @@ -354,22 +363,27 @@ out: /** * nfs42_copy_dest_done - perform inode cache updates after clone/copy offload - * @inode: pointer to destination inode + * @file: pointer to destination file * @pos: destination offset * @len: copy length + * @oldsize: length of the file prior to clone/copy * * Punch a hole in the inode page cache, so that the NFS client will * know to retrieve new data. * Update the file size if necessary, and then mark the inode as having * invalid cached values for change attribute, ctime, mtime and space used. 
*/ -static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len) +static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len, + loff_t oldsize) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; loff_t newsize = pos + len; loff_t end = newsize - 1; - WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_SHIFT, end >> PAGE_SHIFT)); + nfs_truncate_last_folio(mapping, oldsize, pos); + WARN_ON_ONCE(invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + end >> PAGE_SHIFT)); spin_lock(&inode->i_lock); if (newsize > i_size_read(inode)) @@ -402,6 +416,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, struct nfs_server *src_server = NFS_SERVER(src_inode); loff_t pos_src = args->src_pos; loff_t pos_dst = args->dst_pos; + loff_t oldsize_dst = i_size_read(dst_inode); size_t count = args->count; ssize_t status; @@ -430,6 +445,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, return status; } + nfs_file_block_o_direct(NFS_I(dst_inode)); status = nfs_sync_inode(dst_inode); if (status) return status; @@ -475,7 +491,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, goto out; } - nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count); + nfs42_copy_dest_done(dst, pos_dst, res->write_res.count, oldsize_dst); nfs_invalidate_atime(src_inode); status = res->write_res.count; out: @@ -1242,6 +1258,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct nfs42_clone_res res = { .server = server, }; + loff_t oldsize_dst = i_size_read(dst_inode); int status; msg->rpc_argp = &args; @@ -1276,7 +1293,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, /* a zero-length count means clone to EOF in src */ if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE) count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset; - nfs42_copy_dest_done(dst_inode, dst_offset, count); + nfs42_copy_dest_done(dst_f, dst_offset, count, oldsize_dst); status = nfs_post_op_update_inode(dst_inode, res.dst_fattr); } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 1d6b5f4230c9..c9a0d1e420c6 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -278,9 +278,11 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, lock_two_nondirectories(src_inode, dst_inode); /* flush all pending writes on both src and dst so that server * has the latest data */ + nfs_file_block_o_direct(NFS_I(src_inode)); ret = nfs_sync_inode(src_inode); if (ret) goto out_unlock; + nfs_file_block_o_direct(NFS_I(dst_inode)); ret = nfs_sync_inode(dst_inode); if (ret) goto out_unlock; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7d2b67e06cc3..ce61253efd45 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4013,8 +4013,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f res.attr_bitmask[2]; } memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); - server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | - NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL); + server->caps &= + ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS | + NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS | + NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME); server->fattr_valid = NFS_ATTR_FATTR_V4; if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) @@ -4092,7 +4094,6 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) }; int err; - nfs_server_set_init_caps(server); 
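	/*
	 * Presumably safe to drop the reset at this point:
	 * _nfs4_server_capabilities() above now clears
	 * NFS_CAP_FS_LOCATIONS, NFS_CAP_OPEN_XOR and NFS_CAP_DELEGTIME
	 * alongside the other capability bits before re-deriving them,
	 * so a retried capabilities call no longer needs a separate
	 * re-initialisation step here.
	 */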
do { err = nfs4_handle_exception(server, _nfs4_server_capabilities(server, fhandle), diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index db3811af0796..18ae614e5a6c 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -122,7 +122,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) timeout = 5 * HZ; dprintk("%s: requeueing work. Lease period = %ld\n", __func__, (timeout + HZ - 1) / HZ); - mod_delayed_work(system_wq, &clp->cl_renewd, timeout); + mod_delayed_work(system_percpu_wq, &clp->cl_renewd, timeout); set_bit(NFS_CS_RENEWD, &clp->cl_res_state); spin_unlock(&clp->cl_lock); } diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 96b1323318c2..627115179795 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -272,6 +272,7 @@ DECLARE_EVENT_CLASS(nfs_update_size_class, TP_ARGS(inode, new_size)) DEFINE_NFS_UPDATE_SIZE_EVENT(truncate); +DEFINE_NFS_UPDATE_SIZE_EVENT(truncate_folio); DEFINE_NFS_UPDATE_SIZE_EVENT(wcc); DEFINE_NFS_UPDATE_SIZE_EVENT(update); DEFINE_NFS_UPDATE_SIZE_EVENT(grow); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 8b7c04737967..647c53d1418a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -237,59 +237,17 @@ static void nfs_mapping_set_error(struct folio *folio, int error) } /* - * nfs_page_group_search_locked - * @head - head request of page group - * @page_offset - offset into page + * nfs_page_covers_folio + * @req: struct nfs_page * - * Search page group with head @head to find a request that contains the - * page offset @page_offset. - * - * Returns a pointer to the first matching nfs request, or NULL if no - * match is found. - * - * Must be called with the page group lock held - */ -static struct nfs_page * -nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) -{ - struct nfs_page *req; - - req = head; - do { - if (page_offset >= req->wb_pgbase && - page_offset < (req->wb_pgbase + req->wb_bytes)) - return req; - - req = req->wb_this_page; - } while (req != head); - - return NULL; -} - -/* - * nfs_page_group_covers_page - * @head - head request of page group - * - * Return true if the page group with head @head covers the whole page, - * returns false otherwise + * Return true if the request covers the whole folio. 
+ * Note that the caller should ensure all subrequests have been joined */ static bool nfs_page_group_covers_page(struct nfs_page *req) { unsigned int len = nfs_folio_length(nfs_page_to_folio(req)); - struct nfs_page *tmp; - unsigned int pos = 0; - - nfs_page_group_lock(req); - for (;;) { - tmp = nfs_page_group_search_locked(req->wb_head, pos); - if (!tmp) - break; - pos = tmp->wb_pgbase + tmp->wb_bytes; - } - - nfs_page_group_unlock(req); - return pos >= len; + return req->wb_pgbase == 0 && req->wb_bytes == len; } /* We can set the PG_uptodate flag if we see that a write request @@ -2045,6 +2003,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) * release it */ nfs_inode_remove_request(req); nfs_unlock_and_release_request(req); + folio_cancel_dirty(folio); } return ret; diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 732abf6b92a5..85ca663c052c 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -113,7 +113,7 @@ static void nfsd_file_schedule_laundrette(void) { if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags)) - queue_delayed_work(system_unbound_wq, &nfsd_filecache_laundrette, + queue_delayed_work(system_dfl_wq, &nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY); } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index edf050766e57..aa4a95713a48 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1951,10 +1951,9 @@ retry: goto out_dput_old; } else { struct renamedata rd = { - .old_mnt_idmap = &nop_mnt_idmap, + .mnt_idmap = &nop_mnt_idmap, .old_parent = fdentry, .old_dentry = odentry, - .new_mnt_idmap = &nop_mnt_idmap, .new_parent = tdentry, .new_dentry = ndentry, }; diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 14868a3dd592..bc52afbfc5c7 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -1075,7 +1075,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) ************************************************************************/ static ssize_t nilfs_feature_revision_show(struct kobject *kobj, - struct attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d.%d\n", NILFS_CURRENT_REV, NILFS_MINOR_REV); @@ -1087,7 +1087,7 @@ static const char features_readme_str[] = "(1) revision\n\tshow current revision of NILFS file system driver.\n"; static ssize_t nilfs_feature_README_show(struct kobject *kobj, - struct attribute *attr, + struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, features_readme_str); diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h index 78a87a016928..d370cd5cce3f 100644 --- a/fs/nilfs2/sysfs.h +++ b/fs/nilfs2/sysfs.h @@ -50,16 +50,16 @@ struct nilfs_sysfs_dev_subgroups { struct completion sg_segments_kobj_unregister; }; -#define NILFS_COMMON_ATTR_STRUCT(name) \ +#define NILFS_KOBJ_ATTR_STRUCT(name) \ struct nilfs_##name##_attr { \ struct attribute attr; \ - ssize_t (*show)(struct kobject *, struct attribute *, \ + ssize_t (*show)(struct kobject *, struct kobj_attribute *, \ char *); \ - ssize_t (*store)(struct kobject *, struct attribute *, \ + ssize_t (*store)(struct kobject *, struct kobj_attribute *, \ const char *, size_t); \ } -NILFS_COMMON_ATTR_STRUCT(feature); +NILFS_KOBJ_ATTR_STRUCT(feature); #define NILFS_DEV_ATTR_STRUCT(name) \ struct nilfs_##name##_attr { \ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 079b868552c2..46bfc543f946 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -66,7 +66,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) * removed all zero refcount inodes, in any case. 
Test to * be sure. */ - if (!atomic_read(&inode->i_count)) { + if (!icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 798340db69d7..55a03bb05aa1 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -428,7 +428,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) conn->destroy_next = connector_destroy_list; connector_destroy_list = conn; spin_unlock(&destroy_lock); - queue_work(system_unbound_wq, &connector_reaper_work); + queue_work(system_dfl_wq, &connector_reaper_work); } /* * Note that we didn't update flags telling whether inode cares about @@ -439,7 +439,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); - queue_delayed_work(system_unbound_wq, &reaper_work, + queue_delayed_work(system_dfl_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } EXPORT_SYMBOL_GPL(fsnotify_put_mark); diff --git a/fs/nsfs.c b/fs/nsfs.c index 59aa801347a7..e7fd8a790aaa 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -13,12 +13,26 @@ #include <linux/nsfs.h> #include <linux/uaccess.h> #include <linux/mnt_namespace.h> +#include <linux/ipc_namespace.h> +#include <linux/time_namespace.h> +#include <linux/utsname.h> +#include <linux/exportfs.h> +#include <linux/nstree.h> +#include <net/net_namespace.h> #include "mount.h" #include "internal.h" static struct vfsmount *nsfs_mnt; +static struct path nsfs_root_path = {}; + +void nsfs_get_root(struct path *path) +{ + *path = nsfs_root_path; + path_get(path); +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); static const struct file_operations ns_file_operations = { @@ -139,7 +153,7 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns, * the size value will be set to the size the kernel knows about. */ kinfo->size = min(usize, sizeof(*kinfo)); - kinfo->mnt_ns_id = mnt_ns->seq; + kinfo->mnt_ns_id = mnt_ns->ns.ns_id; kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts); /* Subtract the root mount of the mount namespace. */ if (kinfo->nr_mounts) @@ -163,15 +177,18 @@ static bool nsfs_ioctl_valid(unsigned int cmd) case NS_GET_TGID_FROM_PIDNS: case NS_GET_PID_IN_PIDNS: case NS_GET_TGID_IN_PIDNS: - return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); + case NS_GET_ID: + return true; } /* Extensible ioctls require some extra handling. 
*/ switch (_IOC_NR(cmd)) { case _IOC_NR(NS_MNT_GET_INFO): + return extensible_ioctl_valid(cmd, NS_MNT_GET_INFO, MNT_NS_INFO_SIZE_VER0); case _IOC_NR(NS_MNT_GET_NEXT): + return extensible_ioctl_valid(cmd, NS_MNT_GET_NEXT, MNT_NS_INFO_SIZE_VER0); case _IOC_NR(NS_MNT_GET_PREV): - return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); + return extensible_ioctl_valid(cmd, NS_MNT_GET_PREV, MNT_NS_INFO_SIZE_VER0); } return false; @@ -202,26 +219,14 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return -EINVAL; return open_related_ns(ns, ns->ops->get_parent); case NS_GET_NSTYPE: - return ns->ops->type; + return ns->ns_type; case NS_GET_OWNER_UID: - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; user_ns = container_of(ns, struct user_namespace, ns); argp = (uid_t __user *) arg; uid = from_kuid_munged(current_user_ns(), user_ns->owner); return put_user(uid, argp); - case NS_GET_MNTNS_ID: { - __u64 __user *idp; - __u64 id; - - if (ns->ops->type != CLONE_NEWNS) - return -EINVAL; - - mnt_ns = container_of(ns, struct mnt_namespace, ns); - idp = (__u64 __user *)arg; - id = mnt_ns->seq; - return put_user(id, idp); - } case NS_GET_PID_FROM_PIDNS: fallthrough; case NS_GET_TGID_FROM_PIDNS: @@ -229,7 +234,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, case NS_GET_PID_IN_PIDNS: fallthrough; case NS_GET_TGID_IN_PIDNS: { - if (ns->ops->type != CLONE_NEWPID) + if (ns->ns_type != CLONE_NEWPID) return -EINVAL; ret = -ESRCH; @@ -267,6 +272,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, ret = -ESRCH; return ret; } + case NS_GET_MNTNS_ID: + if (ns->ns_type != CLONE_NEWNS) + return -EINVAL; + fallthrough; + case NS_GET_ID: { + __u64 __user *idp; + __u64 id; + + idp = (__u64 __user *)arg; + id = ns->ns_id; + return put_user(id, idp); + } } /* extensible ioctls */ @@ -276,7 +293,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (!uinfo) @@ -297,7 +314,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct file *f __free(fput) = NULL; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (usize < MNT_NS_INFO_SIZE_VER0) @@ -415,12 +432,164 @@ static const struct stashed_operations nsfs_stashed_ops = { .put_data = nsfs_put_data, }; +#define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32)) +#define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32)) + +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct ns_common *ns = inode->i_private; + int len = *max_len; + + if (parent) + return FILEID_INVALID; + + if (len < NSFS_FID_SIZE_U32_VER0) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + return FILEID_INVALID; + } else if (len > NSFS_FID_SIZE_U32_LATEST) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + } + + fid->ns_id = ns->ns_id; + fid->ns_type = ns->ns_type; + fid->ns_inum = inode->i_ino; + return FILEID_NSFS; +} + +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct path path __free(path_put) = {}; + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct user_namespace *owning_ns = NULL; + struct ns_common *ns; + int ret; + + if (fh_len < 
NSFS_FID_SIZE_U32_VER0) + return NULL; + + /* Check that any trailing bytes are zero. */ + if ((fh_len > NSFS_FID_SIZE_U32_LATEST) && + memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0, + fh_len - NSFS_FID_SIZE_U32_LATEST)) + return NULL; + + switch (fh_type) { + case FILEID_NSFS: + break; + default: + return NULL; + } + + scoped_guard(rcu) { + ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); + if (!ns) + return NULL; + + VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); + VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); + + if (!__ns_ref_get(ns)) + return NULL; + } + + switch (ns->ns_type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + if (!current_in_namespace(to_cg_ns(ns))) + owning_ns = to_cg_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + if (!current_in_namespace(to_ipc_ns(ns))) + owning_ns = to_ipc_ns(ns)->user_ns; + break; +#endif + case CLONE_NEWNS: + if (!current_in_namespace(to_mnt_ns(ns))) + owning_ns = to_mnt_ns(ns)->user_ns; + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + if (!current_in_namespace(to_net_ns(ns))) + owning_ns = to_net_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + if (!current_in_namespace(to_pid_ns(ns))) { + owning_ns = to_pid_ns(ns)->user_ns; + } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + if (!current_in_namespace(to_time_ns(ns))) + owning_ns = to_time_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + if (!current_in_namespace(to_user_ns(ns))) + owning_ns = to_user_ns(ns); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + if (!current_in_namespace(to_uts_ns(ns))) + owning_ns = to_uts_ns(ns)->user_ns; + break; +#endif + default: + return ERR_PTR(-EOPNOTSUPP); + } + + if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + + /* path_from_stashed() unconditionally consumes the reference. */ + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (ret) + return ERR_PTR(ret); + + return no_free_ptr(path.dentry); +} + +static int nsfs_export_permission(struct handle_to_path_ctx *ctx, + unsigned int oflags) +{ + /* nsfs_fh_to_dentry() performs all permission checks. 
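The encode/decode pair above lets a namespace be pinned as a file handle and reopened later, with nsfs_fh_to_dentry() redoing the capability checks against the owning user namespace. An illustrative userspace round trip (a sketch: glibc 2.36+ wrappers assumed, error handling elided, and nsfs_fd — some already-open fd on nsfs used to anchor open_by_handle_at() — is hypothetical):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdlib.h>

	struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	int mount_id, nsfd;

	fh->handle_bytes = MAX_HANDLE_SZ;
	name_to_handle_at(AT_FDCWD, "/proc/self/ns/uts", fh, &mount_id, 0);
	/* ... later, possibly from another process ... */
	nsfd = open_by_handle_at(nsfs_fd, fh, O_RDONLY);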
*/ + return 0; +} + +static struct file *nsfs_export_open(struct path *path, unsigned int oflags) +{ + return file_open_root(path, "", oflags, 0); +} + +static const struct export_operations nsfs_export_operations = { + .encode_fh = nsfs_encode_fh, + .fh_to_dentry = nsfs_fh_to_dentry, + .open = nsfs_export_open, + .permission = nsfs_export_permission, +}; + static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &nsfs_ops; + ctx->eops = &nsfs_export_operations; ctx->dops = &ns_dentry_operations; fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; @@ -438,4 +607,6 @@ void __init nsfs_init(void) if (IS_ERR(nsfs_mnt)) panic("can't set nsfs up\n"); nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; + nsfs_root_path.mnt = nsfs_mnt; + nsfs_root_path.dentry = nsfs_mnt->mnt_root; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 2018501b2249..2347a50f079b 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1876,7 +1876,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) dlm_debug_init(dlm); snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); - dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0); + dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!dlm->dlm_worker) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 5130ec44e5e1..cccaa1d6fbba 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -547,7 +547,7 @@ static const struct super_operations dlmfs_ops = { .alloc_inode = dlmfs_alloc_inode, .free_inode = dlmfs_free_inode, .evict_inode = dlmfs_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct inode_operations dlmfs_file_inode_operations = { @@ -595,7 +595,8 @@ static int __init init_dlmfs_fs(void) } cleanup_inode = 1; - user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0); + user_dlm_worker = alloc_workqueue("user_dlm", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!user_dlm_worker) { status = -ENOMEM; goto bail; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 930150ed5db1..ef147e8b3271 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -706,6 +706,8 @@ out: * it not only handles the fiemap for inlined files, but also deals * with the fast symlink, cause they have no difference for extent * mapping per se. + * + * Must be called with ip_alloc_sem semaphore held. 
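The ocfs2 extent_map.c hunks just below drop and retake ip_alloc_sem around fiemap_fill_next_extent(): filling the fiemap buffer can fault on the user mapping, and taking a page fault with ip_alloc_sem held is the deadlock the new comments warn about. The pattern, condensed from the hunks:

	/* condensed from the ocfs2_fiemap() hunk below */
	up_read(&OCFS2_I(inode)->ip_alloc_sem);
	ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
				      len_bytes, fe_flags);
	down_read(&OCFS2_I(inode)->ip_alloc_sem);
	if (ret)
		break;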
*/ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct fiemap_extent_info *fieinfo, @@ -717,6 +719,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, u64 phys; u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; struct ocfs2_inode_info *oi = OCFS2_I(inode); + lockdep_assert_held_read(&oi->ip_alloc_sem); di = (struct ocfs2_dinode *)di_bh->b_data; if (ocfs2_inode_is_fast_symlink(inode)) @@ -732,8 +735,11 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret < 0) return ret; } @@ -802,9 +808,11 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; - + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, len_bytes, fe_flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret) break; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 14bf440ea4df..6c4f78f473fb 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1281,6 +1281,9 @@ static void ocfs2_clear_inode(struct inode *inode) * the journal is flushed before journal shutdown. Thus it is safe to * have inodes get cleaned up after journal shutdown. */ + if (!osb->journal) + return; + jbd2_journal_release_jbd_inode(osb->journal->j_journal, &oi->ip_jinode); } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 8f732742b26e..267b50e8e42e 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4418,7 +4418,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, return error; } - new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_user_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) { mlog_errno(error); @@ -4435,7 +4435,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, d_inode(new_path.dentry), new_dentry, preserve); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 0f045e45fa0c..765105f1ff8a 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -952,7 +952,7 @@ static const struct dlm_lockspace_ops ocfs2_ls_ops = { static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) { version_unlock(conn); - dlm_release_lockspace(conn->cc_lockspace, 2); + dlm_release_lockspace(conn->cc_lockspace, DLM_RELEASE_NORMAL); conn->cc_lockspace = NULL; ocfs2_live_connection_drop(conn->cc_private); conn->cc_private = NULL; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index f3da840758e7..b46100a4f529 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -306,7 +306,7 @@ static const struct super_operations orangefs_s_ops = { .free_inode = orangefs_free_inode, .destroy_inode = orangefs_destroy_inode, .write_inode = orangefs_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .statfs = orangefs_statfs, 
.show_options = orangefs_show_options, }; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index bb0d7ded8e76..4f84abaa0d68 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -361,10 +361,9 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir, { int err; struct renamedata rd = { - .old_mnt_idmap = ovl_upper_mnt_idmap(ofs), + .mnt_idmap = ovl_upper_mnt_idmap(ofs), .old_parent = olddir, .old_dentry = olddentry, - .new_mnt_idmap = ovl_upper_mnt_idmap(ofs), .new_parent = newdir, .new_dentry = newdentry, .flags = flags, diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index b65cdfce31ce..15cb06fa0c9a 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -270,26 +270,26 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name, static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { - int err; + int err = 0; struct dentry *dentry, *dir = path->dentry; const struct cred *old_cred; old_cred = ovl_override_creds(rdd->dentry->d_sb); - err = down_write_killable(&dir->d_inode->i_rwsem); - if (!err) { - while (rdd->first_maybe_whiteout) { - struct ovl_cache_entry *p = - rdd->first_maybe_whiteout; - rdd->first_maybe_whiteout = p->next_maybe_whiteout; - dentry = lookup_one(mnt_idmap(path->mnt), - &QSTR_LEN(p->name, p->len), dir); - if (!IS_ERR(dentry)) { - p->is_whiteout = ovl_is_whiteout(dentry); - dput(dentry); - } + while (rdd->first_maybe_whiteout) { + struct ovl_cache_entry *p = + rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p->next_maybe_whiteout; + dentry = lookup_one_positive_killable(mnt_idmap(path->mnt), + &QSTR_LEN(p->name, p->len), + dir); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } else if (PTR_ERR(dentry) == -EINTR) { + err = -EINTR; + break; } - inode_unlock(dir->d_inode); } ovl_revert_creds(old_cred); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index df85a76597e9..bd3d7ba8fb95 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -280,7 +280,7 @@ static const struct super_operations ovl_super_operations = { .alloc_inode = ovl_alloc_inode, .free_inode = ovl_free_inode, .destroy_inode = ovl_destroy_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = ovl_put_super, .sync_fs = ovl_sync_fs, .statfs = ovl_statfs, diff --git a/fs/pidfs.c b/fs/pidfs.c index 108e7527f837..c40c29c702e5 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -440,7 +440,7 @@ static bool pidfs_ioctl_valid(unsigned int cmd) * erronously mistook the file descriptor for a pidfd. * This is not perfect but will catch most cases. 
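A recurring one-liner in this section swaps .drop_inode = generic_delete_inode for inode_just_drop (dlmfs and orangefs above; overlayfs, pidfs, proc, pstore and ramfs below). The mechanical nature of the conversions suggests a pure rename rather than a behaviour change: the callback still tells iput_final() to evict the inode as soon as the last reference drops instead of caching it. Assumed equivalent:

	/* assumed semantics of the renamed helper; body not part of this diff */
	int inode_just_drop(struct inode *inode)
	{
		return 1;	/* always evict, never keep in the inode cache */
	}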
*/ - return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); + return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0); } return false; @@ -718,7 +718,7 @@ static void pidfs_evict_inode(struct inode *inode) } static const struct super_operations pidfs_sops = { - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pidfs_evict_inode, .statfs = simple_statfs, }; diff --git a/fs/pipe.c b/fs/pipe.c index 731622d0738d..42fead1efe52 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&pipe->mutex); if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } @@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) for (;;) { if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; diff --git a/fs/proc/array.c b/fs/proc/array.c index d6a0369caa93..69269745d73b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -157,13 +157,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, unsigned int max_fds = 0; rcu_read_lock(); - ppid = pid_alive(p) ? - task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; - tracer = ptrace_parent(p); if (tracer) tpid = task_pid_nr_ns(tracer, ns); + ppid = task_ppid_nr_ns(p, ns); tgid = task_tgid_nr_ns(p, ns); ngid = task_numa_group_id(p); cred = get_task_cred(p); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 76e800e38c8f..176281112273 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -367,6 +367,25 @@ static const struct inode_operations proc_dir_inode_operations = { .setattr = proc_notify_change, }; +static void pde_set_flags(struct proc_dir_entry *pde) +{ + const struct proc_ops *proc_ops = pde->proc_ops; + + if (!proc_ops) + return; + + if (proc_ops->proc_flags & PROC_ENTRY_PERMANENT) + pde->flags |= PROC_ENTRY_PERMANENT; + if (proc_ops->proc_read_iter) + pde->flags |= PROC_ENTRY_proc_read_iter; +#ifdef CONFIG_COMPAT + if (proc_ops->proc_compat_ioctl) + pde->flags |= PROC_ENTRY_proc_compat_ioctl; +#endif + if (proc_ops->proc_lseek) + pde->flags |= PROC_ENTRY_proc_lseek; +} + /* returns the registered entry, or frees dp and returns NULL on failure */ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, struct proc_dir_entry *dp) @@ -374,6 +393,9 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, if (proc_alloc_inum(&dp->low_ino)) goto out_free_entry; + if (!S_ISDIR(dp->mode)) + pde_set_flags(dp); + write_lock(&proc_subdir_lock); dp->parent = dir; if (pde_subdir_insert(dir, dp) == false) { @@ -561,20 +583,6 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, return p; } -static void pde_set_flags(struct proc_dir_entry *pde) -{ - if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) - pde->flags |= PROC_ENTRY_PERMANENT; - if (pde->proc_ops->proc_read_iter) - pde->flags |= PROC_ENTRY_proc_read_iter; -#ifdef CONFIG_COMPAT - if (pde->proc_ops->proc_compat_ioctl) - pde->flags |= PROC_ENTRY_proc_compat_ioctl; -#endif - if (pde->proc_ops->proc_lseek) - pde->flags |= PROC_ENTRY_proc_lseek; -} - struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops, void *data) @@ -585,7 +593,6 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, if 
(!p) return NULL; p->proc_ops = proc_ops; - pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); @@ -636,7 +643,6 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, p->proc_ops = &proc_seq_ops; p->seq_ops = ops; p->state_size = state_size; - pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_seq_private); @@ -667,7 +673,6 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, return NULL; p->proc_ops = &proc_single_ops; p->single_show = show; - pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_single_data); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 129490151be1..d9b7ef122343 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -187,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .free_inode = proc_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = proc_evict_inode, .statfs = simple_statfs, .show_options = proc_show_options, diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 4403a2e20c16..ea2b597fd92c 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -12,7 +12,7 @@ #include "internal.h" -static const struct proc_ns_operations *ns_entries[] = { +static const struct proc_ns_operations *const ns_entries[] = { #ifdef CONFIG_NET_NS &netns_operations, #endif @@ -117,7 +117,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry, static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) { struct task_struct *task = get_proc_task(file_inode(file)); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; if (!task) return -ENOENT; @@ -151,7 +151,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct task_struct *task = get_proc_task(dir); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; unsigned int len = dentry->d_name.len; struct dentry *res = ERR_PTR(-ENOENT); diff --git a/fs/proc/root.c b/fs/proc/root.c index ed86ac710384..1e24e085c7d5 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -38,12 +38,14 @@ enum proc_param { Opt_gid, Opt_hidepid, Opt_subset, + Opt_pidns, }; static const struct fs_parameter_spec proc_fs_parameters[] = { - fsparam_u32("gid", Opt_gid), + fsparam_u32("gid", Opt_gid), fsparam_string("hidepid", Opt_hidepid), fsparam_string("subset", Opt_subset), + fsparam_file_or_string("pidns", Opt_pidns), {} }; @@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value) return 0; } +#ifdef CONFIG_PID_NS +static int proc_parse_pidns_param(struct fs_context *fc, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + struct proc_fs_context *ctx = fc->fs_private; + struct pid_namespace *target, *active = task_active_pid_ns(current); + struct ns_common *ns; + struct file *ns_filp __free(fput) = NULL; + + switch (param->type) { + case fs_value_is_file: + /* came through fsconfig, steal the file reference */ + ns_filp = no_free_ptr(param->file); + break; + case fs_value_is_string: + ns_filp = filp_open(param->string, O_RDONLY, 0); + break; + default: + WARN_ON_ONCE(true); + break; + } + if (!ns_filp) + ns_filp = ERR_PTR(-EBADF); + if (IS_ERR(ns_filp)) { + errorfc(fc, "could not get file from pidns 
argument"); + return PTR_ERR(ns_filp); + } + + if (!proc_ns_file(ns_filp)) + return invalfc(fc, "pidns argument is not an nsfs file"); + ns = get_proc_ns(file_inode(ns_filp)); + if (ns->ns_type != CLONE_NEWPID) + return invalfc(fc, "pidns argument is not a pidns file"); + target = container_of(ns, struct pid_namespace, ns); + + /* + * pidns= is shorthand for joining the pidns to get a fsopen fd, so the + * permission model should be the same as pidns_install(). + */ + if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) { + errorfc(fc, "insufficient permissions to set pidns"); + return -EPERM; + } + if (!pidns_is_ancestor(target, active)) + return invalfc(fc, "cannot set pidns to non-descendant pidns"); + + put_pid_ns(ctx->pid_ns); + ctx->pid_ns = get_pid_ns(target); + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + return 0; +} +#endif /* CONFIG_PID_NS */ + static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - int opt; + int opt, err; opt = fs_parse(fc, proc_fs_parameters, param, &result); if (opt < 0) @@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_hidepid: - if (proc_parse_hidepid_param(fc, param)) - return -EINVAL; + err = proc_parse_hidepid_param(fc, param); + if (err) + return err; break; case Opt_subset: - if (proc_parse_subset_param(fc, param->string) < 0) - return -EINVAL; + err = proc_parse_subset_param(fc, param->string); + if (err) + return err; + break; + + case Opt_pidns: +#ifdef CONFIG_PID_NS + /* + * We would have to RCU-protect every proc_pid_ns() or + * proc_sb_info() access if we allowed this to be reconfigured + * for an existing procfs instance. Luckily, procfs instances + * are cheap to create, and mount-beneath would let you + * atomically replace an instance even with overmounts. 
+ */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + errorfc(fc, "cannot reconfigure pidns for existing procfs"); + return -EBUSY; + } + err = proc_parse_pidns_param(fc, param, &result); + if (err) + return err; break; +#else + errorfc(fc, "pidns mount flag not supported on this system"); + return -EOPNOTSUPP; +#endif default: return -EINVAL; @@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info, fs_info->hide_pid = ctx->hidepid; if (ctx->mask & (1 << Opt_subset)) fs_info->pidonly = ctx->pidonly; + if (ctx->mask & (1 << Opt_pidns) && + !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) { + put_pid_ns(fs_info->pid_ns); + fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + } } static int proc_fill_super(struct super_block *s, struct fs_context *fc) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 29cca0e6d0ff..b26ae556b446 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2417,6 +2417,9 @@ static void pagemap_scan_backout_range(struct pagemap_scan_private *p, { struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + if (!p->vec_buf) + return; + if (cur_buf->start != addr) cur_buf->end = addr; else diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 1a2e1185426c..b4e55c90f8dc 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -282,7 +282,7 @@ static int pstore_reconfigure(struct fs_context *fc) static const struct super_operations pstore_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pstore_evict_inode, .show_options = pstore_show_options, }; diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index ceb5639a0629..eb61ba5bb964 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -43,7 +43,7 @@ struct psz_buffer { * * @magic: magic num for kmsg dump header * @time: kmsg dump trigger time - * @compressed: whether conpressed + * @compressed: whether compressed * @counter: kmsg dump counter * @reason: the kmsg dump reason (e.g. oops, panic, etc) * @data: pointer to log data @@ -214,7 +214,7 @@ static int psz_zone_write(struct pstore_zone *zone, atomic_set(&zone->buffer->datalen, wlen + off); } - /* avoid to damage old records */ + /* avoid damaging old records */ if (!is_on_panic() && !atomic_read(&pstore_zone_cxt.recovered)) goto dirty; @@ -249,7 +249,7 @@ static int psz_zone_write(struct pstore_zone *zone, return 0; dirty: - /* no need to mark dirty if going to try next zone */ + /* no need to mark it dirty if going to try next zone */ if (wcnt == -ENOMSG) return -ENOMSG; atomic_set(&zone->dirty, true); @@ -378,7 +378,7 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt) struct timespec64 time = { }; unsigned long i; /* - * Recover may on panic, we can't allocate any memory by kmalloc. + * Recover may happen on panic, we can't allocate any memory by kmalloc. * So, we use local array instead. */ char buffer_header[sizeof(*buf) + sizeof(*hdr)] = {0}; @@ -856,11 +856,11 @@ static int notrace psz_record_write(struct pstore_zone *zone, /** * psz_zone_write will set datalen as start + cnt. - * It work if actual data length lesser than buffer size. - * If data length greater than buffer size, pmsg will rewrite to - * beginning of zone, which make buffer->datalen wrongly. + * It works if actual data length is lesser than buffer size. + * If data length is greater than buffer size, pmsg will rewrite to + * the beginning of the zone, which makes buffer->datalen wrong. 
* So we should reset datalen as buffer size once actual data length - greater than buffer size. + * is greater than buffer size. */ if (is_full_data) { atomic_set(&zone->buffer->datalen, zone->buffer_size); @@ -878,8 +878,9 @@ static int notrace psz_pstore_write(struct pstore_record *record) atomic_set(&cxt->on_panic, 1); /* - * if on panic, do not write except panic records - * Fix case that panic_write prints log which wakes up console backend. + * If on panic, do not write anything except panic records. + * Fix the case when panic_write prints log that wakes up + * console backend. */ if (is_on_panic() && record->type != PSTORE_TYPE_DMESG) return -EBUSY; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index df4a9b348769..afa15a214538 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -881,7 +881,7 @@ void dqput(struct dquot *dquot) put_releasing_dquots(dquot); atomic_dec(&dquot->dq_count); spin_unlock(&dq_list_lock); - queue_delayed_work(system_unbound_wq, &quota_release_work, 1); + queue_delayed_work(system_dfl_wq, &quota_release_work, 1); } EXPORT_SYMBOL(dqput); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index f8874c3b8c1e..41f9995da7ca 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -215,7 +215,7 @@ static int ramfs_show_options(struct seq_file *m, struct dentry *root) static const struct super_operations ramfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = ramfs_show_options, }; diff --git a/fs/read_write.c b/fs/read_write.c index c5b6265d984b..833bae068770 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1576,6 +1576,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (len == 0) return 0; + /* + * Make sure return value doesn't overflow in 32bit compat mode. Also + * limit the size for all cases except when calling ->copy_file_range(). + */ + if (splice || !file_out->f_op->copy_file_range || in_compat_syscall()) + len = min_t(size_t, MAX_RW_COUNT, len); + file_start_write(file_out); /* @@ -1589,9 +1596,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, - min_t(loff_t, MAX_RW_COUNT, len), - REMAP_FILE_CAN_SHORTEN); + file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; @@ -1624,8 +1629,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * to splicing from input file, while file_start_write() is held on * the output file on a different sb.
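The new clamp in vfs_copy_file_range() above addresses the overflow its comment names: the syscall returns ssize_t, and a 32-bit compat task has an SSIZE_MAX of 2^31 - 1, so an unclamped count could make the return value go negative. Only a filesystem's own ->copy_file_range() may still see the full length. Illustration:

	/* a compat task requests 4 GiB in one call */
	size_t len = 4ULL << 30;

	len = min_t(size_t, MAX_RW_COUNT, len);	/* MAX_RW_COUNT = INT_MAX & PAGE_MASK */
	/* at most ~2 GiB copied per call; userspace loops on the short return */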
*/ - ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, - min_t(size_t, len, MAX_RW_COUNT), 0); + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0); done: if (ret > 0) { fsnotify_access(file_in); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index d98e0d2de09f..0d0ef54fc4de 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -473,12 +473,12 @@ ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, rdt_last_cmd_clear(); if (!strcmp(buf, "mbm_local_bytes")) { - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; else ret = -EINVAL; } else if (!strcmp(buf, "mbm_total_bytes")) { - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; else ret = -EINVAL; @@ -563,10 +563,15 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, rr->r = r; rr->d = d; rr->first = first; - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); - if (IS_ERR(rr->arch_mon_ctx)) { - rr->err = -EINVAL; - return; + if (resctrl_arch_mbm_cntr_assign_enabled(r) && + resctrl_is_mbm_event(evtid)) { + rr->is_mbm_cntr = true; + } else { + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } } cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); @@ -582,7 +587,8 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); + if (rr->arch_mon_ctx) + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) @@ -625,11 +631,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) */ list_for_each_entry(d, &r->mon_domains, hdr.list) { if (d->ci_id == domid) { - rr.ci_id = d->ci_id; cpu = cpumask_any(&d->hdr.cpu_mask); ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) continue; + rr.ci = ci; mon_event_read(&rr, r, NULL, rdtgrp, &ci->shared_cpu_map, evtid, false); goto checkresult; @@ -653,10 +659,16 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) checkresult: + /* + * -ENOENT is a special case, set only when "mbm_event" counter assignment + * mode is enabled and no counter has been assigned. + */ if (rr.err == -EIO) seq_puts(m, "Error\n"); else if (rr.err == -EINVAL) seq_puts(m, "Unavailable\n"); + else if (rr.err == -ENOENT) + seq_puts(m, "Unassigned\n"); else seq_printf(m, "%llu\n", rr.val); diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 0a1eedba2b03..cf1fd82dc5a9 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -52,19 +52,31 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) } /** - * struct mon_evt - Entry in the event list of a resource + * struct mon_evt - Properties of a monitor event * @evtid: event id + * @rid: resource id for this event * @name: name of the event + * @evt_cfg: Event configuration value that represents the + * memory transactions (e.g., READS_TO_LOCAL_MEM, + * READS_TO_REMOTE_MEM) being tracked by @evtid. + * Only valid if @evtid is an MBM event. 
* @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list + * @enabled: true if the event is enabled */ struct mon_evt { enum resctrl_event_id evtid; + enum resctrl_res_level rid; char *name; + u32 evt_cfg; bool configurable; - struct list_head list; + bool enabled; }; +extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; + +#define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \ + mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++) + /** * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. @@ -98,7 +110,9 @@ struct mon_data { * domains in @r sharing L3 @ci.id * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. - * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains. + * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @is_mbm_cntr: true if "mbm_event" counter assignment mode is enabled and it + * is an MBM event. * @err: Error encountered when reading counter. * @val: Returned value of event counter. If @rgrp is a parent resource group, * @val includes the sum of event counts from its child resource groups. @@ -112,7 +126,8 @@ struct rmid_read { struct rdt_mon_domain *d; enum resctrl_event_id evtid; bool first; - unsigned int ci_id; + struct cacheinfo *ci; + bool is_mbm_cntr; int err; u64 val; void *arch_mon_ctx; @@ -226,6 +241,8 @@ struct rdtgroup { #define RFTYPE_DEBUG BIT(10) +#define RFTYPE_ASSIGN_CONFIG BIT(11) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) @@ -375,6 +392,41 @@ bool closid_allocated(unsigned int closid); int resctrl_find_cleanest_closid(void); +void *rdt_kn_parent_priv(struct kernfs_node *kn); + +int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + +void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn, + bool show); + +int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, + void *v); + +void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); + +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); + +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v); + +ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index f5637855c3ac..4076336fbba6 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -336,7 +336,7 @@ void free_rmid(u32 closid, u32 rmid) entry = __rmid_entry(idx); - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) add_rmid_to_limbo(entry); else list_add_tail(&entry->list, &rmid_free_lru); @@ -346,28 
+346,97 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct mbm_state *state; - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: + if (!resctrl_is_mbm_event(evtid)) return NULL; + + state = d->mbm_states[MBM_STATE_IDX(evtid)]; + + return state ? &state[idx] : NULL; +} + +/* + * mbm_cntr_get() - Return the counter ID for the matching @evtid and @rdtgrp. + * + * Return: + * Valid counter ID on success, or -ENOENT on failure. + */ +static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) +{ + int cntr_id; + + if (!r->mon.mbm_cntr_assignable) + return -ENOENT; + + if (!resctrl_is_mbm_event(evtid)) + return -ENOENT; + + for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) { + if (d->cntr_cfg[cntr_id].rdtgrp == rdtgrp && + d->cntr_cfg[cntr_id].evtid == evtid) + return cntr_id; + } + + return -ENOENT; +} + +/* + * mbm_cntr_alloc() - Initialize and return a new counter ID in the domain @d. + * Caller must ensure that the specified event is not assigned already. + * + * Return: + * Valid counter ID on success, or -ENOSPC on failure. + */ +static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) +{ + int cntr_id; + + for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) { + if (!d->cntr_cfg[cntr_id].rdtgrp) { + d->cntr_cfg[cntr_id].rdtgrp = rdtgrp; + d->cntr_cfg[cntr_id].evtid = evtid; + return cntr_id; + } } + + return -ENOSPC; } -static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) +/* + * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d. + */ +static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) +{ + memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); +} + +static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { int cpu = smp_processor_id(); + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; struct rdt_mon_domain *d; - struct cacheinfo *ci; + int cntr_id = -ENOENT; struct mbm_state *m; int err, ret; u64 tval = 0; + if (rr->is_mbm_cntr) { + cntr_id = mbm_cntr_get(rr->r, rr->d, rdtgrp, rr->evtid); + if (cntr_id < 0) { + rr->err = -ENOENT; + return -EINVAL; + } + } + if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + if (rr->is_mbm_cntr) + resctrl_arch_reset_cntr(rr->r, rr->d, closid, rmid, cntr_id, rr->evtid); + else + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); m = get_mbm_state(rr->d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); @@ -378,8 +447,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) /* Reading a single domain, must be on a CPU in that domain. 
*/ if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) return -EINVAL; - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->is_mbm_cntr) + rr->err = resctrl_arch_cntr_read(rr->r, rr->d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -389,8 +462,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) } /* Summing domains that share a cache, must be on a CPU for that cache. */ - ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); - if (!ci || ci->id != rr->ci_id) + if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) return -EINVAL; /* @@ -402,10 +474,14 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) */ ret = -EINVAL; list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci_id != rr->ci_id) + if (d->ci_id != rr->ci->id) continue; - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->is_mbm_cntr) + err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -421,8 +497,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). - * @closid: The closid used to identify the cached mbm_state. - * @rmid: The rmid used to identify the cached mbm_state. + * @rdtgrp: resctrl group associated with the CLOSID and RMID to identify + * the cached mbm_state. * @rr: The struct rmid_read populated by __mon_event_count(). * * Supporting function to calculate the memory bandwidth @@ -430,9 +506,11 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) * __mon_event_count() is compared with the chunks value from the previous * invocation. This must be called once per second to maintain values in MBps. 
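For the arithmetic: using the local variables declared in the hunk (cur_bw, bytes, cur_bytes), the once-per-second update amounts to the following sketch (the function body is not part of this diff, and the mbm_state field names are assumed from context):

	cur_bytes = rr->val;			/* counter read by __mon_event_count() */
	bytes = cur_bytes - m->prev_bw_bytes;	/* bytes moved in the last second */
	m->prev_bw_bytes = cur_bytes;
	cur_bw = bytes / SZ_1M;			/* one-second interval, so bytes -> MBps */
	m->prev_bw = cur_bw;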
*/ -static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) +static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { u64 cur_bw, bytes, cur_bytes; + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; struct mbm_state *m; m = get_mbm_state(rr->d, closid, rmid, rr->evtid); @@ -461,7 +539,7 @@ void mon_event_count(void *info) rdtgrp = rr->rgrp; - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp, rr); /* * For Ctrl groups read data from child monitor groups and @@ -472,8 +550,7 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->closid, entry->mon.rmid, - rr) == 0) + if (__mon_event_count(entry, rr) == 0) ret = 0; } } @@ -604,44 +681,49 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) } static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid, enum resctrl_event_id evtid) + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { struct rmid_read rr = {0}; rr.r = r; rr.d = d; rr.evtid = evtid; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; + if (resctrl_arch_mbm_cntr_assign_enabled(r)) { + rr.is_mbm_cntr = true; + } else { + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } } - __mon_event_count(closid, rmid, &rr); + __mon_event_count(rdtgrp, &rr); /* * If the software controller is enabled, compute the * bandwidth for this event id. */ if (is_mba_sc(NULL)) - mbm_bw_count(closid, rmid, &rr); + mbm_bw_count(rdtgrp, &rr); - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); + if (rr.arch_mon_ctx) + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid) + struct rdtgroup *rdtgrp) { /* * This is protected from concurrent reads from user as both * the user and overflow handler hold the global mutex. */ - if (resctrl_arch_is_mbm_total_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_TOTAL_EVENT_ID); - if (resctrl_arch_is_mbm_local_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_LOCAL_EVENT_ID); } /* @@ -714,11 +796,11 @@ void mbm_handle_overflow(struct work_struct *work) d = container_of(work, struct rdt_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); + mbm_update(r, d, prgrp); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); + mbm_update(r, d, crgrp); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); @@ -844,38 +926,819 @@ out_unlock: mutex_unlock(&rdtgroup_mutex); } -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, +/* + * All available events. 
Architecture code marks the ones that + * are supported by a system using resctrl_enable_mon_event() + * to set .enabled. + */ +struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { + [QOS_L3_OCCUP_EVENT_ID] = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_TOTAL_EVENT_ID] = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_LOCAL_EVENT_ID] = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, }; -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +void resctrl_enable_mon_event(enum resctrl_event_id eventid) +{ + if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) + return; + if (mon_event_all[eventid].enabled) { + pr_warn("Duplicate enable for event %d\n", eventid); + return; + } + + mon_event_all[eventid].enabled = true; +} + +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) +{ + return eventid >= QOS_FIRST_EVENT && eventid < QOS_NUM_EVENTS && + mon_event_all[eventid].enabled; +} + +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) +{ + return mon_event_all[evtid].evt_cfg; +} + +/** + * struct mbm_transaction - Memory transaction an MBM event can be configured with. + * @name: Name of memory transaction (read, write ...). + * @val: The bit (eg. READS_TO_LOCAL_MEM or READS_TO_REMOTE_MEM) used to + * represent the memory transaction within an event's configuration. + */ +struct mbm_transaction { + char name[32]; + u32 val; }; -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +/* Decoded values for each type of memory transaction. 
*/ +static struct mbm_transaction mbm_transactions[NUM_MBM_TRANSACTIONS] = { + {"local_reads", READS_TO_LOCAL_MEM}, + {"remote_reads", READS_TO_REMOTE_MEM}, + {"local_non_temporal_writes", NON_TEMP_WRITE_TO_LOCAL_MEM}, + {"remote_non_temporal_writes", NON_TEMP_WRITE_TO_REMOTE_MEM}, + {"local_reads_slow_memory", READS_TO_LOCAL_S_MEM}, + {"remote_reads_slow_memory", READS_TO_REMOTE_S_MEM}, + {"dirty_victim_writes_all", DIRTY_VICTIMS_TO_ALL_MEM}, }; +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct mon_evt *mevt = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r; + bool sep = false; + int ret = 0, i; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + r = resctrl_arch_get_resource(mevt->rid); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (mevt->evt_cfg & mbm_transactions[i].val) { + if (sep) + seq_putc(seq, ','); + seq_printf(seq, "%s", mbm_transactions[i].name); + sep = true; + } + } + seq_putc(seq, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, struct seq_file *s, + void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + seq_printf(s, "%u\n", r->mon.mbm_assign_on_mkdir); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + bool value; + int ret; + + ret = kstrtobool(buf, &value); + if (ret) + return ret; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + r->mon.mbm_assign_on_mkdir = value; + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret ?: nbytes; +} + +/* + * mbm_cntr_free_all() - Clear all the counter ID configuration details in the + * domain @d. Called when mbm_assign_mode is changed. + */ +static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + memset(d->cntr_cfg, 0, sizeof(*d->cntr_cfg) * r->mon.num_mbm_cntrs); +} + +/* + * resctrl_reset_rmid_all() - Reset all non-architecture states for all the + * supported RMIDs. + */ +static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + enum resctrl_event_id evt; + int idx; + + for_each_mbm_event_id(evt) { + if (!resctrl_is_mon_event_enabled(evt)) + continue; + idx = MBM_STATE_IDX(evt); + memset(d->mbm_states[idx], 0, sizeof(*d->mbm_states[0]) * idx_limit); + } +} + +/* + * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID + * pair in the domain. + * + * Assign the counter if @assign is true else unassign the counter. Reset the + * associated non-architectural state. 
+ */ +static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct mbm_state *m; + + resctrl_arch_config_cntr(r, d, evtid, rmid, closid, cntr_id, assign); + + m = get_mbm_state(d, closid, rmid, evtid); + if (m) + memset(m, 0, sizeof(*m)); +} + +/* + * rdtgroup_alloc_assign_cntr() - Allocate a counter ID and assign it to the event + * pointed to by @mevt and the resctrl group @rdtgrp within the domain @d. + * + * Return: + * 0 on success, < 0 on failure. + */ +static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int cntr_id; + + /* No action required if the counter is assigned already. */ + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); + if (cntr_id >= 0) + return 0; + + cntr_id = mbm_cntr_alloc(r, d, rdtgrp, mevt->evtid); + if (cntr_id < 0) { + rdt_last_cmd_printf("Failed to allocate counter for %s in domain %d\n", + mevt->name, d->hdr.id); + return cntr_id; + } + + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, true); + + return 0; +} + /* - * Initialize the event list for the resource. + * rdtgroup_assign_cntr_event() - Assign a hardware counter for the event in + * @mevt to the resctrl group @rdtgrp. Assign counters to all domains if @d is + * NULL; otherwise, assign the counter to the specified domain @d. + * + * If all counters in a domain are already in use, rdtgroup_alloc_assign_cntr() + * will fail. The assignment process will abort at the first failure encountered + * during domain traversal, which may result in the event being only partially + * assigned. * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. + * Return: + * 0 on success, < 0 on failure. + */ +static int rdtgroup_assign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + int ret = 0; + + if (!d) { + list_for_each_entry(d, &r->mon_domains, hdr.list) { + ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt); + if (ret) + return ret; + } + } else { + ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt); + } + + return ret; +} + +/* + * rdtgroup_assign_cntrs() - Assign counters to MBM events. Called when + * a new group is created. + * + * Each group can accommodate two counters per domain: one for the total + * event and one for the local event. Assignments may fail due to the limited + * number of counters. However, it is not necessary to fail the group creation + * and thus no failure is returned. Users have the option to modify the + * counter assignments after the group has been created. 
+ */ +void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r) || + !r->mon.mbm_assign_on_mkdir) + return; + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + rdtgroup_assign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + rdtgroup_assign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); +} + +/* + * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration + * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp. + */ +static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int cntr_id; + + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); + + /* If there is no cntr_id assigned, nothing to do */ + if (cntr_id < 0) + return; + + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, false); + + mbm_cntr_free(d, cntr_id); +} + +/* + * rdtgroup_unassign_cntr_event() - Unassign a hardware counter associated with + * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign + * the counters from all the domains if @d is NULL else unassign from @d. + */ +static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + + if (!d) { + list_for_each_entry(d, &r->mon_domains, hdr.list) + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); + } else { + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); + } +} + +/* + * rdtgroup_unassign_cntrs() - Unassign the counters associated with MBM events. + * Called when a group is deleted. 
*/ -static void l3_mon_evt_init(struct rdt_resource *r) +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp) { - INIT_LIST_HEAD(&r->evt_list); + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - if (resctrl_arch_is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) + return; + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + rdtgroup_unassign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + rdtgroup_unassign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); +} + +static int resctrl_parse_mem_transactions(char *tok, u32 *val) +{ + u32 temp_val = 0; + char *evt_str; + bool found; + int i; + +next_config: + if (!tok || tok[0] == '\0') { + *val = temp_val; + return 0; + } + + /* Start processing the strings for each memory transaction type */ + evt_str = strim(strsep(&tok, ",")); + found = false; + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (!strcmp(mbm_transactions[i].name, evt_str)) { + temp_val |= mbm_transactions[i].val; + found = true; + break; + } + } + + if (!found) { + rdt_last_cmd_printf("Invalid memory transaction type %s\n", evt_str); + return -EINVAL; + } + + goto next_config; +} + +/* + * rdtgroup_update_cntr_event - Update the counter assignments for the event + * in a group. + * @r: Resource to which update needs to be done. + * @rdtgrp: Resctrl group. + * @evtid: MBM monitor event. + */ +static void rdtgroup_update_cntr_event(struct rdt_resource *r, struct rdtgroup *rdtgrp, + enum resctrl_event_id evtid) +{ + struct rdt_mon_domain *d; + int cntr_id; + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + cntr_id = mbm_cntr_get(r, d, rdtgrp, evtid); + if (cntr_id >= 0) + rdtgroup_assign_cntr(r, d, evtid, rdtgrp->mon.rmid, + rdtgrp->closid, cntr_id, true); + } +} + +/* + * resctrl_update_cntr_allrdtgrp - Update the counter assignments for the event + * for all the groups. + * @mevt: MBM monitor event. + */ +static void resctrl_update_cntr_allrdtgrp(struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + struct rdtgroup *prgrp, *crgrp; + + /* + * Find all the groups where the event is assigned and update the + * configuration of existing assignments.
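Note that resctrl_parse_mem_transactions() consumes its input with strsep(), so it must be handed writable memory, never a string literal. Illustrative call (the surrounding caller is hypothetical):

	char filter[] = "local_reads,remote_reads";	/* mutable, strsep() writes NULs */
	u32 cfg = 0;

	if (!resctrl_parse_mem_transactions(filter, &cfg)) {
		/* cfg == (READS_TO_LOCAL_MEM | READS_TO_REMOTE_MEM) */
	}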
+	 */
+	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+		rdtgroup_update_cntr_event(r, prgrp, mevt->evtid);
+
+		list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
+			rdtgroup_update_cntr_event(r, crgrp, mevt->evtid);
+	}
+}
+
+ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+			   loff_t off)
+{
+	struct mon_evt *mevt = rdt_kn_parent_priv(of->kn);
+	struct rdt_resource *r;
+	u32 evt_cfg = 0;
+	int ret = 0;
+
+	/* Valid input requires a trailing newline */
+	if (nbytes == 0 || buf[nbytes - 1] != '\n')
+		return -EINVAL;
+
+	buf[nbytes - 1] = '\0';
+
+	cpus_read_lock();
+	mutex_lock(&rdtgroup_mutex);
+
+	rdt_last_cmd_clear();
+
+	r = resctrl_arch_get_resource(mevt->rid);
+	if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+		rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	ret = resctrl_parse_mem_transactions(buf, &evt_cfg);
+	if (!ret && mevt->evt_cfg != evt_cfg) {
+		mevt->evt_cfg = evt_cfg;
+		resctrl_update_cntr_allrdtgrp(mevt);
+	}
+
+out_unlock:
+	mutex_unlock(&rdtgroup_mutex);
+	cpus_read_unlock();
+
+	return ret ?: nbytes;
+}
+
+int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of,
+				 struct seq_file *s, void *v)
+{
+	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+	bool enabled;
+
+	mutex_lock(&rdtgroup_mutex);
+	enabled = resctrl_arch_mbm_cntr_assign_enabled(r);
+
+	if (r->mon.mbm_cntr_assignable) {
+		if (enabled)
+			seq_puts(s, "[mbm_event]\n");
+		else
+			seq_puts(s, "[default]\n");
+
+		if (!IS_ENABLED(CONFIG_RESCTRL_ASSIGN_FIXED)) {
+			if (enabled)
+				seq_puts(s, "default\n");
+			else
+				seq_puts(s, "mbm_event\n");
+		}
+	} else {
+		seq_puts(s, "[default]\n");
+	}
+
+	mutex_unlock(&rdtgroup_mutex);
+
+	return 0;
+}
+
+ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf,
+				      size_t nbytes, loff_t off)
+{
+	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+	struct rdt_mon_domain *d;
+	int ret = 0;
+	bool enable;
+
+	/* Valid input requires a trailing newline */
+	if (nbytes == 0 || buf[nbytes - 1] != '\n')
+		return -EINVAL;
+
+	buf[nbytes - 1] = '\0';
+
+	cpus_read_lock();
+	mutex_lock(&rdtgroup_mutex);
+
+	rdt_last_cmd_clear();
+
+	if (!strcmp(buf, "default")) {
+		enable = false;
+	} else if (!strcmp(buf, "mbm_event")) {
+		if (r->mon.mbm_cntr_assignable) {
+			enable = true;
+		} else {
+			ret = -EINVAL;
+			rdt_last_cmd_puts("mbm_event mode is not supported\n");
+			goto out_unlock;
+		}
+	} else {
+		ret = -EINVAL;
+		rdt_last_cmd_puts("Unsupported assign mode\n");
+		goto out_unlock;
+	}
+
+	if (enable != resctrl_arch_mbm_cntr_assign_enabled(r)) {
+		ret = resctrl_arch_mbm_cntr_assign_set(r, enable);
+		if (ret)
+			goto out_unlock;
+
+		/* Update the visibility of BMEC related files */
+		resctrl_bmec_files_show(r, NULL, !enable);
+
+		/*
+		 * Initialize the default memory transaction values for
+		 * total and local events.
+		 */
+		if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+			mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
+		if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+			mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
+									   (READS_TO_LOCAL_MEM |
+									    READS_TO_LOCAL_S_MEM |
+									    NON_TEMP_WRITE_TO_LOCAL_MEM);
+		/* Enable auto assignment when switching to "mbm_event" mode */
+		if (enable)
+			r->mon.mbm_assign_on_mkdir = true;
+		/*
+		 * Reset all the non-architectural RMID state and assignable counters.
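+		 * No counter assignment from the previous mode survives the
+		 * switch.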
+ */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + mbm_cntr_free_all(r, d); + resctrl_reset_rmid_all(r, d); + } + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *dom; + bool sep = false; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + seq_printf(s, "%d=%d", dom->hdr.id, r->mon.num_mbm_cntrs); + sep = true; + } + seq_putc(s, '\n'); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return 0; +} + +int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *dom; + bool sep = false; + u32 cntrs, i; + int ret = 0; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + cntrs = 0; + for (i = 0; i < r->mon.num_mbm_cntrs; i++) { + if (!dom->cntr_cfg[i].rdtgrp) + cntrs++; + } + + seq_printf(s, "%d=%u", dom->hdr.id, cntrs); + sep = true; + } + seq_putc(s, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret; +} + +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_mon_domain *d; + struct rdtgroup *rdtgrp; + struct mon_evt *mevt; + int ret = 0; + bool sep; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out_unlock; + } + + rdt_last_cmd_clear(); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid)) + continue; + + sep = false; + seq_printf(s, "%s:", mevt->name); + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + if (mbm_cntr_get(r, d, rdtgrp, mevt->evtid) < 0) + seq_printf(s, "%d=_", d->hdr.id); + else + seq_printf(s, "%d=e", d->hdr.id); + + sep = true; + } + seq_putc(s, '\n'); + } + +out_unlock: + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* + * mbm_get_mon_event_by_name() - Return the mon_evt entry for the matching + * event name. 
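+ * Returns NULL if @r has no enabled MBM event with that name.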
+ */
+static struct mon_evt *mbm_get_mon_event_by_name(struct rdt_resource *r, char *name)
+{
+	struct mon_evt *mevt;
+
+	for_each_mon_event(mevt) {
+		if (mevt->rid == r->rid && mevt->enabled &&
+		    resctrl_is_mbm_event(mevt->evtid) &&
+		    !strcmp(mevt->name, name))
+			return mevt;
+	}
+
+	return NULL;
+}
+
+static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d,
+					struct rdtgroup *rdtgrp, struct mon_evt *mevt)
+{
+	int ret = 0;
+
+	if (!assign || strlen(assign) != 1)
+		return -EINVAL;
+
+	switch (*assign) {
+	case 'e':
+		ret = rdtgroup_assign_cntr_event(d, rdtgrp, mevt);
+		break;
+	case '_':
+		rdtgroup_unassign_cntr_event(d, rdtgrp, mevt);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup *rdtgrp,
+					char *event, char *tok)
+{
+	struct rdt_mon_domain *d;
+	unsigned long dom_id = 0;
+	char *dom_str, *id_str;
+	struct mon_evt *mevt;
+	int ret;
+
+	mevt = mbm_get_mon_event_by_name(r, event);
+	if (!mevt) {
+		rdt_last_cmd_printf("Invalid event %s\n", event);
+		return -ENOENT;
+	}
+
+next:
+	if (!tok || tok[0] == '\0')
+		return 0;
+
+	/* Start processing the strings for each domain */
+	dom_str = strim(strsep(&tok, ";"));
+
+	id_str = strsep(&dom_str, "=");
+
+	/* Check for domain id '*', which means all domains */
+	if (id_str && *id_str == '*') {
+		ret = rdtgroup_modify_assign_state(dom_str, NULL, rdtgrp, mevt);
+		if (ret)
+			rdt_last_cmd_printf("Assign operation '%s:*=%s' failed\n",
+					    event, dom_str);
+		return ret;
+	} else if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
+		rdt_last_cmd_puts("Missing domain id\n");
+		return -EINVAL;
+	}
+
+	/* Verify that the dom_id is valid */
+	list_for_each_entry(d, &r->mon_domains, hdr.list) {
+		if (d->hdr.id == dom_id) {
+			ret = rdtgroup_modify_assign_state(dom_str, d, rdtgrp, mevt);
+			if (ret) {
+				rdt_last_cmd_printf("Assign operation '%s:%ld=%s' failed\n",
+						    event, dom_id, dom_str);
+				return ret;
+			}
+			goto next;
+		}
+	}
+
+	rdt_last_cmd_printf("Invalid domain id %ld\n", dom_id);
+	return -EINVAL;
+}
+
+ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf,
+				 size_t nbytes, loff_t off)
+{
+	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+	struct rdtgroup *rdtgrp;
+	char *token, *event;
+	int ret = 0;
+
+	/* Valid input requires a trailing newline */
+	if (nbytes == 0 || buf[nbytes - 1] != '\n')
+		return -EINVAL;
+
+	buf[nbytes - 1] = '\0';
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		rdtgroup_kn_unlock(of->kn);
+		return -ENOENT;
+	}
+	rdt_last_cmd_clear();
+
+	if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+		rdt_last_cmd_puts("mbm_event mode is not enabled\n");
+		rdtgroup_kn_unlock(of->kn);
+		return -EINVAL;
+	}
+
+	while ((token = strsep(&buf, "\n")) != NULL) {
+		/*
+		 * Each write command has the following format:
+		 * "<Event>:<Domain ID>=<Assignment state>"
+		 * Extract the event name first.
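+		 * For example, "mbm_total_bytes:0=e;1=_" requests a counter
+		 * for the total event in domain 0 and releases the one held
+		 * in domain 1, while "mbm_local_bytes:*=e" assigns counters
+		 * in every domain.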
+ */ + event = strsep(&token, ":"); + + ret = resctrl_parse_mbm_assignment(r, rdtgrp, event, token); + if (ret) + break; + } + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; } /** @@ -902,24 +1765,43 @@ int resctrl_mon_resource_init(void) if (ret) return ret; - l3_mon_evt_init(r); - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { - mbm_total_event.configurable = true; + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { - mbm_local_event.configurable = true; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_local_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else if (resctrl_arch_is_mbm_total_enabled()) + else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; + if (r->mon.mbm_cntr_assignable) { + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); + r->mon.mbm_assign_on_mkdir = true; + resctrl_file_fflags_init("num_mbm_cntrs", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("available_mbm_cntrs", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); + resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO | + RFTYPE_RES_CACHE); + resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE); + } + return 0; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 77d08229d855..0320360cd7a6 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -123,14 +123,8 @@ void rdt_staged_configs_clear(void) static bool resctrl_is_mbm_enabled(void) { - return (resctrl_arch_is_mbm_total_enabled() || - resctrl_arch_is_mbm_local_enabled()); -} - -static bool resctrl_is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); + return (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID) || + resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)); } /* @@ -196,7 +190,7 @@ static int closid_alloc(void) lockdep_assert_held(&rdtgroup_mutex); if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && - resctrl_arch_is_llc_occupancy_enabled()) { + resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { cleanest_closid = resctrl_find_cleanest_closid(); if (cleanest_closid < 0) return cleanest_closid; @@ -981,7 +975,7 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of, return 0; } -static void *rdt_kn_parent_priv(struct kernfs_node *kn) +void *rdt_kn_parent_priv(struct kernfs_node *kn) { /* * The parent pointer is only valid within RCU section since it can be @@ -1141,7 +1135,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of, { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - seq_printf(seq, "%d\n", r->num_rmid); + seq_printf(seq, "%d\n", r->mon.num_rmid); return 0; } 
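The num_mbm_cntrs and available_mbm_cntrs files above emit one "<domain id>=<value>" pair per monitoring domain, separated by ';' and terminated by a newline. A minimal user-space sketch that parses this format; the resctrl mount point and file name below are assumptions for illustration, not part of this patch:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* Assumed conventional resctrl mount point. */
		const char *path = "/sys/fs/resctrl/info/L3_MON/available_mbm_cntrs";
		char line[256], *p, *tok;
		unsigned int id, cntrs;
		FILE *f = fopen(path, "r");

		if (!f || !fgets(line, sizeof(line), f))
			return 1;

		/* One "<domain id>=<count>" token per ';'-separated field. */
		p = line;
		while ((tok = strsep(&p, ";\n")) && *tok) {
			if (sscanf(tok, "%u=%u", &id, &cntrs) == 2)
				printf("domain %u: %u counters free\n", id, cntrs);
		}
		fclose(f);
		return 0;
	}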
@@ -1152,9 +1146,12 @@ static int rdt_mon_features_show(struct kernfs_open_file *of,
 	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
 	struct mon_evt *mevt;
 
-	list_for_each_entry(mevt, &r->evt_list, list) {
+	for_each_mon_event(mevt) {
+		if (mevt->rid != r->rid || !mevt->enabled)
+			continue;
 		seq_printf(seq, "%s\n", mevt->name);
-		if (mevt->configurable)
+		if (mevt->configurable &&
+		    !resctrl_arch_mbm_cntr_assign_enabled(r))
 			seq_printf(seq, "%s_config\n", mevt->name);
 	}
 
@@ -1735,9 +1732,9 @@ next:
 	}
 
 	/* Value from user cannot be more than the supported set of events */
-	if ((val & r->mbm_cfg_mask) != val) {
+	if ((val & r->mon.mbm_cfg_mask) != val) {
 		rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
-				    r->mbm_cfg_mask);
+				    r->mon.mbm_cfg_mask);
 		return -EINVAL;
 	}
 
@@ -1803,6 +1800,44 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
 	return ret ?: nbytes;
 }
 
+/*
+ * resctrl_bmec_files_show() - Controls the visibility of BMEC-related resctrl
+ * files. When @show is true, the files are displayed; when false, the files
+ * are hidden.
+ * Don't treat kernfs_find_and_get failure as an error, since this function may
+ * be called regardless of whether BMEC is supported or the event is enabled.
+ */
+void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn,
+			     bool show)
+{
+	struct kernfs_node *kn_config, *mon_kn = NULL;
+	char name[32];
+
+	if (!l3_mon_kn) {
+		sprintf(name, "%s_MON", r->name);
+		mon_kn = kernfs_find_and_get(kn_info, name);
+		if (!mon_kn)
+			return;
+		l3_mon_kn = mon_kn;
+	}
+
+	kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_total_bytes_config");
+	if (kn_config) {
+		kernfs_show(kn_config, show);
+		kernfs_put(kn_config);
+	}
+
+	kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_local_bytes_config");
+	if (kn_config) {
+		kernfs_show(kn_config, show);
+		kernfs_put(kn_config);
+	}
+
+	/* Release the reference only if it was acquired */
+	if (mon_kn)
+		kernfs_put(mon_kn);
+}
+
 /* rdtgroup information files for one cache resource.
*/ static struct rftype res_common_files[] = { { @@ -1813,6 +1848,13 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_TOP_INFO, }, { + .name = "mbm_assign_on_mkdir", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_on_mkdir_show, + .write = resctrl_mbm_assign_on_mkdir_write, + }, + { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, @@ -1827,6 +1869,12 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_MON_INFO, }, { + .name = "available_mbm_cntrs", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_available_mbm_cntrs_show, + }, + { .name = "num_rmids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, @@ -1841,6 +1889,12 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { + .name = "num_mbm_cntrs", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_num_mbm_cntrs_show, + }, + { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, @@ -1916,6 +1970,28 @@ static struct rftype res_common_files[] = { .write = mbm_local_bytes_config_write, }, { + .name = "event_filter", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = event_filter_show, + .write = event_filter_write, + }, + { + .name = "mbm_L3_assignments", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_L3_assignments_show, + .write = mbm_L3_assignments_write, + }, + { + .name = "mbm_assign_mode", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_mode_show, + .write = resctrl_mbm_assign_mode_write, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, + }, + { .name = "cpus", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, @@ -2168,10 +2244,48 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, return ret; } +static int resctrl_mkdir_event_configs(struct rdt_resource *r, struct kernfs_node *l3_mon_kn) +{ + struct kernfs_node *kn_subdir, *kn_subdir2; + struct mon_evt *mevt; + int ret; + + kn_subdir = kernfs_create_dir(l3_mon_kn, "event_configs", l3_mon_kn->mode, NULL); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid)) + continue; + + kn_subdir2 = kernfs_create_dir(kn_subdir, mevt->name, kn_subdir->mode, mevt); + if (IS_ERR(kn_subdir2)) { + ret = PTR_ERR(kn_subdir2); + goto out; + } + + ret = rdtgroup_kn_set_ugid(kn_subdir2); + if (ret) + goto out; + + ret = rdtgroup_add_files(kn_subdir2, RFTYPE_ASSIGN_CONFIG); + if (ret) + break; + } + +out: + return ret; +} + static int rdtgroup_mkdir_info_resdir(void *priv, char *name, unsigned long fflags) { struct kernfs_node *kn_subdir; + struct rdt_resource *r; int ret; kn_subdir = kernfs_create_dir(kn_info, name, @@ -2184,8 +2298,25 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name, return ret; ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); + if (ret) + return ret; + + if ((fflags & RFTYPE_MON_INFO) == RFTYPE_MON_INFO) { + r = priv; + if (r->mon.mbm_cntr_assignable) { + ret = resctrl_mkdir_event_configs(r, kn_subdir); + if (ret) + return ret; + /* + * Hide BMEC related files if mbm_event mode + * is enabled. 
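+			 * The BMEC event configuration interface and
+			 * mbm_event counter assignment are mutually
+			 * exclusive.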
+ */ + if (resctrl_arch_mbm_cntr_assign_enabled(r)) + resctrl_bmec_files_show(r, kn_subdir, false); + } + } + + kernfs_activate(kn_subdir); return ret; } @@ -2608,10 +2739,8 @@ static int rdt_get_tree(struct fs_context *fc) goto out_root; ret = schemata_list_create(); - if (ret) { - schemata_list_destroy(); - goto out_ctx; - } + if (ret) + goto out_schemata_free; ret = closid_init(); if (ret) @@ -2637,6 +2766,8 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_info; + rdtgroup_assign_cntrs(&rdtgroup_default); + ret = mkdir_mondata_all(rdtgroup_default.kn, &rdtgroup_default, &kn_mondata); if (ret < 0) @@ -2675,15 +2806,16 @@ out_mondata: if (resctrl_arch_mon_capable()) kernfs_remove(kn_mondata); out_mongrp: - if (resctrl_arch_mon_capable()) + if (resctrl_arch_mon_capable()) { + rdtgroup_unassign_cntrs(&rdtgroup_default); kernfs_remove(kn_mongrp); + } out_info: kernfs_remove(kn_info); out_closid_exit: closid_exit(); out_schemata_free: schemata_list_destroy(); -out_ctx: rdt_disable_ctx(); out_root: rdtgroup_destroy_root(); @@ -2822,6 +2954,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) head = &rdtgrp->mon.crdtgrp_list; list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + rdtgroup_unassign_cntrs(sentry); free_rmid(sentry->closid, sentry->mon.rmid); list_del(&sentry->mon.crdtgrp_list); @@ -2862,6 +2995,8 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); kernfs_remove(rdtgrp->kn); @@ -2946,6 +3081,7 @@ static void resctrl_fs_teardown(void) return; rmdir_all_sub(); + rdtgroup_unassign_cntrs(&rdtgroup_default); mon_put_kn_priv(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; @@ -3057,10 +3193,9 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, struct mon_evt *mevt; int ret, domid; - if (WARN_ON(list_empty(&r->evt_list))) - return -EPERM; - - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; domid = do_sum ? 
d->ci_id : d->hdr.id; priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); if (WARN_ON_ONCE(!priv)) @@ -3427,9 +3562,12 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) } rdtgrp->mon.rmid = ret; + rdtgroup_assign_cntrs(rdtgrp); + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); if (ret) { rdt_last_cmd_puts("kernfs subdir error\n"); + rdtgroup_unassign_cntrs(rdtgrp); free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); return ret; } @@ -3439,8 +3577,10 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) { - if (resctrl_arch_mon_capable()) + if (resctrl_arch_mon_capable()) { + rdtgroup_unassign_cntrs(rgrp); free_rmid(rgrp->closid, rgrp->mon.rmid); + } } /* @@ -3716,6 +3856,9 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; + + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); /* @@ -3763,6 +3906,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); update_closid_rmid(tmpmask, NULL); + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); closid_free(rdtgrp->closid); @@ -4022,9 +4167,14 @@ static void rdtgroup_setup_default(void) static void domain_destroy_mon_state(struct rdt_mon_domain *d) { + int idx; + + kfree(d->cntr_cfg); bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } } void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -4050,7 +4200,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. 
There is no way to know @@ -4084,32 +4234,41 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; + size_t tsize = sizeof(*d->mbm_states[0]); + enum resctrl_event_id eventid; + int idx; - if (resctrl_arch_is_llc_occupancy_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } - if (resctrl_arch_is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + d->mbm_states[idx] = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_states[idx]) + goto cleanup; } - if (resctrl_arch_is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } + + if (resctrl_is_mbm_enabled() && r->mon.mbm_cntr_assignable) { + tsize = sizeof(*d->cntr_cfg); + d->cntr_cfg = kcalloc(r->mon.num_mbm_cntrs, tsize, GFP_KERNEL); + if (!d->cntr_cfg) + goto cleanup; } return 0; +cleanup: + bitmap_free(d->rmid_busy_llc); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } + + return -ENOMEM; } int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -4144,7 +4303,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) RESCTRL_PICK_ANY_CPU); } - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); /* @@ -4219,7 +4378,7 @@ void resctrl_offline_cpu(unsigned int cpu) cancel_delayed_work(&d->mbm_over); mbm_setup_overflow_handler(d, 0, cpu); } - if (resctrl_arch_is_llc_occupancy_enabled() && + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && cpu == d->cqm_work_cpu && has_busy_rmid(d)) { cancel_delayed_work(&d->cqm_limbo); cqm_setup_limbo_handler(d, 0, cpu); diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index beb4f18f05ef..35c4d27d2cc0 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -24,6 +24,7 @@ #endif #ifdef CONFIG_CIFS_SMB_DIRECT #include "smbdirect.h" +#include "../common/smbdirect/smbdirect_pdu.h" #endif #include "cifs_swn.h" #include "cached_dir.h" @@ -304,6 +305,8 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v) list_for_each(tmp1, &ses->tcon_list) { tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); cfids = tcon->cfids; + if (!cfids) + continue; spin_lock(&cfids->cfid_list_lock); /* check lock ordering */ seq_printf(m, "Num entries: %d\n", cfids->num_entries); list_for_each_entry(cfid, &cfids->entries, entry) { @@ -319,8 +322,6 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v) seq_printf(m, "\n"); } spin_unlock(&cfids->cfid_list_lock); - - } } } @@ -347,6 +348,22 @@ static __always_inline const char *compression_alg_str(__le16 alg) } } +static __always_inline const char *cipher_alg_str(__le16 cipher) +{ + switch (cipher) { + case SMB2_ENCRYPTION_AES128_CCM: + return "AES128-CCM"; + case SMB2_ENCRYPTION_AES128_GCM: + return 
"AES128-GCM"; + case SMB2_ENCRYPTION_AES256_CCM: + return "AES256-CCM"; + case SMB2_ENCRYPTION_AES256_GCM: + return "AES256-GCM"; + default: + return "UNKNOWN"; + } +} + static int cifs_debug_data_proc_show(struct seq_file *m, void *v) { struct mid_q_entry *mid_entry; @@ -440,57 +457,55 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) sc = &server->smbd_conn->socket; sp = &sc->parameters; - seq_printf(m, "\nSMBDirect (in hex) protocol version: %x " - "transport status: %x", - server->smbd_conn->protocol, - server->smbd_conn->socket.status); - seq_printf(m, "\nConn receive_credit_max: %x " - "send_credit_target: %x max_send_size: %x", + seq_printf(m, "\nSMBDirect protocol version: 0x%x " + "transport status: %s (%u)", + SMBDIRECT_V1, + smbdirect_socket_status_string(sc->status), + sc->status); + seq_printf(m, "\nConn receive_credit_max: %u " + "send_credit_target: %u max_send_size: %u", sp->recv_credit_max, sp->send_credit_target, sp->max_send_size); - seq_printf(m, "\nConn max_fragmented_recv_size: %x " - "max_fragmented_send_size: %x max_receive_size:%x", + seq_printf(m, "\nConn max_fragmented_recv_size: %u " + "max_fragmented_send_size: %u max_receive_size:%u", sp->max_fragmented_recv_size, sp->max_fragmented_send_size, sp->max_recv_size); - seq_printf(m, "\nConn keep_alive_interval: %x " - "max_readwrite_size: %x rdma_readwrite_threshold: %x", + seq_printf(m, "\nConn keep_alive_interval: %u " + "max_readwrite_size: %u rdma_readwrite_threshold: %u", sp->keepalive_interval_msec * 1000, sp->max_read_write_size, - server->smbd_conn->rdma_readwrite_threshold); - seq_printf(m, "\nDebug count_get_receive_buffer: %x " - "count_put_receive_buffer: %x count_send_empty: %x", - server->smbd_conn->count_get_receive_buffer, - server->smbd_conn->count_put_receive_buffer, - server->smbd_conn->count_send_empty); - seq_printf(m, "\nRead Queue count_reassembly_queue: %x " - "count_enqueue_reassembly_queue: %x " - "count_dequeue_reassembly_queue: %x " - "reassembly_data_length: %x " - "reassembly_queue_length: %x", - server->smbd_conn->count_reassembly_queue, - server->smbd_conn->count_enqueue_reassembly_queue, - server->smbd_conn->count_dequeue_reassembly_queue, + server->rdma_readwrite_threshold); + seq_printf(m, "\nDebug count_get_receive_buffer: %llu " + "count_put_receive_buffer: %llu count_send_empty: %llu", + sc->statistics.get_receive_buffer, + sc->statistics.put_receive_buffer, + sc->statistics.send_empty); + seq_printf(m, "\nRead Queue " + "count_enqueue_reassembly_queue: %llu " + "count_dequeue_reassembly_queue: %llu " + "reassembly_data_length: %u " + "reassembly_queue_length: %u", + sc->statistics.enqueue_reassembly_queue, + sc->statistics.dequeue_reassembly_queue, sc->recv_io.reassembly.data_length, sc->recv_io.reassembly.queue_length); - seq_printf(m, "\nCurrent Credits send_credits: %x " - "receive_credits: %x receive_credit_target: %x", - atomic_read(&server->smbd_conn->send_credits), - atomic_read(&server->smbd_conn->receive_credits), - server->smbd_conn->receive_credit_target); - seq_printf(m, "\nPending send_pending: %x ", - atomic_read(&server->smbd_conn->send_pending)); - seq_printf(m, "\nReceive buffers count_receive_queue: %x ", - server->smbd_conn->count_receive_queue); - seq_printf(m, "\nMR responder_resources: %x " - "max_frmr_depth: %x mr_type: %x", - server->smbd_conn->responder_resources, - server->smbd_conn->max_frmr_depth, - server->smbd_conn->mr_type); - seq_printf(m, "\nMR mr_ready_count: %x mr_used_count: %x", - 
atomic_read(&server->smbd_conn->mr_ready_count), - atomic_read(&server->smbd_conn->mr_used_count)); + seq_printf(m, "\nCurrent Credits send_credits: %u " + "receive_credits: %u receive_credit_target: %u", + atomic_read(&sc->send_io.credits.count), + atomic_read(&sc->recv_io.credits.count), + sc->recv_io.credits.target); + seq_printf(m, "\nPending send_pending: %u ", + atomic_read(&sc->send_io.pending.count)); + seq_printf(m, "\nMR responder_resources: %u " + "max_frmr_depth: %u mr_type: 0x%x", + sp->responder_resources, + sp->max_frmr_depth, + sc->mr_io.type); + seq_printf(m, "\nMR mr_ready_count: %u mr_used_count: %u", + atomic_read(&sc->mr_io.ready.count), + atomic_read(&sc->mr_io.used.count)); skip_rdma: #endif seq_printf(m, "\nNumber of credits: %d,%d,%d Dialect 0x%x", @@ -539,6 +554,11 @@ skip_rdma: else seq_puts(m, "disabled (not supported by this server)"); + /* Show negotiated encryption cipher, even if not required */ + seq_puts(m, "\nEncryption: "); + if (server->cipher_type) + seq_printf(m, "Negotiated cipher (%s)", cipher_alg_str(server->cipher_type)); + seq_printf(m, "\n\n\tSessions: "); i = 0; list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { @@ -576,12 +596,8 @@ skip_rdma: /* dump session id helpful for use with network trace */ seq_printf(m, " SessionId: 0x%llx", ses->Suid); - if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) { + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) seq_puts(m, " encrypted"); - /* can help in debugging to show encryption type */ - if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM) - seq_puts(m, "(gcm256)"); - } if (ses->sign) seq_puts(m, " signed"); diff --git a/fs/smb/client/cifs_unicode.c b/fs/smb/client/cifs_unicode.c index 4cc6e0896fad..f8659d36793f 100644 --- a/fs/smb/client/cifs_unicode.c +++ b/fs/smb/client/cifs_unicode.c @@ -629,6 +629,9 @@ cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len, int len; __le16 *dst; + if (!src) + return NULL; + len = cifs_local_to_utf16_bytes(src, maxlen, cp); len += 2; /* NULL */ dst = kmalloc(len, GFP_KERNEL); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index e1848276bab4..dcb39d1b5958 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -857,7 +857,7 @@ static int cifs_drop_inode(struct inode *inode) /* no serverino => unconditional eviction */ return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) || - generic_drop_inode(inode); + inode_generic_drop(inode); } static const struct super_operations cifs_super_ops = { @@ -1895,7 +1895,9 @@ init_cifs(void) cifs_dbg(VFS, "dir_cache_timeout set to max of 65000 seconds\n"); } - cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + cifsiod_wq = alloc_workqueue("cifsiod", + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!cifsiod_wq) { rc = -ENOMEM; goto out_clean_proc; @@ -1923,28 +1925,32 @@ init_cifs(void) } cifsoplockd_wq = alloc_workqueue("cifsoplockd", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!cifsoplockd_wq) { rc = -ENOMEM; goto out_destroy_fileinfo_put_wq; } deferredclose_wq = alloc_workqueue("deferredclose", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!deferredclose_wq) { rc = -ENOMEM; goto out_destroy_cifsoplockd_wq; } serverclose_wq = alloc_workqueue("serverclose", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!serverclose_wq) { rc = -ENOMEM; goto out_destroy_deferredclose_wq; } cfid_put_wq = 
alloc_workqueue("cfid_put_wq", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!cfid_put_wq) { rc = -ENOMEM; goto out_destroy_serverclose_wq; diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 1e64a4fb6af0..3ac254e123dc 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -87,7 +87,7 @@ #define SMB_INTERFACE_POLL_INTERVAL 600 /* maximum number of PDUs in one compound */ -#define MAX_COMPOUND 7 +#define MAX_COMPOUND 10 /* * Default number of credits to keep available for SMB3. @@ -814,6 +814,13 @@ struct TCP_Server_Info { unsigned int max_read; unsigned int max_write; unsigned int min_offload; + /* + * If payload is less than or equal to the threshold, + * use RDMA send/recv to send upper layer I/O. + * If payload is more than the threshold, + * use RDMA read/write through memory registration for I/O. + */ + unsigned int rdma_readwrite_threshold; unsigned int retrans; struct { bool requested; /* "compress" mount option set*/ @@ -1540,7 +1547,7 @@ struct cifs_io_subrequest { struct kvec iov[2]; struct TCP_Server_Info *server; #ifdef CONFIG_CIFS_SMB_DIRECT - struct smbd_mr *mr; + struct smbdirect_mr_io *mr; #endif struct cifs_credits credits; }; @@ -1882,9 +1889,12 @@ static inline bool is_replayable_error(int error) /* cifs_get_writable_file() flags */ -#define FIND_WR_ANY 0 -#define FIND_WR_FSUID_ONLY 1 -#define FIND_WR_WITH_DELETE 2 +enum cifs_writable_file_flags { + FIND_WR_ANY = 0U, + FIND_WR_FSUID_ONLY = (1U << 0), + FIND_WR_WITH_DELETE = (1U << 1), + FIND_WR_NO_PENDING_DELETE = (1U << 2), +}; #define MID_FREE 0 #define MID_REQUEST_ALLOCATED 1 @@ -2343,6 +2353,8 @@ struct smb2_compound_vars { struct kvec qi_iov; struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec unlink_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec rename_iov[SMB2_SET_INFO_IOV_SIZE]; struct kvec close_iov; struct smb2_file_rename_info_hdr rename_info; struct smb2_file_link_info_hdr link_info; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index c34c533b2efa..e8fba98690ce 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -312,8 +312,8 @@ extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode); extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon); -extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon, - const char *path); +void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon, + struct dentry *dentry); extern void cifs_mark_open_handles_for_deleted_file(struct inode *inode, const char *path); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 186e061068be..a5ed742afa00 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -97,8 +97,12 @@ retry: cifs_trace_rw_credits_write_prepare); #ifdef CONFIG_CIFS_SMB_DIRECT - if (server->smbd_conn) - stream->sreq_max_segs = server->smbd_conn->max_frmr_depth; + if (server->smbd_conn) { + const struct smbdirect_socket_parameters *sp = + smbd_get_parameters(server->smbd_conn); + + stream->sreq_max_segs = sp->max_frmr_depth; + } #endif } @@ -187,8 +191,12 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) cifs_trace_rw_credits_read_submit); #ifdef CONFIG_CIFS_SMB_DIRECT - if (server->smbd_conn) - rreq->io_streams[0].sreq_max_segs = server->smbd_conn->max_frmr_depth; + if (server->smbd_conn) { + const struct smbdirect_socket_parameters *sp = + smbd_get_parameters(server->smbd_conn); + + 
rreq->io_streams[0].sreq_max_segs = sp->max_frmr_depth; + } #endif return 0; } @@ -998,7 +1006,10 @@ int cifs_open(struct inode *inode, struct file *file) /* Get the cached handle as SMB2 close is deferred */ if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) { - rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile); + rc = cifs_get_writable_path(tcon, full_path, + FIND_WR_FSUID_ONLY | + FIND_WR_NO_PENDING_DELETE, + &cfile); } else { rc = cifs_get_readable_path(tcon, full_path, &cfile); } @@ -2530,6 +2541,9 @@ refind_writable: continue; if (with_delete && !(open_file->fid.access & DELETE)) continue; + if ((flags & FIND_WR_NO_PENDING_DELETE) && + open_file->status_file_deleted) + continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { /* found a good writable file */ @@ -2647,6 +2661,16 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, spin_unlock(&tcon->open_file_lock); free_dentry_path(page); *ret_file = find_readable_file(cinode, 0); + if (*ret_file) { + spin_lock(&cinode->open_file_lock); + if ((*ret_file)->status_file_deleted) { + spin_unlock(&cinode->open_file_lock); + cifsFileInfo_put(*ret_file); + *ret_file = NULL; + } else { + spin_unlock(&cinode->open_file_lock); + } + } return *ret_file ? 0 : -ENOENT; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index fe453a4b3dc8..7e9784080501 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1931,7 +1931,7 @@ cifs_drop_nlink(struct inode *inode) * but will return the EACCES to the caller. Note that the VFS does not call * unlink on negative dentries currently. */ -int cifs_unlink(struct inode *dir, struct dentry *dentry) +static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyrename) { int rc = 0; unsigned int xid; @@ -1984,7 +1984,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) } netfs_wait_for_outstanding_io(inode); - cifs_close_deferred_file_under_dentry(tcon, full_path); + cifs_close_deferred_file_under_dentry(tcon, dentry); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { @@ -2003,7 +2003,24 @@ retry_std_delete: goto psx_del_no_retry; } - rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry); + /* For SMB2+, if the file is open, we always perform a silly rename. + * + * We check for d_count() right after calling + * cifs_close_deferred_file_under_dentry() to make sure that the + * dentry's refcount gets dropped in case the file had any deferred + * close. 
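+	 * The rename itself is carried out by the ->rename_pending_delete()
+	 * operation (see smb2_rename_pending_delete()).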
+ */ + if (!sillyrename && server->vals->protocol_id > SMB10_PROT_ID) { + spin_lock(&dentry->d_lock); + if (d_count(dentry) > 1) + sillyrename = true; + spin_unlock(&dentry->d_lock); + } + + if (sillyrename) + rc = -EBUSY; + else + rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry); psx_del_no_retry: if (!rc) { @@ -2071,6 +2088,11 @@ unlink_out: return rc; } +int cifs_unlink(struct inode *dir, struct dentry *dentry) +{ + return __cifs_unlink(dir, dentry, false); +} + static int cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, const char *full_path, struct cifs_sb_info *cifs_sb, @@ -2358,14 +2380,16 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb); cifs_put_tlink(tlink); + cifsInode = CIFS_I(d_inode(direntry)); + if (!rc) { + set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags); spin_lock(&d_inode(direntry)->i_lock); i_size_write(d_inode(direntry), 0); clear_nlink(d_inode(direntry)); spin_unlock(&d_inode(direntry)->i_lock); } - cifsInode = CIFS_I(d_inode(direntry)); /* force revalidate to go get info when needed */ cifsInode->time = 0; @@ -2458,8 +2482,11 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ do_rename_exit: - if (rc == 0) + if (rc == 0) { d_move(from_dentry, to_dentry); + /* Force a new lookup */ + d_drop(from_dentry); + } cifs_put_tlink(tlink); return rc; } @@ -2470,6 +2497,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, struct dentry *target_dentry, unsigned int flags) { const char *from_name, *to_name; + struct TCP_Server_Info *server; void *page1, *page2; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; @@ -2505,6 +2533,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; page1 = alloc_dentry_path(); page2 = alloc_dentry_path(); @@ -2522,10 +2551,10 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, goto cifs_rename_exit; } - cifs_close_deferred_file_under_dentry(tcon, from_name); + cifs_close_deferred_file_under_dentry(tcon, source_dentry); if (d_inode(target_dentry) != NULL) { netfs_wait_for_outstanding_io(d_inode(target_dentry)); - cifs_close_deferred_file_under_dentry(tcon, to_name); + cifs_close_deferred_file_under_dentry(tcon, target_dentry); } rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, @@ -2591,19 +2620,53 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, unlink_target: #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ - - /* Try unlinking the target dentry if it's not negative */ - if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) { - if (d_is_dir(target_dentry)) - tmprc = cifs_rmdir(target_dir, target_dentry); - else - tmprc = cifs_unlink(target_dir, target_dentry); - if (tmprc) - goto cifs_rename_exit; - rc = cifs_do_rename(xid, source_dentry, from_name, - target_dentry, to_name); - if (!rc) - rehash = false; + if (d_really_is_positive(target_dentry)) { + if (!rc) { + struct inode *inode = d_inode(target_dentry); + /* + * Samba and ksmbd servers allow renaming a target + * directory that is open, so make sure to update + * ->i_nlink and then mark it as delete pending. 
+			 */
+			if (S_ISDIR(inode->i_mode)) {
+				drop_cached_dir_by_name(xid, tcon, to_name, cifs_sb);
+				spin_lock(&inode->i_lock);
+				i_size_write(inode, 0);
+				clear_nlink(inode);
+				spin_unlock(&inode->i_lock);
+				set_bit(CIFS_INO_DELETE_PENDING, &CIFS_I(inode)->flags);
+				CIFS_I(inode)->time = 0; /* force reval */
+				inode_set_ctime_current(inode);
+				inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+			}
+		} else if (rc == -EACCES || rc == -EEXIST) {
+			/*
+			 * Rename failed, possibly due to a busy target.
+			 * Retry it by unlinking the target first.
+			 */
+			if (d_is_dir(target_dentry)) {
+				tmprc = cifs_rmdir(target_dir, target_dentry);
+			} else {
+				tmprc = __cifs_unlink(target_dir, target_dentry,
+						      server->vals->protocol_id > SMB10_PROT_ID);
+			}
+			if (tmprc) {
+				/*
+				 * Some servers will return STATUS_ACCESS_DENIED
+				 * or STATUS_DIRECTORY_NOT_EMPTY when failing to
+				 * rename a non-empty directory. Make sure to
+				 * propagate the appropriate error back to
+				 * userspace.
+				 */
+				if (tmprc == -EEXIST || tmprc == -ENOTEMPTY)
+					rc = tmprc;
+				goto cifs_rename_exit;
+			}
+			rc = cifs_do_rename(xid, source_dentry, from_name,
+					    target_dentry, to_name);
+			if (!rc)
+				rehash = false;
+		}
 	}
 
 	/* force revalidate to go get info when needed */
@@ -2629,6 +2692,8 @@ cifs_dentry_needs_reval(struct dentry *dentry)
 	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
 	struct cached_fid *cfid = NULL;
 
+	if (test_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags))
+		return false;
 	if (cifs_i->time == 0)
 		return true;
 
@@ -2779,7 +2844,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
 	}
 
 	cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n",
-		 full_path, inode, inode->i_count.counter,
+		 full_path, inode, icount_read(inode),
 		 dentry, cifs_get_time(dentry), jiffies);
 
 again:
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index da23cc12a52c..dda6dece802a 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -832,33 +832,28 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon)
 		kfree(tmp_list);
 	}
 }
-void
-cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
+
+void cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon,
+					   struct dentry *dentry)
 {
-	struct cifsFileInfo *cfile;
 	struct file_list *tmp_list, *tmp_next_list;
-	void *page;
-	const char *full_path;
+	struct cifsFileInfo *cfile;
 	LIST_HEAD(file_head);
 
-	page = alloc_dentry_path();
 	spin_lock(&tcon->open_file_lock);
 	list_for_each_entry(cfile, &tcon->openFileList, tlist) {
-		full_path = build_path_from_dentry(cfile->dentry, page);
-		if (strstr(full_path, path)) {
-			if (delayed_work_pending(&cfile->deferred)) {
-				if (cancel_delayed_work(&cfile->deferred)) {
-					spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
-					cifs_del_deferred_close(cfile);
-					spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
-
-					tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
-					if (tmp_list == NULL)
-						break;
-					tmp_list->cfile = cfile;
-					list_add_tail(&tmp_list->list, &file_head);
-				}
-			}
+		if ((cfile->dentry == dentry) &&
+		    delayed_work_pending(&cfile->deferred) &&
+		    cancel_delayed_work(&cfile->deferred)) {
+			spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
+			cifs_del_deferred_close(cfile);
+			spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
+
+			tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
+			if (tmp_list == NULL)
+				break;
+			tmp_list->cfile = cfile;
+			list_add_tail(&tmp_list->list, &file_head);
 		}
 	}
 	spin_unlock(&tcon->open_file_lock);
@@ -868,7 +863,6
@@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) list_del(&tmp_list->list); kfree(tmp_list); } - free_dentry_path(page); } /* diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 7869cec58f52..10c84c095fe7 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -278,7 +278,7 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb, } /* - * For absolute symlinks it is not possible to determinate + * For absolute symlinks it is not possible to determine * if it should point to directory or file. */ if (symname[0] == '/') { diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 893a1ea8c000..a02d41d1ce4a 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -1005,7 +1005,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, rc = -EOPNOTSUPP; } - /* Fallback to SMB_COM_SETATTR command when absolutelty needed. */ + /* Fallback to SMB_COM_SETATTR command when absolutely needed. */ if (rc == -EOPNOTSUPP) { cifs_dbg(FYI, "calling SetInformation since SetPathInfo for attrs/times not supported by this server\n"); rc = SMBSetInformation(xid, tcon, full_path, @@ -1039,7 +1039,7 @@ set_via_filehandle: cifsFileInfo_put(open_file); /* - * Setting the read-only bit is not honered on non-NT servers when done + * Setting the read-only bit is not honored on non-NT servers when done * via open-semantics. So for setting it, use SMB_COM_SETATTR command. * This command works only after the file is closed, so use it only when * operation was called without the filehandle. diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h index 224495322a05..e56e4d402f13 100644 --- a/fs/smb/client/smb2glob.h +++ b/fs/smb/client/smb2glob.h @@ -30,10 +30,9 @@ enum smb2_compound_ops { SMB2_OP_QUERY_DIR, SMB2_OP_MKDIR, SMB2_OP_RENAME, - SMB2_OP_DELETE, SMB2_OP_HARDLINK, SMB2_OP_SET_EOF, - SMB2_OP_RMDIR, + SMB2_OP_UNLINK, SMB2_OP_POSIX_QUERY_INFO, SMB2_OP_SET_REPARSE, SMB2_OP_GET_REPARSE, diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 31c13fb5b85b..0985db9f86e5 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -346,9 +346,6 @@ replay_again: trace_smb3_posix_query_info_compound_enter(xid, tcon->tid, ses->Suid, full_path); break; - case SMB2_OP_DELETE: - trace_smb3_delete_enter(xid, tcon->tid, ses->Suid, full_path); - break; case SMB2_OP_MKDIR: /* * Directories are created through parameters in the @@ -356,23 +353,40 @@ replay_again: */ trace_smb3_mkdir_enter(xid, tcon->tid, ses->Suid, full_path); break; - case SMB2_OP_RMDIR: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; + case SMB2_OP_UNLINK: + rqst[num_rqst].rq_iov = vars->unlink_iov; rqst[num_rqst].rq_nvec = 1; size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ data[0] = &delete_pending[0]; - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_DISPOSITION_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - if (rc) + if (cfile) { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + } else { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + } + if (!rc && (!cfile || num_rqst > 1)) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + 
smb2_set_related(&rqst[num_rqst]); + } else if (rc) { goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); - trace_smb3_rmdir_enter(xid, tcon->tid, ses->Suid, full_path); + } + num_rqst++; + trace_smb3_unlink_enter(xid, tcon->tid, ses->Suid, full_path); break; case SMB2_OP_SET_EOF: rqst[num_rqst].rq_iov = &vars->si_iov[0]; @@ -442,7 +456,7 @@ replay_again: ses->Suid, full_path); break; case SMB2_OP_RENAME: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_iov = vars->rename_iov; rqst[num_rqst].rq_nvec = 2; len = in_iov[i].iov_len; @@ -673,7 +687,7 @@ finished: } for (i = 0; i < num_cmds; i++) { - char *buf = rsp_iov[i + i].iov_base; + char *buf = rsp_iov[i + 1].iov_base; if (buf && resp_buftype[i + 1] != CIFS_NO_BUFFER) rc = server->ops->map_error(buf, false); @@ -732,19 +746,6 @@ finished: trace_smb3_posix_query_info_compound_done(xid, tcon->tid, ses->Suid); break; - case SMB2_OP_DELETE: - if (rc) - trace_smb3_delete_err(xid, tcon->tid, ses->Suid, rc); - else { - /* - * If dentry (hence, inode) is NULL, lease break is going to - * take care of degrading leases on handles for deleted files. - */ - if (inode) - cifs_mark_open_handles_for_deleted_file(inode, full_path); - trace_smb3_delete_done(xid, tcon->tid, ses->Suid); - } - break; case SMB2_OP_MKDIR: if (rc) trace_smb3_mkdir_err(xid, tcon->tid, ses->Suid, rc); @@ -765,11 +766,11 @@ finished: trace_smb3_rename_done(xid, tcon->tid, ses->Suid); SMB2_set_info_free(&rqst[num_rqst++]); break; - case SMB2_OP_RMDIR: - if (rc) - trace_smb3_rmdir_err(xid, tcon->tid, ses->Suid, rc); + case SMB2_OP_UNLINK: + if (!rc) + trace_smb3_unlink_done(xid, tcon->tid, ses->Suid); else - trace_smb3_rmdir_done(xid, tcon->tid, ses->Suid); + trace_smb3_unlink_err(xid, tcon->tid, ses->Suid, rc); SMB2_set_info_free(&rqst[num_rqst++]); break; case SMB2_OP_SET_EOF: @@ -1166,7 +1167,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE); return smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, NULL, - &(int){SMB2_OP_RMDIR}, 1, + &(int){SMB2_OP_UNLINK}, 1, NULL, NULL, NULL, NULL); } @@ -1174,21 +1175,107 @@ int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb, struct dentry *dentry) { + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + __le16 *utf16_path __free(kfree) = NULL; + int retries = 0, cur_sleep = 1; + struct TCP_Server_Info *server; struct cifs_open_parms oparms; + struct smb2_create_req *creq; + struct inode *inode = NULL; + struct smb_rqst rqst[2]; + struct kvec rsp_iov[2]; + struct kvec close_iov; + int resp_buftype[2]; + struct cifs_fid fid; + int flags = 0; + __u8 oplock; + int rc; - oparms = CIFS_OPARMS(cifs_sb, tcon, name, - DELETE, FILE_OPEN, - CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE); - int rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, - NULL, &(int){SMB2_OP_DELETE}, 1, - NULL, NULL, NULL, dentry); - if (rc == -EINVAL) { - cifs_dbg(FYI, "invalid lease key, resending request without lease"); - rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, - NULL, &(int){SMB2_OP_DELETE}, 1, - NULL, NULL, NULL, NULL); + utf16_path = cifs_convert_path_to_utf16(name, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; +again: + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(tcon->ses); + + memset(rqst, 0, sizeof(rqst)); + memset(resp_buftype, 0, sizeof(resp_buftype)); + 
memset(rsp_iov, 0, sizeof(rsp_iov)); + + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = ARRAY_SIZE(open_iov); + + oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE | FILE_READ_ATTRIBUTES, + FILE_OPEN, CREATE_DELETE_ON_CLOSE | + OPEN_REPARSE_POINT, ACL_NO_MODE); + oparms.fid = &fid; + + if (dentry) { + inode = d_inode(dentry); + if (CIFS_I(inode)->lease_granted && server->ops->get_lease_key) { + oplock = SMB2_OPLOCK_LEVEL_LEASE; + server->ops->get_lease_key(inode, &fid); + } } + + rc = SMB2_open_init(tcon, server, + &rqst[0], &oplock, &oparms, utf16_path); + if (rc) + goto err_free; + smb2_set_next_command(tcon, &rqst[0]); + creq = rqst[0].rq_iov[0].iov_base; + creq->ShareAccess = FILE_SHARE_DELETE_LE; + + rqst[1].rq_iov = &close_iov; + rqst[1].rq_nvec = 1; + + rc = SMB2_close_init(tcon, server, &rqst[1], + COMPOUND_FID, COMPOUND_FID, false); + smb2_set_related(&rqst[1]); + if (rc) + goto err_free; + + if (retries) { + for (int i = 0; i < ARRAY_SIZE(rqst); i++) + smb2_set_replay(server, &rqst[i]); + } + + rc = compound_send_recv(xid, tcon->ses, server, flags, + ARRAY_SIZE(rqst), rqst, + resp_buftype, rsp_iov); + SMB2_open_free(&rqst[0]); + SMB2_close_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto again; + + /* Retry compound request without lease */ + if (rc == -EINVAL && dentry) { + dentry = NULL; + retries = 0; + cur_sleep = 1; + goto again; + } + /* + * If dentry (hence, inode) is NULL, lease break is going to + * take care of degrading leases on handles for deleted files. + */ + if (!rc && inode) + cifs_mark_open_handles_for_deleted_file(inode, name); + + return rc; + +err_free: + SMB2_open_free(&rqst[0]); + SMB2_close_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); return rc; } @@ -1441,3 +1528,113 @@ out: cifs_free_open_info(&data); return rc; } + +static inline __le16 *utf16_smb2_path(struct cifs_sb_info *cifs_sb, + const char *name, size_t namelen) +{ + int len; + + if (*name == '\\' || + (cifs_sb_master_tlink(cifs_sb) && + cifs_sb_master_tcon(cifs_sb)->posix_extensions && *name == '/')) + name++; + return cifs_strndup_to_utf16(name, namelen, &len, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); +} + +int smb2_rename_pending_delete(const char *full_path, + struct dentry *dentry, + const unsigned int xid) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(d_inode(dentry)->i_sb); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(dentry)); + __le16 *utf16_path __free(kfree) = NULL; + __u32 co = file_create_options(dentry); + int cmds[] = { + SMB2_OP_SET_INFO, + SMB2_OP_RENAME, + SMB2_OP_UNLINK, + }; + const int num_cmds = ARRAY_SIZE(cmds); + char *to_name __free(kfree) = NULL; + __u32 attrs = cinode->cifsAttrs; + struct cifs_open_parms oparms; + static atomic_t sillycounter; + struct cifsFileInfo *cfile; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct kvec iov[2]; + const char *ppath; + void *page; + size_t len; + int rc; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + page = alloc_dentry_path(); + + ppath = build_path_from_dentry(dentry->d_parent, page); + if (IS_ERR(ppath)) { + rc = PTR_ERR(ppath); + goto out; + } + + len = strlen(ppath) + strlen("/.__smb1234") + 1; + to_name = kmalloc(len, GFP_KERNEL); + if (!to_name) { + rc = -ENOMEM; + goto out; + } + + scnprintf(to_name, 
len, "%s%c.__smb%04X", ppath, CIFS_DIR_SEP(cifs_sb), + atomic_inc_return(&sillycounter) & 0xffff); + + utf16_path = utf16_smb2_path(cifs_sb, to_name, len); + if (!utf16_path) { + rc = -ENOMEM; + goto out; + } + + drop_cached_dir_by_name(xid, tcon, full_path, cifs_sb); + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, + DELETE | FILE_WRITE_ATTRIBUTES, + FILE_OPEN, co, ACL_NO_MODE); + + attrs &= ~ATTR_READONLY; + if (!attrs) + attrs = ATTR_NORMAL; + if (d_inode(dentry)->i_nlink <= 1) + attrs |= ATTR_HIDDEN; + iov[0].iov_base = &(FILE_BASIC_INFO) { + .Attributes = cpu_to_le32(attrs), + }; + iov[0].iov_len = sizeof(FILE_BASIC_INFO); + iov[1].iov_base = utf16_path; + iov[1].iov_len = sizeof(*utf16_path) * UniStrlen((wchar_t *)utf16_path); + + cifs_get_writable_path(tcon, full_path, FIND_WR_WITH_DELETE, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov, + cmds, num_cmds, cfile, NULL, NULL, dentry); + if (rc == -EINVAL) { + cifs_dbg(FYI, "invalid lease key, resending request without lease\n"); + cifs_get_writable_path(tcon, full_path, + FIND_WR_WITH_DELETE, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov, + cmds, num_cmds, cfile, NULL, NULL, NULL); + } + if (!rc) { + set_bit(CIFS_INO_DELETE_PENDING, &cinode->flags); + } else { + cifs_tcon_dbg(FYI, "%s: failed to rename '%s' to '%s': %d\n", + __func__, full_path, to_name, rc); + rc = -EIO; + } +out: + cifs_put_tlink(tlink); + free_dentry_path(page); + return rc; +} diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index cddf273c14ae..89d933b4a8bc 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -614,6 +614,15 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) struct cifs_tcon *tcon; struct cifs_pending_open *open; + /* Trace receipt of lease break request from server */ + trace_smb3_lease_break_enter(le32_to_cpu(rsp->CurrentLeaseState), + le32_to_cpu(rsp->Flags), + le16_to_cpu(rsp->Epoch), + le32_to_cpu(rsp->hdr.Id.SyncId.TreeId), + le64_to_cpu(rsp->hdr.SessionId), + *((u64 *)rsp->LeaseKey), + *((u64 *)&rsp->LeaseKey[8])); + cifs_dbg(FYI, "Checking for lease break\n"); /* If server is a channel, select the primary channel */ @@ -660,10 +669,12 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "Can not process lease break - no lease matched\n"); trace_smb3_lease_not_found(le32_to_cpu(rsp->CurrentLeaseState), - le32_to_cpu(rsp->hdr.Id.SyncId.TreeId), - le64_to_cpu(rsp->hdr.SessionId), - *((u64 *)rsp->LeaseKey), - *((u64 *)&rsp->LeaseKey[8])); + le32_to_cpu(rsp->Flags), + le16_to_cpu(rsp->Epoch), + le32_to_cpu(rsp->hdr.Id.SyncId.TreeId), + le64_to_cpu(rsp->hdr.SessionId), + *((u64 *)rsp->LeaseKey), + *((u64 *)&rsp->LeaseKey[8])); return false; } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 94b1d7a395d5..4711a23c5b38 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -504,8 +504,8 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) wsize = min_t(unsigned int, wsize, server->max_write); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { - struct smbdirect_socket_parameters *sp = - &server->smbd_conn->socket.parameters; + const struct smbdirect_socket_parameters *sp = + smbd_get_parameters(server->smbd_conn); if (server->sign) /* @@ -555,8 +555,8 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) rsize = min_t(unsigned int, rsize, server->max_read); #ifdef 
CONFIG_CIFS_SMB_DIRECT if (server->rdma) { - struct smbdirect_socket_parameters *sp = - &server->smbd_conn->socket.parameters; + const struct smbdirect_socket_parameters *sp = + smbd_get_parameters(server->smbd_conn); if (server->sign) /* @@ -2640,13 +2640,35 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) } /* SMB headers in a compound are 8 byte aligned. */ - if (!IS_ALIGNED(len, 8)) { - num_padding = 8 - (len & 7); + if (IS_ALIGNED(len, 8)) + goto out; + + num_padding = 8 - (len & 7); + if (smb3_encryption_required(tcon)) { + int i; + + /* + * Flatten request into a single buffer with required padding as + * the encryption layer can't handle the padding iovs. + */ + for (i = 1; i < rqst->rq_nvec; i++) { + memcpy(rqst->rq_iov[0].iov_base + + rqst->rq_iov[0].iov_len, + rqst->rq_iov[i].iov_base, + rqst->rq_iov[i].iov_len); + rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len; + } + memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len, + 0, num_padding); + rqst->rq_iov[0].iov_len += num_padding; + rqst->rq_nvec = 1; + } else { rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding; rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding; rqst->rq_nvec++; - len += num_padding; } + len += num_padding; +out: shdr->NextCommand = cpu_to_le32(len); } @@ -5376,6 +5398,7 @@ struct smb_version_operations smb20_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; #endif /* CIFS_ALLOW_INSECURE_LEGACY */ @@ -5481,6 +5504,7 @@ struct smb_version_operations smb21_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; struct smb_version_operations smb30_operations = { @@ -5597,6 +5621,7 @@ struct smb_version_operations smb30_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; struct smb_version_operations smb311_operations = { @@ -5713,6 +5738,7 @@ struct smb_version_operations smb311_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 2df93a75e3b8..1c63d2c9cc9c 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4411,7 +4411,7 @@ static inline bool smb3_use_rdma_offload(struct cifs_io_parms *io_parms) return false; /* offload also has its overhead, so only do it if desired */ - if (io_parms->length < server->smbd_conn->rdma_readwrite_threshold) + if (io_parms->length < server->rdma_readwrite_threshold) return false; return true; @@ -6192,11 +6192,11 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, please_key_high = (__u64 *)(lease_key+8); if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); - trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid, + trace_smb3_lease_ack_err(le32_to_cpu(lease_state), tcon->tid, ses->Suid, *please_key_low, *please_key_high, rc); cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc); } else - trace_smb3_lease_done(le32_to_cpu(lease_state), tcon->tid, + trace_smb3_lease_ack_done(le32_to_cpu(lease_state), 
tcon->tid, ses->Suid, *please_key_low, *please_key_high); return rc; diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 6e805ece6a7b..b3f1398c9f79 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -317,5 +317,8 @@ int posix_info_sid_size(const void *beg, const void *end); int smb2_make_nfs_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); +int smb2_rename_pending_delete(const char *full_path, + struct dentry *dentry, + const unsigned int xid); #endif /* _SMB2PROTO_H */ diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 02d6db431fd4..316f398c70f4 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -13,28 +13,35 @@ #include "cifsproto.h" #include "smb2proto.h" +const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn) +{ + struct smbdirect_socket *sc = &conn->socket; + + return &sc->parameters; +} + static struct smbdirect_recv_io *get_receive_buffer( - struct smbd_connection *info); + struct smbdirect_socket *sc); static void put_receive_buffer( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct smbdirect_recv_io *response); -static int allocate_receive_buffers(struct smbd_connection *info, int num_buf); -static void destroy_receive_buffers(struct smbd_connection *info); +static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf); +static void destroy_receive_buffers(struct smbdirect_socket *sc); static void enqueue_reassembly( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct smbdirect_recv_io *response, int data_length); static struct smbdirect_recv_io *_get_first_reassembly( - struct smbd_connection *info); + struct smbdirect_socket *sc); static int smbd_post_recv( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct smbdirect_recv_io *response); -static int smbd_post_send_empty(struct smbd_connection *info); +static int smbd_post_send_empty(struct smbdirect_socket *sc); -static void destroy_mr_list(struct smbd_connection *info); -static int allocate_mr_list(struct smbd_connection *info); +static void destroy_mr_list(struct smbdirect_socket *sc); +static int allocate_mr_list(struct smbdirect_socket *sc); struct smb_extract_to_rdma { struct ib_sge *sge; @@ -57,6 +64,9 @@ static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, /* SMBD negotiation timeout in seconds */ #define SMBD_NEGOTIATE_TIMEOUT 120 +/* The timeout to wait for a keepalive message from peer in seconds */ +#define KEEPALIVE_RECV_TIMEOUT 5 + /* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */ #define SMBD_MIN_RECEIVE_SIZE 128 #define SMBD_MIN_FRAGMENTED_SIZE 131072 @@ -155,65 +165,277 @@ do { \ #define log_rdma_mr(level, fmt, args...) \ log_rdma(level, LOG_RDMA_MR, fmt, ##args) +static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc) +{ + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. 
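+	 * Each wait queue below matches a condition a task may block on:
+	 * status changes, send credits, pending send completions,
+	 * reassembled receive data, and MR ready/cleanup events.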
+ */ + wake_up_all(&sc->status_wait); + wake_up_all(&sc->send_io.credits.wait_queue); + wake_up_all(&sc->send_io.pending.dec_wait_queue); + wake_up_all(&sc->send_io.pending.zero_wait_queue); + wake_up_all(&sc->recv_io.reassembly.wait_queue); + wake_up_all(&sc->mr_io.ready.wait_queue); + wake_up_all(&sc->mr_io.cleanup.wait_queue); +} + static void smbd_disconnect_rdma_work(struct work_struct *work) { - struct smbd_connection *info = - container_of(work, struct smbd_connection, disconnect_work); - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, disconnect_work); - if (sc->status == SMBDIRECT_SOCKET_CONNECTED) { + /* + * make sure this and other work is not queued again + * but here we don't block and avoid + * disable[_delayed]_work_sync() + */ + disable_work(&sc->disconnect_work); + disable_work(&sc->recv_io.posted.refill_work); + disable_work(&sc->mr_io.recovery_work); + disable_work(&sc->idle.immediate_work); + disable_delayed_work(&sc->idle.timer_work); + + if (sc->first_error == 0) + sc->first_error = -ECONNABORTED; + + switch (sc->status) { + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + case SMBDIRECT_SOCKET_CONNECTED: + case SMBDIRECT_SOCKET_ERROR: sc->status = SMBDIRECT_SOCKET_DISCONNECTING; rdma_disconnect(sc->rdma.cm_id); + break; + + case SMBDIRECT_SOCKET_CREATED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + /* + * rdma_connect() never reached + * RDMA_CM_EVENT_ESTABLISHED + */ + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + break; + + case SMBDIRECT_SOCKET_DISCONNECTING: + case SMBDIRECT_SOCKET_DISCONNECTED: + case SMBDIRECT_SOCKET_DESTROYED: + break; } + + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. 
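+	 * This runs after the status update above, so woken waiters
+	 * observe the final connection state.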
+ */ + smbd_disconnect_wake_up_all(sc); } -static void smbd_disconnect_rdma_connection(struct smbd_connection *info) +static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc) { - queue_work(info->workqueue, &info->disconnect_work); + /* + * make sure other work (than disconnect_work) is + * not queued again but here we don't block and avoid + * disable[_delayed]_work_sync() + */ + disable_work(&sc->recv_io.posted.refill_work); + disable_work(&sc->mr_io.recovery_work); + disable_work(&sc->idle.immediate_work); + disable_delayed_work(&sc->idle.timer_work); + + if (sc->first_error == 0) + sc->first_error = -ECONNABORTED; + + switch (sc->status) { + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + case SMBDIRECT_SOCKET_ERROR: + case SMBDIRECT_SOCKET_DISCONNECTING: + case SMBDIRECT_SOCKET_DISCONNECTED: + case SMBDIRECT_SOCKET_DESTROYED: + /* + * Keep the current error status + */ + break; + + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; + break; + + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; + break; + + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; + break; + + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; + break; + + case SMBDIRECT_SOCKET_CREATED: + case SMBDIRECT_SOCKET_CONNECTED: + sc->status = SMBDIRECT_SOCKET_ERROR; + break; + } + + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. 
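+	 * The rdma_disconnect() call itself is deferred to
+	 * disconnect_work (queued below), as this function may be
+	 * called from completion handlers that must not block.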
+ */ + smbd_disconnect_wake_up_all(sc); + + queue_work(sc->workqueue, &sc->disconnect_work); } /* Upcall from RDMA CM */ static int smbd_conn_upcall( struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct smbd_connection *info = id->context; - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = id->context; + struct smbdirect_socket_parameters *sp = &sc->parameters; const char *event_name = rdma_event_msg(event->event); + u8 peer_initiator_depth; + u8 peer_responder_resources; log_rdma_event(INFO, "event=%s status=%d\n", event_name, event->status); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED; + wake_up(&sc->status_wait); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: - info->ri_rc = 0; - complete(&info->ri_done); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; + wake_up(&sc->status_wait); break; case RDMA_CM_EVENT_ADDR_ERROR: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); - info->ri_rc = -EHOSTUNREACH; - complete(&info->ri_done); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; + smbd_disconnect_rdma_work(&sc->disconnect_work); break; case RDMA_CM_EVENT_ROUTE_ERROR: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); - info->ri_rc = -ENETUNREACH; - complete(&info->ri_done); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; + smbd_disconnect_rdma_work(&sc->disconnect_work); break; case RDMA_CM_EVENT_ESTABLISHED: log_rdma_event(INFO, "connected event=%s\n", event_name); - sc->status = SMBDIRECT_SOCKET_CONNECTED; - wake_up_interruptible(&info->status_wait); + + /* + * Here we work around an inconsistency between + * iWarp and other devices (at least rxe and irdma using RoCEv2) + */ + if (rdma_protocol_iwarp(id->device, id->port_num)) { + /* + * iWarp devices report the peer's values + * with the perspective of the peer here. + * Tested with siw and irdma (in iwarp mode) + * We need to change to our perspective here, + * so we need to switch the values. + */ + peer_initiator_depth = event->param.conn.responder_resources; + peer_responder_resources = event->param.conn.initiator_depth; + } else { + /* + * Non iWarp devices report the peer's values + * already changed to our perspective here. + * Tested with rxe and irdma (in roce mode). + */ + peer_initiator_depth = event->param.conn.initiator_depth; + peer_responder_resources = event->param.conn.responder_resources; + } + if (rdma_protocol_iwarp(id->device, id->port_num) && + event->param.conn.private_data_len == 8) { + /* + * Legacy clients with only iWarp MPA v1 support + * need a private blob in order to negotiate + * the IRD/ORD values. + */ + const __be32 *ird_ord_hdr = event->param.conn.private_data; + u32 ird32 = be32_to_cpu(ird_ord_hdr[0]); + u32 ord32 = be32_to_cpu(ird_ord_hdr[1]); + + /* + * cifs.ko sends the legacy IRD/ORD negotiation + * event if iWarp MPA v2 was used. + * + * Here we check that the values match and only + * mark the client as legacy if they don't match. 
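+	 * If the values match, the peer already negotiated them via
+	 * MPA v2 and the private blob carries no new information.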
+ */ + if ((u32)event->param.conn.initiator_depth != ird32 || + (u32)event->param.conn.responder_resources != ord32) { + /* + * There are broken clients (old cifs.ko) + * using little endian and also + * struct rdma_conn_param only uses u8 + * for initiator_depth and responder_resources, + * so we truncate the value to U8_MAX. + * + * smb_direct_accept_client() will then + * do the real negotiation in order to + * select the minimum between client and + * server. + */ + ird32 = min_t(u32, ird32, U8_MAX); + ord32 = min_t(u32, ord32, U8_MAX); + + sc->rdma.legacy_iwarp = true; + peer_initiator_depth = (u8)ird32; + peer_responder_resources = (u8)ord32; + } + } + + /* + * negotiate the value by using the minimum + * between client and server if the client provided + * non 0 values. + */ + if (peer_initiator_depth != 0) + sp->initiator_depth = + min_t(u8, sp->initiator_depth, + peer_initiator_depth); + if (peer_responder_resources != 0) + sp->responder_resources = + min_t(u8, sp->responder_resources, + peer_responder_resources); + + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; + wake_up(&sc->status_wait); break; case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); - sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up_interruptible(&info->status_wait); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; + smbd_disconnect_rdma_work(&sc->disconnect_work); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: @@ -221,15 +443,10 @@ static int smbd_conn_upcall( /* This happens when we fail the negotiation */ if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) { log_rdma_event(ERR, "event=%s during negotiation\n", event_name); - sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up(&info->status_wait); - break; } sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up_interruptible(&info->status_wait); - wake_up_interruptible(&sc->recv_io.reassembly.wait_queue); - wake_up_interruptible_all(&info->wait_send_queue); + smbd_disconnect_rdma_work(&sc->disconnect_work); break; default: @@ -245,15 +462,15 @@ static int smbd_conn_upcall( static void smbd_qp_async_error_upcall(struct ib_event *event, void *context) { - struct smbd_connection *info = context; + struct smbdirect_socket *sc = context; - log_rdma_event(ERR, "%s on device %s info %p\n", - ib_event_msg(event->event), event->device->name, info); + log_rdma_event(ERR, "%s on device %s socket %p\n", + ib_event_msg(event->event), event->device->name, sc); switch (event->event) { case IB_EVENT_CQ_ERR: case IB_EVENT_QP_FATAL: - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); break; default: @@ -278,11 +495,9 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_send_io *request = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); struct smbdirect_socket *sc = request->socket; - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); - log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%d\n", - request, wc->status); + log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n", + request, ib_wc_status_msg(wc->status)); for (i = 0; i < request->num_sge; i++) ib_dma_unmap_single(sc->ib.dev, @@ -291,17 +506,18 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) DMA_TO_DEVICE); if (wc->status != IB_WC_SUCCESS 
|| wc->opcode != IB_WC_SEND) { - log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n", - wc->status, wc->opcode); + if (wc->status != IB_WC_WR_FLUSH_ERR) + log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n", + ib_wc_status_msg(wc->status), wc->opcode); mempool_free(request, sc->send_io.mem.pool); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); return; } - if (atomic_dec_and_test(&info->send_pending)) - wake_up(&info->wait_send_pending); + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); - wake_up(&info->wait_post_send); + wake_up(&sc->send_io.pending.dec_wait_queue); mempool_free(request, sc->send_io.mem.pool); } @@ -325,8 +541,6 @@ static bool process_negotiation_response( struct smbdirect_recv_io *response, int packet_length) { struct smbdirect_socket *sc = response->socket; - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbdirect_negotiate_resp *packet = smbdirect_recv_io_payload(response); @@ -341,21 +555,19 @@ static bool process_negotiation_response( le16_to_cpu(packet->negotiated_version)); return false; } - info->protocol = le16_to_cpu(packet->negotiated_version); if (packet->credits_requested == 0) { log_rdma_event(ERR, "error: credits_requested==0\n"); return false; } - info->receive_credit_target = le16_to_cpu(packet->credits_requested); + sc->recv_io.credits.target = le16_to_cpu(packet->credits_requested); + sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); if (packet->credits_granted == 0) { log_rdma_event(ERR, "error: credits_granted==0\n"); return false; } - atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted)); - - atomic_set(&info->receive_credits, 0); + atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted)); if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) { log_rdma_event(ERR, "error: preferred_send_size=%d\n", @@ -380,16 +592,12 @@ static bool process_negotiation_response( } sp->max_fragmented_send_size = le32_to_cpu(packet->max_fragmented_size); - info->rdma_readwrite_threshold = - rdma_readwrite_threshold > sp->max_fragmented_send_size ? 
- sp->max_fragmented_send_size : - rdma_readwrite_threshold; sp->max_read_write_size = min_t(u32, le32_to_cpu(packet->max_readwrite_size), - info->max_frmr_depth * PAGE_SIZE); - info->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE; + sp->max_frmr_depth * PAGE_SIZE); + sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE; sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER; return true; @@ -397,52 +605,40 @@ static bool process_negotiation_response( static void smbd_post_send_credits(struct work_struct *work) { - int ret = 0; int rc; struct smbdirect_recv_io *response; - struct smbd_connection *info = - container_of(work, struct smbd_connection, - post_send_credits_work); - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, recv_io.posted.refill_work); if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { - wake_up(&info->wait_receive_queues); return; } - if (info->receive_credit_target > - atomic_read(&info->receive_credits)) { + if (sc->recv_io.credits.target > + atomic_read(&sc->recv_io.credits.count)) { while (true) { - response = get_receive_buffer(info); + response = get_receive_buffer(sc); if (!response) break; response->first_segment = false; - rc = smbd_post_recv(info, response); + rc = smbd_post_recv(sc, response); if (rc) { log_rdma_recv(ERR, "post_recv failed rc=%d\n", rc); - put_receive_buffer(info, response); + put_receive_buffer(sc, response); break; } - ret++; + atomic_inc(&sc->recv_io.posted.count); } } - spin_lock(&info->lock_new_credits_offered); - info->new_credits_offered += ret; - spin_unlock(&info->lock_new_credits_offered); - /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */ - info->send_immediate = true; - if (atomic_read(&info->receive_credits) < - info->receive_credit_target - 1) { - if (info->keep_alive_requested == KEEP_ALIVE_PENDING || - info->send_immediate) { - log_keep_alive(INFO, "send an empty message\n"); - smbd_post_send_empty(info); - } + if (atomic_read(&sc->recv_io.credits.count) < + sc->recv_io.credits.target - 1) { + log_keep_alive(INFO, "schedule send of an empty message\n"); + queue_work(sc->workqueue, &sc->idle.immediate_work); } } @@ -453,17 +649,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_recv_io *response = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); struct smbdirect_socket *sc = response->socket; - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); - int data_length = 0; - - log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n", - response, sc->recv_io.expected, wc->status, wc->opcode, + struct smbdirect_socket_parameters *sp = &sc->parameters; + u16 old_recv_credit_target; + u32 data_offset = 0; + u32 data_length = 0; + u32 remaining_data_length = 0; + bool negotiate_done = false; + + log_rdma_recv(INFO, + "response=0x%p type=%d wc status=%s wc opcode %d byte_len=%d pkey_index=%u\n", + response, sc->recv_io.expected, + ib_wc_status_msg(wc->status), wc->opcode, wc->byte_len, wc->pkey_index); if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { - log_rdma_recv(INFO, "wc->status=%d opcode=%d\n", - wc->status, wc->opcode); + if (wc->status != IB_WC_WR_FLUSH_ERR) + log_rdma_recv(ERR, "wc->status=%s opcode=%d\n", + ib_wc_status_msg(wc->status), wc->opcode); goto error; } @@ -473,21 +675,52 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) response->sge.length, DMA_FROM_DEVICE); + /* + * Reset timer to the 
keepalive interval in + * order to trigger our next keepalive message. + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_interval_msec)); + switch (sc->recv_io.expected) { /* SMBD negotiation response */ case SMBDIRECT_EXPECT_NEGOTIATE_REP: dump_smbdirect_negotiate_resp(smbdirect_recv_io_payload(response)); sc->recv_io.reassembly.full_packet_received = true; - info->negotiate_done = + negotiate_done = process_negotiation_response(response, wc->byte_len); - put_receive_buffer(info, response); - complete(&info->negotiate_completion); + put_receive_buffer(sc, response); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING); + if (!negotiate_done) { + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; + smbd_disconnect_rdma_connection(sc); + } else { + sc->status = SMBDIRECT_SOCKET_CONNECTED; + wake_up(&sc->status_wait); + } + return; /* SMBD data transfer packet */ case SMBDIRECT_EXPECT_DATA_TRANSFER: data_transfer = smbdirect_recv_io_payload(response); + + if (wc->byte_len < + offsetof(struct smbdirect_data_transfer, padding)) + goto error; + + remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); + data_offset = le32_to_cpu(data_transfer->data_offset); data_length = le32_to_cpu(data_transfer->data_length); + if (wc->byte_len < data_offset || + (u64)wc->byte_len < (u64)data_offset + data_length) + goto error; + + if (remaining_data_length > sp->max_fragmented_recv_size || + data_length > sp->max_fragmented_recv_size || + (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size) + goto error; if (data_length) { if (sc->recv_io.reassembly.full_packet_received) @@ -499,17 +732,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) sc->recv_io.reassembly.full_packet_received = true; } - atomic_dec(&info->receive_credits); - info->receive_credit_target = + atomic_dec(&sc->recv_io.posted.count); + atomic_dec(&sc->recv_io.credits.count); + old_recv_credit_target = sc->recv_io.credits.target; + sc->recv_io.credits.target = le16_to_cpu(data_transfer->credits_requested); + sc->recv_io.credits.target = + min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); + sc->recv_io.credits.target = + max_t(u16, sc->recv_io.credits.target, 1); if (le16_to_cpu(data_transfer->credits_granted)) { atomic_add(le16_to_cpu(data_transfer->credits_granted), - &info->send_credits); + &sc->send_io.credits.count); /* * We have new send credits granted from remote peer * If any sender is waiting for credits, unblock it */ - wake_up_interruptible(&info->wait_send_queue); + wake_up(&sc->send_io.credits.wait_queue); } log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n", @@ -518,11 +757,11 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) le32_to_cpu(data_transfer->data_length), le32_to_cpu(data_transfer->remaining_data_length)); - /* Send a KEEP_ALIVE response right away if requested */ - info->keep_alive_requested = KEEP_ALIVE_NONE; + /* Send an immediate response right away if requested */ if (le16_to_cpu(data_transfer->flags) & SMBDIRECT_FLAG_RESPONSE_REQUESTED) { - info->keep_alive_requested = KEEP_ALIVE_PENDING; + log_keep_alive(INFO, "schedule send of immediate response\n"); + queue_work(sc->workqueue, &sc->idle.immediate_work); } /* @@ -530,10 +769,13 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) * reassembly queue and wake up the reading thread */ if (data_length) { - enqueue_reassembly(info, 
response, data_length); - wake_up_interruptible(&sc->recv_io.reassembly.wait_queue); + if (sc->recv_io.credits.target > old_recv_credit_target) + queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); + + enqueue_reassembly(sc, response, data_length); + wake_up(&sc->recv_io.reassembly.wait_queue); } else - put_receive_buffer(info, response); + put_receive_buffer(sc, response); return; @@ -548,19 +790,20 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) log_rdma_recv(ERR, "unexpected response type=%d\n", sc->recv_io.expected); WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER); error: - put_receive_buffer(info, response); - smbd_disconnect_rdma_connection(info); + put_receive_buffer(sc, response); + smbd_disconnect_rdma_connection(sc); } static struct rdma_cm_id *smbd_create_id( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct sockaddr *dstaddr, int port) { + struct smbdirect_socket_parameters *sp = &sc->parameters; struct rdma_cm_id *id; int rc; __be16 *sport; - id = rdma_create_id(&init_net, smbd_conn_upcall, info, + id = rdma_create_id(&init_net, smbd_conn_upcall, sc, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) { rc = PTR_ERR(id); @@ -575,43 +818,57 @@ static struct rdma_cm_id *smbd_create_id( *sport = htons(port); - init_completion(&info->ri_done); - info->ri_rc = -ETIMEDOUT; - + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING; rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr, - RDMA_RESOLVE_TIMEOUT); + sp->resolve_addr_timeout_msec); if (rc) { log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc); goto out; } - rc = wait_for_completion_interruptible_timeout( - &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + rc = wait_event_interruptible_timeout( + sc->status_wait, + sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING, + msecs_to_jiffies(sp->resolve_addr_timeout_msec)); /* e.g. if interrupted returns -ERESTARTSYS */ if (rc < 0) { log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc); goto out; } - rc = info->ri_rc; - if (rc) { + if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING) { + rc = -ETIMEDOUT; + log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc); + goto out; + } + if (sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED) { + rc = -EHOSTUNREACH; log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc); goto out; } - info->ri_rc = -ETIMEDOUT; - rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING; + rc = rdma_resolve_route(id, sp->resolve_route_timeout_msec); if (rc) { log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc); goto out; } - rc = wait_for_completion_interruptible_timeout( - &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + rc = wait_event_interruptible_timeout( + sc->status_wait, + sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING, + msecs_to_jiffies(sp->resolve_route_timeout_msec)); /* e.g. 
if interrupted returns -ERESTARTSYS */ if (rc < 0) { log_rdma_event(ERR, "rdma_resolve_route timeout rc: %i\n", rc); goto out; } - rc = info->ri_rc; - if (rc) { + if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING) { + rc = -ETIMEDOUT; + log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc); + goto out; + } + if (sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED) { + rc = -ENETUNREACH; log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc); goto out; } @@ -638,13 +895,16 @@ static bool frwr_is_supported(struct ib_device_attr *attrs) } static int smbd_ia_open( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct sockaddr *dstaddr, int port) { - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; int rc; - sc->rdma.cm_id = smbd_create_id(info, dstaddr, port); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED; + + sc->rdma.cm_id = smbd_create_id(sc, dstaddr, port); if (IS_ERR(sc->rdma.cm_id)) { rc = PTR_ERR(sc->rdma.cm_id); goto out1; @@ -659,19 +919,12 @@ static int smbd_ia_open( rc = -EPROTONOSUPPORT; goto out2; } - info->max_frmr_depth = min_t(int, - smbd_max_frmr_depth, + sp->max_frmr_depth = min_t(u32, + sp->max_frmr_depth, sc->ib.dev->attrs.max_fast_reg_page_list_len); - info->mr_type = IB_MR_TYPE_MEM_REG; + sc->mr_io.type = IB_MR_TYPE_MEM_REG; if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) - info->mr_type = IB_MR_TYPE_SG_GAPS; - - sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); - if (IS_ERR(sc->ib.pd)) { - rc = PTR_ERR(sc->ib.pd); - log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc); - goto out2; - } + sc->mr_io.type = IB_MR_TYPE_SG_GAPS; return 0; @@ -689,9 +942,8 @@ out1: * After negotiation, the transport is connected and ready for * carrying upper layer SMB payload */ -static int smbd_post_send_negotiate_req(struct smbd_connection *info) +static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_send_wr send_wr; int rc = -ENOMEM; @@ -743,18 +995,18 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) request->sge[0].addr, request->sge[0].length, request->sge[0].lkey); - atomic_inc(&info->send_pending); + atomic_inc(&sc->send_io.pending.count); rc = ib_post_send(sc->ib.qp, &send_wr, NULL); if (!rc) return 0; /* if we reach here, post send failed */ log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); - atomic_dec(&info->send_pending); + atomic_dec(&sc->send_io.pending.count); ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr, request->sge[0].length, DMA_TO_DEVICE); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); dma_mapping_failed: mempool_free(request, sc->send_io.mem.pool); @@ -769,14 +1021,20 @@ dma_mapping_failed: * buffer as possible, and extend the receive credits to the remote peer * return value: the new credits being granted. 
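+ * The grant is now derived from the difference between posted
+ * receives and credits already handed out, so the old separate
+ * new_credits_offered counter is no longer needed.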
*/ -static int manage_credits_prior_sending(struct smbd_connection *info) +static int manage_credits_prior_sending(struct smbdirect_socket *sc) { int new_credits; - spin_lock(&info->lock_new_credits_offered); - new_credits = info->new_credits_offered; - info->new_credits_offered = 0; - spin_unlock(&info->lock_new_credits_offered); + if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target) + return 0; + + new_credits = atomic_read(&sc->recv_io.posted.count); + if (new_credits == 0) + return 0; + + new_credits -= atomic_read(&sc->recv_io.credits.count); + if (new_credits <= 0) + return 0; return new_credits; } @@ -790,21 +1048,27 @@ static int manage_credits_prior_sending(struct smbd_connection *info) * 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set * 0: otherwise */ -static int manage_keep_alive_before_sending(struct smbd_connection *info) +static int manage_keep_alive_before_sending(struct smbdirect_socket *sc) { - if (info->keep_alive_requested == KEEP_ALIVE_PENDING) { - info->keep_alive_requested = KEEP_ALIVE_SENT; + struct smbdirect_socket_parameters *sp = &sc->parameters; + + if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT; + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); return 1; } return 0; } /* Post the send request */ -static int smbd_post_send(struct smbd_connection *info, +static int smbd_post_send(struct smbdirect_socket *sc, struct smbdirect_send_io *request) { - struct smbdirect_socket *sc = &info->socket; - struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_send_wr send_wr; int rc, i; @@ -831,21 +1095,17 @@ static int smbd_post_send(struct smbd_connection *info, rc = ib_post_send(sc->ib.qp, &send_wr, NULL); if (rc) { log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); rc = -EAGAIN; - } else - /* Reset timer for idle connection after packet is sent */ - mod_delayed_work(info->workqueue, &info->idle_timer_work, - msecs_to_jiffies(sp->keepalive_interval_msec)); + } return rc; } -static int smbd_post_send_iter(struct smbd_connection *info, +static int smbd_post_send_iter(struct smbdirect_socket *sc, struct iov_iter *iter, int *_remaining_data_length) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; int i, rc; int header_length; @@ -856,8 +1116,8 @@ static int smbd_post_send_iter(struct smbd_connection *info, wait_credit: /* Wait for send credits. 
A SMBD packet needs one credit */ - rc = wait_event_interruptible(info->wait_send_queue, - atomic_read(&info->send_credits) > 0 || + rc = wait_event_interruptible(sc->send_io.credits.wait_queue, + atomic_read(&sc->send_io.credits.count) > 0 || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (rc) goto err_wait_credit; @@ -867,14 +1127,14 @@ wait_credit: rc = -EAGAIN; goto err_wait_credit; } - if (unlikely(atomic_dec_return(&info->send_credits) < 0)) { - atomic_inc(&info->send_credits); + if (unlikely(atomic_dec_return(&sc->send_io.credits.count) < 0)) { + atomic_inc(&sc->send_io.credits.count); goto wait_credit; } wait_send_queue: - wait_event(info->wait_post_send, - atomic_read(&info->send_pending) < sp->send_credit_target || + wait_event(sc->send_io.pending.dec_wait_queue, + atomic_read(&sc->send_io.pending.count) < sp->send_credit_target || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { @@ -883,9 +1143,9 @@ wait_send_queue: goto err_wait_send_queue; } - if (unlikely(atomic_inc_return(&info->send_pending) > + if (unlikely(atomic_inc_return(&sc->send_io.pending.count) > sp->send_credit_target)) { - atomic_dec(&info->send_pending); + atomic_dec(&sc->send_io.pending.count); goto wait_send_queue; } @@ -898,10 +1158,30 @@ wait_send_queue: request->socket = sc; memset(request->sge, 0, sizeof(request->sge)); + /* Map the packet to DMA */ + header_length = sizeof(struct smbdirect_data_transfer); + /* If this is a packet without payload, don't send padding */ + if (!iter) + header_length = offsetof(struct smbdirect_data_transfer, padding); + + packet = smbdirect_send_io_payload(request); + request->sge[0].addr = ib_dma_map_single(sc->ib.dev, + (void *)packet, + header_length, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) { + rc = -EIO; + goto err_dma; + } + + request->sge[0].length = header_length; + request->sge[0].lkey = sc->ib.pd->local_dma_lkey; + request->num_sge = 1; + /* Fill in the data payload to find out how much data we can add */ if (iter) { struct smb_extract_to_rdma extract = { - .nr_sge = 1, + .nr_sge = request->num_sge, .max_sge = SMBDIRECT_SEND_IO_MAX_SGE, .sge = request->sge, .device = sc->ib.dev, @@ -920,21 +1200,17 @@ wait_send_queue: *_remaining_data_length -= data_length; } else { data_length = 0; - request->num_sge = 1; } /* Fill in the packet header */ - packet = smbdirect_send_io_payload(request); packet->credits_requested = cpu_to_le16(sp->send_credit_target); - new_credits = manage_credits_prior_sending(info); - atomic_add(new_credits, &info->receive_credits); + new_credits = manage_credits_prior_sending(sc); + atomic_add(new_credits, &sc->recv_io.credits.count); packet->credits_granted = cpu_to_le16(new_credits); - info->send_immediate = false; - packet->flags = 0; - if (manage_keep_alive_before_sending(info)) + if (manage_keep_alive_before_sending(sc)) packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); packet->reserved = 0; @@ -953,26 +1229,7 @@ wait_send_queue: le32_to_cpu(packet->data_length), le32_to_cpu(packet->remaining_data_length)); - /* Map the packet to DMA */ - header_length = sizeof(struct smbdirect_data_transfer); - /* If this is a packet without payload, don't send padding */ - if (!data_length) - header_length = offsetof(struct smbdirect_data_transfer, padding); - - request->sge[0].addr = ib_dma_map_single(sc->ib.dev, - (void *)packet, - header_length, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) { - rc = -EIO; - 
request->sge[0].addr = 0; - goto err_dma; - } - - request->sge[0].length = header_length; - request->sge[0].lkey = sc->ib.pd->local_dma_lkey; - - rc = smbd_post_send(info, request); + rc = smbd_post_send(sc, request); if (!rc) return 0; @@ -985,19 +1242,16 @@ err_dma: DMA_TO_DEVICE); mempool_free(request, sc->send_io.mem.pool); - /* roll back receive credits and credits to be offered */ - spin_lock(&info->lock_new_credits_offered); - info->new_credits_offered += new_credits; - spin_unlock(&info->lock_new_credits_offered); - atomic_sub(new_credits, &info->receive_credits); + /* roll back the granted receive credits */ + atomic_sub(new_credits, &sc->recv_io.credits.count); err_alloc: - if (atomic_dec_and_test(&info->send_pending)) - wake_up(&info->wait_send_pending); + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); err_wait_send_queue: /* roll back send credits and pending */ - atomic_inc(&info->send_credits); + atomic_inc(&sc->send_io.credits.count); err_wait_credit: return rc; @@ -1008,15 +1262,15 @@ err_wait_credit: * Empty message is used to extend credits to the peer for keepalive * while there is no upper layer payload to send at the time */ -static int smbd_post_send_empty(struct smbd_connection *info) +static int smbd_post_send_empty(struct smbdirect_socket *sc) { int remaining_data_length = 0; - info->count_send_empty++; - return smbd_post_send_iter(info, NULL, &remaining_data_length); + sc->statistics.send_empty++; + return smbd_post_send_iter(sc, NULL, &remaining_data_length); } -static int smbd_post_send_full_iter(struct smbd_connection *info, +static int smbd_post_send_full_iter(struct smbdirect_socket *sc, struct iov_iter *iter, int *_remaining_data_length) { @@ -1029,7 +1283,7 @@ static int smbd_post_send_full_iter(struct smbd_connection *info, */ while (iov_iter_count(iter) > 0) { - rc = smbd_post_send_iter(info, iter, _remaining_data_length); + rc = smbd_post_send_iter(sc, iter, _remaining_data_length); if (rc < 0) break; } @@ -1043,9 +1297,8 @@ static int smbd_post_send_full_iter(struct smbd_connection *info, * The interaction is controlled by the send/receive credit system */ static int smbd_post_recv( - struct smbd_connection *info, struct smbdirect_recv_io *response) + struct smbdirect_socket *sc, struct smbdirect_recv_io *response) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_recv_wr recv_wr; int rc = -EIO; @@ -1071,7 +1324,7 @@ static int smbd_post_recv( ib_dma_unmap_single(sc->ib.dev, response->sge.addr, response->sge.length, DMA_FROM_DEVICE); response->sge.length = 0; - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc); } @@ -1079,31 +1332,36 @@ } /* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */ -static int smbd_negotiate(struct smbd_connection *info) +static int smbd_negotiate(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; int rc; - struct smbdirect_recv_io *response = get_receive_buffer(info); + struct smbdirect_recv_io *response = get_receive_buffer(sc); + + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED); + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP; - rc = smbd_post_recv(info, response); + rc = smbd_post_recv(sc, response); log_rdma_event(INFO, "smbd_post_recv 
rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n", rc, response->sge.addr, response->sge.length, response->sge.lkey); - if (rc) + if (rc) { + put_receive_buffer(sc, response); return rc; + } - init_completion(&info->negotiate_completion); - info->negotiate_done = false; - rc = smbd_post_send_negotiate_req(info); + rc = smbd_post_send_negotiate_req(sc); if (rc) return rc; - rc = wait_for_completion_interruptible_timeout( - &info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ); - log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc); + rc = wait_event_interruptible_timeout( + sc->status_wait, + sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING, + msecs_to_jiffies(sp->negotiate_timeout_msec)); + log_rdma_event(INFO, "wait_event_interruptible_timeout rc=%d\n", rc); - if (info->negotiate_done) + if (sc->status == SMBDIRECT_SOCKET_CONNECTED) return 0; if (rc == 0) @@ -1127,13 +1385,13 @@ static int smbd_negotiate(struct smbd_connection *info) * data_length: the size of payload in this packet */ static void enqueue_reassembly( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct smbdirect_recv_io *response, int data_length) { - struct smbdirect_socket *sc = &info->socket; + unsigned long flags; - spin_lock(&sc->recv_io.reassembly.lock); + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); list_add_tail(&response->list, &sc->recv_io.reassembly.list); sc->recv_io.reassembly.queue_length++; /* @@ -1144,9 +1402,8 @@ static void enqueue_reassembly( */ virt_wmb(); sc->recv_io.reassembly.data_length += data_length; - spin_unlock(&sc->recv_io.reassembly.lock); - info->count_reassembly_queue++; - info->count_enqueue_reassembly_queue++; + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); + sc->statistics.enqueue_reassembly_queue++; } /* @@ -1154,9 +1411,8 @@ static void enqueue_reassembly( * Caller is responsible for locking * return value: the first entry if any, NULL if queue is empty */ -static struct smbdirect_recv_io *_get_first_reassembly(struct smbd_connection *info) +static struct smbdirect_recv_io *_get_first_reassembly(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_recv_io *ret = NULL; if (!list_empty(&sc->recv_io.reassembly.list)) { @@ -1173,9 +1429,8 @@ static struct smbdirect_recv_io *_get_first_reassembly(struct smbd_connection *i * pre-allocated in advance. * return value: the receive buffer, NULL if none is available */ -static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info) +static struct smbdirect_recv_io *get_receive_buffer(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_recv_io *ret = NULL; unsigned long flags; @@ -1185,8 +1440,7 @@ static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info &sc->recv_io.free.list, struct smbdirect_recv_io, list); list_del(&ret->list); - info->count_receive_queue--; - info->count_get_receive_buffer++; + sc->statistics.get_receive_buffer++; } spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); @@ -1200,9 +1454,8 @@ static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info * receive buffer is returned. 
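+ * Returning a buffer also kicks recv_io.posted.refill_work so
+ * that freed buffers get reposted as receives promptly.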
*/ static void put_receive_buffer( - struct smbd_connection *info, struct smbdirect_recv_io *response) + struct smbdirect_socket *sc, struct smbdirect_recv_io *response) { - struct smbdirect_socket *sc = &info->socket; unsigned long flags; if (likely(response->sge.length != 0)) { @@ -1215,31 +1468,18 @@ static void put_receive_buffer( spin_lock_irqsave(&sc->recv_io.free.lock, flags); list_add_tail(&response->list, &sc->recv_io.free.list); - info->count_receive_queue++; - info->count_put_receive_buffer++; + sc->statistics.put_receive_buffer++; spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); - queue_work(info->workqueue, &info->post_send_credits_work); + queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); } /* Preallocate all receive buffer on transport establishment */ -static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) +static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_recv_io *response; int i; - INIT_LIST_HEAD(&sc->recv_io.reassembly.list); - spin_lock_init(&sc->recv_io.reassembly.lock); - sc->recv_io.reassembly.data_length = 0; - sc->recv_io.reassembly.queue_length = 0; - - INIT_LIST_HEAD(&sc->recv_io.free.list); - spin_lock_init(&sc->recv_io.free.lock); - info->count_receive_queue = 0; - - init_waitqueue_head(&info->wait_receive_queues); - for (i = 0; i < num_buf; i++) { response = mempool_alloc(sc->recv_io.mem.pool, GFP_KERNEL); if (!response) @@ -1248,7 +1488,6 @@ static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) response->socket = sc; response->sge.length = 0; list_add_tail(&response->list, &sc->recv_io.free.list); - info->count_receive_queue++; } return 0; @@ -1259,45 +1498,59 @@ allocate_failed: &sc->recv_io.free.list, struct smbdirect_recv_io, list); list_del(&response->list); - info->count_receive_queue--; mempool_free(response, sc->recv_io.mem.pool); } return -ENOMEM; } -static void destroy_receive_buffers(struct smbd_connection *info) +static void destroy_receive_buffers(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_recv_io *response; - while ((response = get_receive_buffer(info))) + while ((response = get_receive_buffer(sc))) mempool_free(response, sc->recv_io.mem.pool); } +static void send_immediate_empty_message(struct work_struct *work) +{ + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, idle.immediate_work); + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + return; + + log_keep_alive(INFO, "send an empty message\n"); + smbd_post_send_empty(sc); +} + /* Implement idle connection timer [MS-SMBD] 3.1.6.2 */ static void idle_connection_timer(struct work_struct *work) { - struct smbd_connection *info = container_of( - work, struct smbd_connection, - idle_timer_work.work); - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, idle.timer_work.work); struct smbdirect_socket_parameters *sp = &sc->parameters; - if (info->keep_alive_requested != KEEP_ALIVE_NONE) { + if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) { log_keep_alive(ERR, - "error status info->keep_alive_requested=%d\n", - info->keep_alive_requested); - smbd_disconnect_rdma_connection(info); + "error status sc->idle.keepalive=%d\n", + sc->idle.keepalive); + smbd_disconnect_rdma_connection(sc); return; } - log_keep_alive(INFO, "about to send an empty idle message\n"); - smbd_post_send_empty(info); + if 
(sc->status != SMBDIRECT_SOCKET_CONNECTED) + return; - /* Setup the next idle timeout work */ - queue_delayed_work(info->workqueue, &info->idle_timer_work, - msecs_to_jiffies(sp->keepalive_interval_msec)); + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); + log_keep_alive(INFO, "schedule send of empty idle message\n"); + queue_work(sc->workqueue, &sc->idle.immediate_work); } /* @@ -1309,7 +1562,6 @@ void smbd_destroy(struct TCP_Server_Info *server) { struct smbd_connection *info = server->smbd_conn; struct smbdirect_socket *sc; - struct smbdirect_socket_parameters *sp; struct smbdirect_recv_io *response; unsigned long flags; @@ -1318,35 +1570,51 @@ void smbd_destroy(struct TCP_Server_Info *server) return; } sc = &info->socket; - sp = &sc->parameters; + + log_rdma_event(INFO, "cancelling and disable disconnect_work\n"); + disable_work_sync(&sc->disconnect_work); log_rdma_event(INFO, "destroying rdma session\n"); - if (sc->status != SMBDIRECT_SOCKET_DISCONNECTED) { - rdma_disconnect(sc->rdma.cm_id); + if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) { + smbd_disconnect_rdma_work(&sc->disconnect_work); log_rdma_event(INFO, "wait for transport being disconnected\n"); wait_event_interruptible( - info->status_wait, + sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); } + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. + * + * Most likely this was already called via + * smbd_disconnect_rdma_work(), but call it again... + */ + smbd_disconnect_wake_up_all(sc); + + log_rdma_event(INFO, "cancelling recv_io.posted.refill_work\n"); + disable_work_sync(&sc->recv_io.posted.refill_work); + log_rdma_event(INFO, "destroying qp\n"); ib_drain_qp(sc->ib.qp); rdma_destroy_qp(sc->rdma.cm_id); sc->ib.qp = NULL; log_rdma_event(INFO, "cancelling idle timer\n"); - cancel_delayed_work_sync(&info->idle_timer_work); + disable_delayed_work_sync(&sc->idle.timer_work); + log_rdma_event(INFO, "cancelling send immediate work\n"); + disable_work_sync(&sc->idle.immediate_work); /* It's not possible for upper layer to get to reassembly */ log_rdma_event(INFO, "drain the reassembly queue\n"); do { spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); - response = _get_first_reassembly(info); + response = _get_first_reassembly(sc); if (response) { list_del(&response->list); spin_unlock_irqrestore( &sc->recv_io.reassembly.lock, flags); - put_receive_buffer(info, response); + put_receive_buffer(sc, response); } else spin_unlock_irqrestore( &sc->recv_io.reassembly.lock, flags); @@ -1354,9 +1622,7 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->recv_io.reassembly.data_length = 0; log_rdma_event(INFO, "free receive buffers\n"); - wait_event(info->wait_receive_queues, - info->count_receive_queue == sp->recv_credit_max); - destroy_receive_buffers(info); + destroy_receive_buffers(sc); /* * For performance reasons, memory registration and deregistration @@ -1366,13 +1632,12 @@ void smbd_destroy(struct TCP_Server_Info *server) * path when sending data, and then release memory registrations. 
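+ * smbd_destroy() therefore waits (dropping the server lock so
+ * I/O can make progress) until all in-use MRs are released
+ * before tearing down the list.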
*/ log_rdma_event(INFO, "freeing mr list\n"); - wake_up_interruptible_all(&info->wait_mr); - while (atomic_read(&info->mr_used_count)) { + while (atomic_read(&sc->mr_io.used.count)) { cifs_server_unlock(server); msleep(1000); cifs_server_lock(server); } - destroy_mr_list(info); + destroy_mr_list(sc); ib_free_cq(sc->ib.send_cq); ib_free_cq(sc->ib.recv_cq); @@ -1388,7 +1653,7 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->status = SMBDIRECT_SOCKET_DESTROYED; - destroy_workqueue(info->workqueue); + destroy_workqueue(sc->workqueue); log_rdma_event(INFO, "rdma session destroyed\n"); kfree(info); server->smbd_conn = NULL; @@ -1430,12 +1695,9 @@ create_conn: return -ENOENT; } -static void destroy_caches_and_workqueue(struct smbd_connection *info) +static void destroy_caches(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; - - destroy_receive_buffers(info); - destroy_workqueue(info->workqueue); + destroy_receive_buffers(sc); mempool_destroy(sc->recv_io.mem.pool); kmem_cache_destroy(sc->recv_io.mem.cache); mempool_destroy(sc->send_io.mem.pool); @@ -1443,9 +1705,8 @@ static void destroy_caches_and_workqueue(struct smbd_connection *info) } #define MAX_NAME_LEN 80 -static int allocate_caches_and_workqueue(struct smbd_connection *info) +static int allocate_caches(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; char name[MAX_NAME_LEN]; int rc; @@ -1453,7 +1714,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer))) return -ENOMEM; - scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", sc); sc->send_io.mem.cache = kmem_cache_create( name, @@ -1469,7 +1730,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) if (!sc->send_io.mem.pool) goto out1; - scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", sc); struct kmem_cache_args response_args = { .align = __alignof__(struct smbdirect_recv_io), @@ -1490,21 +1751,14 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) if (!sc->recv_io.mem.pool) goto out3; - scnprintf(name, MAX_NAME_LEN, "smbd_%p", info); - info->workqueue = create_workqueue(name); - if (!info->workqueue) - goto out4; - - rc = allocate_receive_buffers(info, sp->recv_credit_max); + rc = allocate_receive_buffers(sc, sp->recv_credit_max); if (rc) { log_rdma_event(ERR, "failed to allocate receive buffers\n"); - goto out5; + goto out4; } return 0; -out5: - destroy_workqueue(info->workqueue); out4: mempool_destroy(sc->recv_io.mem.pool); out3: @@ -1528,46 +1782,63 @@ static struct smbd_connection *_smbd_get_connection( struct ib_qp_init_attr qp_attr; struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr; struct ib_port_immutable port_immutable; - u32 ird_ord_hdr[2]; + __be32 ird_ord_hdr[2]; + char wq_name[80]; + struct workqueue_struct *workqueue; info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL); if (!info) return NULL; sc = &info->socket; + scnprintf(wq_name, ARRAY_SIZE(wq_name), "smbd_%p", sc); + workqueue = create_workqueue(wq_name); + if (!workqueue) + goto create_wq_failed; + smbdirect_socket_init(sc); + sc->workqueue = workqueue; sp = &sc->parameters; - sc->status = SMBDIRECT_SOCKET_CONNECTING; - rc = smbd_ia_open(info, dstaddr, port); + INIT_WORK(&sc->disconnect_work, 
smbd_disconnect_rdma_work); + + sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT; + sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT; + sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT; + sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000; + sp->initiator_depth = 1; + sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES; + sp->recv_credit_max = smbd_receive_credit_max; + sp->send_credit_target = smbd_send_credit_target; + sp->max_send_size = smbd_max_send_size; + sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size; + sp->max_recv_size = smbd_max_receive_size; + sp->max_frmr_depth = smbd_max_frmr_depth; + sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000; + sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000; + + rc = smbd_ia_open(sc, dstaddr, port); if (rc) { log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc); goto create_id_failed; } - if (smbd_send_credit_target > sc->ib.dev->attrs.max_cqe || - smbd_send_credit_target > sc->ib.dev->attrs.max_qp_wr) { + if (sp->send_credit_target > sc->ib.dev->attrs.max_cqe || + sp->send_credit_target > sc->ib.dev->attrs.max_qp_wr) { log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", - smbd_send_credit_target, + sp->send_credit_target, sc->ib.dev->attrs.max_cqe, sc->ib.dev->attrs.max_qp_wr); goto config_failed; } - if (smbd_receive_credit_max > sc->ib.dev->attrs.max_cqe || - smbd_receive_credit_max > sc->ib.dev->attrs.max_qp_wr) { + if (sp->recv_credit_max > sc->ib.dev->attrs.max_cqe || + sp->recv_credit_max > sc->ib.dev->attrs.max_qp_wr) { log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", - smbd_receive_credit_max, + sp->recv_credit_max, sc->ib.dev->attrs.max_cqe, sc->ib.dev->attrs.max_qp_wr); goto config_failed; } - sp->recv_credit_max = smbd_receive_credit_max; - sp->send_credit_target = smbd_send_credit_target; - sp->max_send_size = smbd_max_send_size; - sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size; - sp->max_recv_size = smbd_max_receive_size; - sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000; - if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE || sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) { log_rdma_event(ERR, @@ -1579,8 +1850,16 @@ static struct smbd_connection *_smbd_get_connection( goto config_failed; } + sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); + if (IS_ERR(sc->ib.pd)) { + rc = PTR_ERR(sc->ib.pd); + sc->ib.pd = NULL; + log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc); + goto alloc_pd_failed; + } + sc->ib.send_cq = - ib_alloc_cq_any(sc->ib.dev, info, + ib_alloc_cq_any(sc->ib.dev, sc, sp->send_credit_target, IB_POLL_SOFTIRQ); if (IS_ERR(sc->ib.send_cq)) { sc->ib.send_cq = NULL; @@ -1588,7 +1867,7 @@ static struct smbd_connection *_smbd_get_connection( } sc->ib.recv_cq = - ib_alloc_cq_any(sc->ib.dev, info, + ib_alloc_cq_any(sc->ib.dev, sc, sp->recv_credit_max, IB_POLL_SOFTIRQ); if (IS_ERR(sc->ib.recv_cq)) { sc->ib.recv_cq = NULL; @@ -1597,7 +1876,7 @@ static struct smbd_connection *_smbd_get_connection( memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.event_handler = smbd_qp_async_error_upcall; - qp_attr.qp_context = info; + qp_attr.qp_context = sc; qp_attr.cap.max_send_wr = sp->send_credit_target; qp_attr.cap.max_recv_wr = sp->recv_credit_max; qp_attr.cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; @@ -1616,22 +1895,22 @@ static struct smbd_connection *_smbd_get_connection( } 
sc->ib.qp = sc->rdma.cm_id->qp; - memset(&conn_param, 0, sizeof(conn_param)); - conn_param.initiator_depth = 0; - - conn_param.responder_resources = - min(sc->ib.dev->attrs.max_qp_rd_atom, - SMBD_CM_RESPONDER_RESOURCES); - info->responder_resources = conn_param.responder_resources; + sp->responder_resources = + min_t(u8, sp->responder_resources, + sc->ib.dev->attrs.max_qp_rd_atom); log_rdma_mr(INFO, "responder_resources=%d\n", - info->responder_resources); + sp->responder_resources); + + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.initiator_depth = sp->initiator_depth; + conn_param.responder_resources = sp->responder_resources; /* Need to send IRD/ORD in private data for iWARP */ sc->ib.dev->ops.get_port_immutable( sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable); if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { - ird_ord_hdr[0] = info->responder_resources; - ird_ord_hdr[1] = 1; + ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); + ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); conn_param.private_data = ird_ord_hdr; conn_param.private_data_len = sizeof(ird_ord_hdr); } else { @@ -1646,8 +1925,8 @@ static struct smbd_connection *_smbd_get_connection( log_rdma_event(INFO, "connecting to IP %pI4 port %d\n", &addr_in->sin_addr, port); - init_waitqueue_head(&info->status_wait); - init_waitqueue_head(&sc->recv_io.reassembly.wait_queue); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; rc = rdma_connect(sc->rdma.cm_id, &conn_param); if (rc) { log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc); @@ -1655,45 +1934,42 @@ static struct smbd_connection *_smbd_get_connection( } wait_event_interruptible_timeout( - info->status_wait, - sc->status != SMBDIRECT_SOCKET_CONNECTING, - msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + sc->status_wait, + sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING, + msecs_to_jiffies(sp->rdma_connect_timeout_msec)); - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { + if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) { log_rdma_event(ERR, "rdma_connect failed port=%d\n", port); goto rdma_connect_failed; } log_rdma_event(INFO, "rdma_connect connected\n"); - rc = allocate_caches_and_workqueue(info); + rc = allocate_caches(sc); if (rc) { log_rdma_event(ERR, "cache allocation failed\n"); goto allocate_cache_failed; } - init_waitqueue_head(&info->wait_send_queue); - INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer); - queue_delayed_work(info->workqueue, &info->idle_timer_work, - msecs_to_jiffies(sp->keepalive_interval_msec)); - - init_waitqueue_head(&info->wait_send_pending); - atomic_set(&info->send_pending, 0); - - init_waitqueue_head(&info->wait_post_send); + INIT_WORK(&sc->idle.immediate_work, send_immediate_empty_message); + INIT_DELAYED_WORK(&sc->idle.timer_work, idle_connection_timer); + /* + * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING + * so that the timer will cause a disconnect. 
+ */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->negotiate_timeout_msec)); - INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work); - INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits); - info->new_credits_offered = 0; - spin_lock_init(&info->lock_new_credits_offered); + INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits); - rc = smbd_negotiate(info); + rc = smbd_negotiate(sc); if (rc) { log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc); goto negotiation_failed; } - rc = allocate_mr_list(info); + rc = allocate_mr_list(sc); if (rc) { log_rdma_mr(ERR, "memory registration allocation failed\n"); goto allocate_mr_failed; @@ -1708,11 +1984,11 @@ allocate_mr_failed: return NULL; negotiation_failed: - cancel_delayed_work_sync(&info->idle_timer_work); - destroy_caches_and_workqueue(info); + disable_delayed_work_sync(&sc->idle.timer_work); + destroy_caches(sc); sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; rdma_disconnect(sc->rdma.cm_id); - wait_event(info->status_wait, + wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); allocate_cache_failed: @@ -1726,11 +2002,15 @@ alloc_cq_failed: if (sc->ib.recv_cq) ib_free_cq(sc->ib.recv_cq); -config_failed: ib_dealloc_pd(sc->ib.pd); + +alloc_pd_failed: +config_failed: rdma_destroy_id(sc->rdma.cm_id); create_id_failed: + destroy_workqueue(sc->workqueue); +create_wq_failed: kfree(info); return NULL; } @@ -1739,6 +2019,7 @@ struct smbd_connection *smbd_get_connection( struct TCP_Server_Info *server, struct sockaddr *dstaddr) { struct smbd_connection *ret; + const struct smbdirect_socket_parameters *sp; int port = SMBD_PORT; try_again: @@ -1749,6 +2030,16 @@ try_again: port = SMB_PORT; goto try_again; } + if (!ret) + return NULL; + + sp = &ret->socket.parameters; + + server->rdma_readwrite_threshold = + rdma_readwrite_threshold > sp->max_fragmented_send_size ? 
+ sp->max_fragmented_send_size : + rdma_readwrite_threshold; + return ret; } @@ -1790,6 +2081,7 @@ again: if (sc->recv_io.reassembly.data_length >= size) { int queue_length; int queue_removed = 0; + unsigned long flags; /* * Need to make sure reassembly_data_length is read before @@ -1804,7 +2096,7 @@ again: to_read = size; offset = sc->recv_io.reassembly.first_entry_offset; while (data_read < size) { - response = _get_first_reassembly(info); + response = _get_first_reassembly(sc); data_transfer = smbdirect_recv_io_payload(response); data_length = le32_to_cpu(data_transfer->data_length); remaining_data_length = @@ -1849,16 +2141,15 @@ again: if (queue_length) list_del(&response->list); else { - spin_lock_irq( - &sc->recv_io.reassembly.lock); + spin_lock_irqsave( + &sc->recv_io.reassembly.lock, flags); list_del(&response->list); - spin_unlock_irq( - &sc->recv_io.reassembly.lock); + spin_unlock_irqrestore( + &sc->recv_io.reassembly.lock, flags); } queue_removed++; - info->count_reassembly_queue--; - info->count_dequeue_reassembly_queue++; - put_receive_buffer(info, response); + sc->statistics.dequeue_reassembly_queue++; + put_receive_buffer(sc, response); offset = 0; log_read(INFO, "put_receive_buffer offset=0\n"); } else @@ -1872,10 +2163,10 @@ again: to_read, data_read, offset); } - spin_lock_irq(&sc->recv_io.reassembly.lock); + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); sc->recv_io.reassembly.data_length -= data_read; sc->recv_io.reassembly.queue_length -= queue_removed; - spin_unlock_irq(&sc->recv_io.reassembly.lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); sc->recv_io.reassembly.first_entry_offset = offset; log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", @@ -1960,13 +2251,13 @@ int smbd_send(struct TCP_Server_Info *server, klen += rqst->rq_iov[i].iov_len; iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen); - rc = smbd_post_send_full_iter(info, &iter, &remaining_data_length); + rc = smbd_post_send_full_iter(sc, &iter, &remaining_data_length); if (rc < 0) break; if (iov_iter_count(&rqst->rq_iter) > 0) { /* And then the data pages if there are any */ - rc = smbd_post_send_full_iter(info, &rqst->rq_iter, + rc = smbd_post_send_full_iter(sc, &rqst->rq_iter, &remaining_data_length); if (rc < 0) break; @@ -1981,8 +2272,8 @@ int smbd_send(struct TCP_Server_Info *server, * that means all the I/Os have been out and we are good to return */ - wait_event(info->wait_send_pending, - atomic_read(&info->send_pending) == 0 || + wait_event(sc->send_io.pending.zero_wait_queue, + atomic_read(&sc->send_io.pending.count) == 0 || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0) @@ -1993,14 +2284,13 @@ int smbd_send(struct TCP_Server_Info *server, static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smbd_mr *mr; - struct ib_cqe *cqe; + struct smbdirect_mr_io *mr = + container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe); + struct smbdirect_socket *sc = mr->socket; if (wc->status) { log_rdma_mr(ERR, "status=%d\n", wc->status); - cqe = wc->wr_cqe; - mr = container_of(cqe, struct smbd_mr, cqe); - smbd_disconnect_rdma_connection(mr->conn); + smbd_disconnect_rdma_connection(sc); } } @@ -2015,14 +2305,14 @@ static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc) */ static void smbd_mr_recovery_work(struct work_struct *work) { - struct smbd_connection *info = - container_of(work, struct smbd_connection, mr_recovery_work); - struct 
smbdirect_socket *sc = &info->socket; - struct smbd_mr *smbdirect_mr; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, mr_io.recovery_work); + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_mr_io *smbdirect_mr; int rc; - list_for_each_entry(smbdirect_mr, &info->mr_list, list) { - if (smbdirect_mr->state == MR_ERROR) { + list_for_each_entry(smbdirect_mr, &sc->mr_io.all.list, list) { + if (smbdirect_mr->state == SMBDIRECT_MR_ERROR) { /* recover this MR entry */ rc = ib_dereg_mr(smbdirect_mr->mr); @@ -2030,25 +2320,25 @@ static void smbd_mr_recovery_work(struct work_struct *work) log_rdma_mr(ERR, "ib_dereg_mr failed rc=%x\n", rc); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); continue; } smbdirect_mr->mr = ib_alloc_mr( - sc->ib.pd, info->mr_type, - info->max_frmr_depth); + sc->ib.pd, sc->mr_io.type, + sp->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", - info->mr_type, - info->max_frmr_depth); - smbd_disconnect_rdma_connection(info); + sc->mr_io.type, + sp->max_frmr_depth); + smbd_disconnect_rdma_connection(sc); continue; } } else /* This MR is being used, don't recover it */ continue; - smbdirect_mr->state = MR_READY; + smbdirect_mr->state = SMBDIRECT_MR_READY; /* smbdirect_mr->state is updated by this function * and is read and updated by I/O issuing CPUs trying @@ -2057,19 +2347,18 @@ static void smbd_mr_recovery_work(struct work_struct *work) * value is updated before waking up any calls to * get_mr() from the I/O issuing CPUs */ - if (atomic_inc_return(&info->mr_ready_count) == 1) - wake_up_interruptible(&info->wait_mr); + if (atomic_inc_return(&sc->mr_io.ready.count) == 1) + wake_up(&sc->mr_io.ready.wait_queue); } } -static void destroy_mr_list(struct smbd_connection *info) +static void destroy_mr_list(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; - struct smbd_mr *mr, *tmp; + struct smbdirect_mr_io *mr, *tmp; - cancel_work_sync(&info->mr_recovery_work); - list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { - if (mr->state == MR_INVALIDATED) + disable_work_sync(&sc->mr_io.recovery_work); + list_for_each_entry_safe(mr, tmp, &sc->mr_io.all.list, list) { + if (mr->state == SMBDIRECT_MR_INVALIDATED) ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); ib_dereg_mr(mr->mr); @@ -2085,32 +2374,32 @@ static void destroy_mr_list(struct smbd_connection *info) * Recovery is done in smbd_mr_recovery_work. 
The content of list entry changes * as MRs are used and recovered for I/O, but the list links will not change */ -static int allocate_mr_list(struct smbd_connection *info) +static int allocate_mr_list(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; int i; - struct smbd_mr *smbdirect_mr, *tmp; - - INIT_LIST_HEAD(&info->mr_list); - init_waitqueue_head(&info->wait_mr); - spin_lock_init(&info->mr_list_lock); - atomic_set(&info->mr_ready_count, 0); - atomic_set(&info->mr_used_count, 0); - init_waitqueue_head(&info->wait_for_mr_cleanup); - INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); + struct smbdirect_mr_io *smbdirect_mr, *tmp; + + INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work); + + if (sp->responder_resources == 0) { + log_rdma_mr(ERR, "responder_resources negotiated as 0\n"); + return -EINVAL; + } + /* Allocate more MRs (2x) than hardware responder_resources */ - for (i = 0; i < info->responder_resources * 2; i++) { + for (i = 0; i < sp->responder_resources * 2; i++) { smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); if (!smbdirect_mr) goto cleanup_entries; - smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, info->mr_type, - info->max_frmr_depth); + smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, sc->mr_io.type, + sp->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", - info->mr_type, info->max_frmr_depth); + sc->mr_io.type, sp->max_frmr_depth); goto out; } - smbdirect_mr->sgt.sgl = kcalloc(info->max_frmr_depth, + smbdirect_mr->sgt.sgl = kcalloc(sp->max_frmr_depth, sizeof(struct scatterlist), GFP_KERNEL); if (!smbdirect_mr->sgt.sgl) { @@ -2118,18 +2407,18 @@ static int allocate_mr_list(struct smbd_connection *info) ib_dereg_mr(smbdirect_mr->mr); goto out; } - smbdirect_mr->state = MR_READY; - smbdirect_mr->conn = info; + smbdirect_mr->state = SMBDIRECT_MR_READY; + smbdirect_mr->socket = sc; - list_add_tail(&smbdirect_mr->list, &info->mr_list); - atomic_inc(&info->mr_ready_count); + list_add_tail(&smbdirect_mr->list, &sc->mr_io.all.list); + atomic_inc(&sc->mr_io.ready.count); } return 0; out: kfree(smbdirect_mr); cleanup_entries: - list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) { + list_for_each_entry_safe(smbdirect_mr, tmp, &sc->mr_io.all.list, list) { list_del(&smbdirect_mr->list); ib_dereg_mr(smbdirect_mr->mr); kfree(smbdirect_mr->sgt.sgl); @@ -2146,14 +2435,14 @@ cleanup_entries: * issuing I/O trying to get MR at the same time, mr_list_lock is used to * protect this situation. 
*/ -static struct smbd_mr *get_mr(struct smbd_connection *info) +static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; - struct smbd_mr *ret; + struct smbdirect_mr_io *ret; + unsigned long flags; int rc; again: - rc = wait_event_interruptible(info->wait_mr, - atomic_read(&info->mr_ready_count) || + rc = wait_event_interruptible(sc->mr_io.ready.wait_queue, + atomic_read(&sc->mr_io.ready.count) || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (rc) { log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc); @@ -2165,18 +2454,18 @@ again: return NULL; } - spin_lock(&info->mr_list_lock); - list_for_each_entry(ret, &info->mr_list, list) { - if (ret->state == MR_READY) { - ret->state = MR_REGISTERED; - spin_unlock(&info->mr_list_lock); - atomic_dec(&info->mr_ready_count); - atomic_inc(&info->mr_used_count); + spin_lock_irqsave(&sc->mr_io.all.lock, flags); + list_for_each_entry(ret, &sc->mr_io.all.list, list) { + if (ret->state == SMBDIRECT_MR_READY) { + ret->state = SMBDIRECT_MR_REGISTERED; + spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); + atomic_dec(&sc->mr_io.ready.count); + atomic_inc(&sc->mr_io.used.count); return ret; } } - spin_unlock(&info->mr_list_lock); + spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); /* * It is possible that we could fail to get MR because other processes may * try to acquire a MR at the same time. If this is the case, retry it. @@ -2187,8 +2476,7 @@ again: /* * Transcribe the pages from an iterator into an MR scatterlist. */ -static int smbd_iter_to_mr(struct smbd_connection *info, - struct iov_iter *iter, +static int smbd_iter_to_mr(struct iov_iter *iter, struct sg_table *sgt, unsigned int max_sg) { @@ -2210,25 +2498,26 @@ static int smbd_iter_to_mr(struct smbd_connection *info, * need_invalidate: true if this MR needs to be locally invalidated after I/O * return value: the MR registered, NULL if failed. 
*/ -struct smbd_mr *smbd_register_mr(struct smbd_connection *info, +struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, struct iov_iter *iter, bool writing, bool need_invalidate) { struct smbdirect_socket *sc = &info->socket; - struct smbd_mr *smbdirect_mr; + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_mr_io *smbdirect_mr; int rc, num_pages; enum dma_data_direction dir; struct ib_reg_wr *reg_wr; - num_pages = iov_iter_npages(iter, info->max_frmr_depth + 1); - if (num_pages > info->max_frmr_depth) { + num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1); + if (num_pages > sp->max_frmr_depth) { log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n", - num_pages, info->max_frmr_depth); + num_pages, sp->max_frmr_depth); WARN_ON_ONCE(1); return NULL; } - smbdirect_mr = get_mr(info); + smbdirect_mr = get_mr(sc); if (!smbdirect_mr) { log_rdma_mr(ERR, "get_mr returning NULL\n"); return NULL; @@ -2241,8 +2530,8 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info, smbdirect_mr->sgt.orig_nents = 0; log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n", - num_pages, iov_iter_count(iter), info->max_frmr_depth); - smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth); + num_pages, iov_iter_count(iter), sp->max_frmr_depth); + smbd_iter_to_mr(iter, &smbdirect_mr->sgt, sp->max_frmr_depth); rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, smbdirect_mr->sgt.nents, dir); @@ -2287,32 +2576,32 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info, log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n", rc, reg_wr->key); - /* If all failed, attempt to recover this MR by setting it MR_ERROR*/ + /* If all failed, attempt to recover this MR by setting it SMBDIRECT_MR_ERROR */ map_mr_error: ib_dma_unmap_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, smbdirect_mr->sgt.nents, smbdirect_mr->dir); dma_map_error: - smbdirect_mr->state = MR_ERROR; - if (atomic_dec_and_test(&info->mr_used_count)) - wake_up(&info->wait_for_mr_cleanup); + smbdirect_mr->state = SMBDIRECT_MR_ERROR; + if (atomic_dec_and_test(&sc->mr_io.used.count)) + wake_up(&sc->mr_io.cleanup.wait_queue); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); return NULL; } static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smbd_mr *smbdirect_mr; + struct smbdirect_mr_io *smbdirect_mr; struct ib_cqe *cqe; cqe = wc->wr_cqe; - smbdirect_mr = container_of(cqe, struct smbd_mr, cqe); - smbdirect_mr->state = MR_INVALIDATED; + smbdirect_mr = container_of(cqe, struct smbdirect_mr_io, cqe); + smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED; if (wc->status != IB_WC_SUCCESS) { log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status); - smbdirect_mr->state = MR_ERROR; + smbdirect_mr->state = SMBDIRECT_MR_ERROR; } complete(&smbdirect_mr->invalidate_done); } @@ -2323,11 +2612,10 @@ static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc) * and we have to locally invalidate the buffer to prevent the data from being * modified by the remote peer after the upper layer consumes it */ -int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) +int smbd_deregister_mr(struct smbdirect_mr_io *smbdirect_mr) { struct ib_send_wr *wr; - struct smbd_connection *info = smbdirect_mr->conn; - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = smbdirect_mr->socket; int rc = 0; if (smbdirect_mr->need_invalidate) { @@ -2344,36 +2632,36 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) rc = ib_post_send(sc->ib.qp, wr, NULL);
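/* * Editor's note (annotation, not part of the patch): the work request posted here * is the local-invalidate WR prepared in the elided lines above (IB_WR_LOCAL_INV); * it revokes the MR's rkey so the remote peer can no longer use it, and its * completion runs local_inv_done(), which completes smbdirect_mr->invalidate_done * that this function waits on below. */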
if (rc) { log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); goto done; } wait_for_completion(&smbdirect_mr->invalidate_done); smbdirect_mr->need_invalidate = false; } else /* - * For remote invalidation, just set it to MR_INVALIDATED + * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED * and defer to mr_recovery_work to recover the MR for next use */ - smbdirect_mr->state = MR_INVALIDATED; + smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED; - if (smbdirect_mr->state == MR_INVALIDATED) { + if (smbdirect_mr->state == SMBDIRECT_MR_INVALIDATED) { ib_dma_unmap_sg( sc->ib.dev, smbdirect_mr->sgt.sgl, smbdirect_mr->sgt.nents, smbdirect_mr->dir); - smbdirect_mr->state = MR_READY; - if (atomic_inc_return(&info->mr_ready_count) == 1) - wake_up_interruptible(&info->wait_mr); + smbdirect_mr->state = SMBDIRECT_MR_READY; + if (atomic_inc_return(&sc->mr_io.ready.count) == 1) + wake_up(&sc->mr_io.ready.wait_queue); } else /* * Schedule the work to do MR recovery for future I/Os; MR * recovery is slow and we don't want it to block current I/O */ - queue_work(info->workqueue, &info->mr_recovery_work); + queue_work(sc->workqueue, &sc->mr_io.recovery_work); done: - if (atomic_dec_and_test(&info->mr_used_count)) - wake_up(&info->wait_for_mr_cleanup); + if (atomic_dec_and_test(&sc->mr_io.used.count)) + wake_up(&sc->mr_io.cleanup.wait_queue); return rc; } diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index e45aa9ddd71d..d67ac5ddaff4 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -27,12 +27,6 @@ extern int smbd_max_send_size; extern int smbd_send_credit_target; extern int smbd_receive_credit_max; -enum keep_alive_status { - KEEP_ALIVE_NONE, - KEEP_ALIVE_PENDING, - KEEP_ALIVE_SENT, -}; - /* * The context for the SMBDirect transport * Everything related to the transport is here. It has several logical parts @@ -44,79 +38,14 @@ enum keep_alive_status { struct smbd_connection { struct smbdirect_socket socket; - - int ri_rc; - struct completion ri_done; - wait_queue_head_t status_wait; - - struct completion negotiate_completion; - bool negotiate_done; - - struct work_struct disconnect_work; - struct work_struct post_send_credits_work; - - spinlock_t lock_new_credits_offered; - int new_credits_offered; - - /* dynamic connection parameters defined in [MS-SMBD] 3.1.1.1 */ - enum keep_alive_status keep_alive_requested; - int protocol; - atomic_t send_credits; - atomic_t receive_credits; - int receive_credit_target; - - /* Memory registrations */ - /* Maximum number of RDMA read/write outstanding on this connection */ - int responder_resources; - /* Maximum number of pages in a single RDMA write/read on this connection */ - int max_frmr_depth; - /* - * If payload is less than or equal to the threshold, - * use RDMA send/recv to send upper layer I/O. - * If payload is more than the threshold, - * use RDMA read/write through memory registration for I/O. 
- */ - int rdma_readwrite_threshold; - enum ib_mr_type mr_type; - struct list_head mr_list; - spinlock_t mr_list_lock; - /* The number of available MRs ready for memory registration */ - atomic_t mr_ready_count; - atomic_t mr_used_count; - wait_queue_head_t wait_mr; - struct work_struct mr_recovery_work; - /* Used by transport to wait until all MRs are returned */ - wait_queue_head_t wait_for_mr_cleanup; - - /* Activity accounting */ - atomic_t send_pending; - wait_queue_head_t wait_send_pending; - wait_queue_head_t wait_post_send; - - /* Receive queue */ - int count_receive_queue; - wait_queue_head_t wait_receive_queues; - - bool send_immediate; - - wait_queue_head_t wait_send_queue; - - struct workqueue_struct *workqueue; - struct delayed_work idle_timer_work; - - /* for debug purposes */ - unsigned int count_get_receive_buffer; - unsigned int count_put_receive_buffer; - unsigned int count_reassembly_queue; - unsigned int count_enqueue_reassembly_queue; - unsigned int count_dequeue_reassembly_queue; - unsigned int count_send_empty; }; /* Create a SMBDirect session */ struct smbd_connection *smbd_get_connection( struct TCP_Server_Info *server, struct sockaddr *dstaddr); +const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn); + /* Reconnect SMBDirect session */ int smbd_reconnect(struct TCP_Server_Info *server); /* Destroy SMBDirect session */ @@ -127,34 +56,11 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg); int smbd_send(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst); -enum mr_state { - MR_READY, - MR_REGISTERED, - MR_INVALIDATED, - MR_ERROR -}; - -struct smbd_mr { - struct smbd_connection *conn; - struct list_head list; - enum mr_state state; - struct ib_mr *mr; - struct sg_table sgt; - enum dma_data_direction dir; - union { - struct ib_reg_wr wr; - struct ib_send_wr inv_wr; - }; - struct ib_cqe cqe; - bool need_invalidate; - struct completion invalidate_done; -}; - /* Interfaces to register and deregister MR for RDMA read/write */ -struct smbd_mr *smbd_register_mr( +struct smbdirect_mr_io *smbd_register_mr( struct smbd_connection *info, struct iov_iter *iter, bool writing, bool need_invalidate); -int smbd_deregister_mr(struct smbd_mr *mr); +int smbd_deregister_mr(struct smbdirect_mr_io *mr); #else #define cifs_rdma_enabled(server) 0 diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 93e5b2bb9f28..fd650e2afc76 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -669,13 +669,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(posix_query_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter); -DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(unlink_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_wsl_ea_compound_enter); -DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter); @@ -710,13 +709,12 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(posix_query_info_compound_done); 
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done); -DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(unlink_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done); -DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done); @@ -756,14 +754,13 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(posix_query_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err); -DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(unlink_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); -DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err); @@ -1171,8 +1168,54 @@ DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \ __u64 lease_key_high), \ TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high)) -DEFINE_SMB3_LEASE_DONE_EVENT(lease_done); -DEFINE_SMB3_LEASE_DONE_EVENT(lease_not_found); +DEFINE_SMB3_LEASE_DONE_EVENT(lease_ack_done); +/* Tracepoint when a lease break request is received/entered (includes epoch and flags) */ +DECLARE_EVENT_CLASS(smb3_lease_enter_class, + TP_PROTO(__u32 lease_state, + __u32 flags, + __u16 epoch, + __u32 tid, + __u64 sesid, + __u64 lease_key_low, + __u64 lease_key_high), + TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high), + TP_STRUCT__entry( + __field(__u32, lease_state) + __field(__u32, flags) + __field(__u16, epoch) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, lease_key_low) + __field(__u64, lease_key_high) + ), + TP_fast_assign( + __entry->lease_state = lease_state; + __entry->flags = flags; + __entry->epoch = epoch; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->lease_key_low = lease_key_low; + __entry->lease_key_high = lease_key_high; + ), + TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x flags=0x%x epoch=%u", + __entry->sesid, __entry->tid, __entry->lease_key_high, + __entry->lease_key_low, __entry->lease_state, __entry->flags, __entry->epoch) +) + +#define DEFINE_SMB3_LEASE_ENTER_EVENT(name) \ +DEFINE_EVENT(smb3_lease_enter_class, smb3_##name, \ + TP_PROTO(__u32 lease_state, \ + __u32 flags, \ + __u16 epoch, \ + __u32 tid, \ + __u64 sesid, \ + __u64 lease_key_low, \ + __u64 lease_key_high), \ + TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high)) + +DEFINE_SMB3_LEASE_ENTER_EVENT(lease_break_enter); +/* Lease not found: reuse lease_enter payload (includes epoch and flags) */ +DEFINE_SMB3_LEASE_ENTER_EVENT(lease_not_found); DECLARE_EVENT_CLASS(smb3_lease_err_class, TP_PROTO(__u32 lease_state, @@ -1213,7 +1256,7 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \ int rc), \ 
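/* * Editor's sketch (illustration only, not part of the patch): DEFINE_EVENT generates * a trace_smb3_<name>() helper for every event defined above, so the new lease-break * entry event would be emitted roughly as: * * trace_smb3_lease_break_enter(le32_to_cpu(req->NewLeaseState), * le32_to_cpu(req->Flags), * le16_to_cpu(req->Epoch), * tid, sesid, lease_key_low, lease_key_high); * * The request field and variable names here are assumptions for illustration. */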
TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc)) -DEFINE_SMB3_LEASE_ERR_EVENT(lease_err); +DEFINE_SMB3_LEASE_ERR_EVENT(lease_ack_err); DECLARE_EVENT_CLASS(smb3_connect_class, TP_PROTO(char *hostname, diff --git a/fs/smb/common/smbdirect/smbdirect.h b/fs/smb/common/smbdirect/smbdirect.h index b9a385344ff3..05cc6a9d0ccd 100644 --- a/fs/smb/common/smbdirect/smbdirect.h +++ b/fs/smb/common/smbdirect/smbdirect.h @@ -23,6 +23,12 @@ struct smbdirect_buffer_descriptor_v1 { * Some values are important for the upper layer. */ struct smbdirect_socket_parameters { + __u32 resolve_addr_timeout_msec; + __u32 resolve_route_timeout_msec; + __u32 rdma_connect_timeout_msec; + __u32 negotiate_timeout_msec; + __u8 initiator_depth; + __u8 responder_resources; __u16 recv_credit_max; __u16 send_credit_target; __u32 max_send_size; @@ -30,6 +36,7 @@ struct smbdirect_socket_parameters { __u32 max_recv_size; __u32 max_fragmented_recv_size; __u32 max_read_write_size; + __u32 max_frmr_depth; __u32 keepalive_interval_msec; __u32 keepalive_timeout_msec; } __packed; diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index 3c4a8d627aa3..db22a1d0546b 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -6,22 +6,102 @@ #ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ #define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ +#include <rdma/rw.h> + enum smbdirect_socket_status { SMBDIRECT_SOCKET_CREATED, - SMBDIRECT_SOCKET_CONNECTING, - SMBDIRECT_SOCKET_CONNECTED, + SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED, + SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING, + SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED, + SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED, + SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING, + SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED, + SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED, + SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING, + SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED, + SMBDIRECT_SOCKET_NEGOTIATE_NEEDED, + SMBDIRECT_SOCKET_NEGOTIATE_RUNNING, SMBDIRECT_SOCKET_NEGOTIATE_FAILED, + SMBDIRECT_SOCKET_CONNECTED, + SMBDIRECT_SOCKET_ERROR, SMBDIRECT_SOCKET_DISCONNECTING, SMBDIRECT_SOCKET_DISCONNECTED, SMBDIRECT_SOCKET_DESTROYED }; +static __always_inline +const char *smbdirect_socket_status_string(enum smbdirect_socket_status status) +{ + switch (status) { + case SMBDIRECT_SOCKET_CREATED: + return "CREATED"; + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + return "RESOLVE_ADDR_NEEDED"; + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + return "RESOLVE_ADDR_RUNNING"; + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + return "RESOLVE_ADDR_FAILED"; + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + return "RESOLVE_ROUTE_NEEDED"; + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + return "RESOLVE_ROUTE_RUNNING"; + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + return "RESOLVE_ROUTE_FAILED"; + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + return "RDMA_CONNECT_NEEDED"; + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + return "RDMA_CONNECT_RUNNING"; + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + return "RDMA_CONNECT_FAILED"; + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + return "NEGOTIATE_NEEDED"; + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + return "NEGOTIATE_RUNNING"; + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + return "NEGOTIATE_FAILED"; + case SMBDIRECT_SOCKET_CONNECTED: + return "CONNECTED"; + case SMBDIRECT_SOCKET_ERROR: + return "ERROR"; + case SMBDIRECT_SOCKET_DISCONNECTING: + return "DISCONNECTING"; + case SMBDIRECT_SOCKET_DISCONNECTED: + return "DISCONNECTED"; + case 
SMBDIRECT_SOCKET_DESTROYED: + return "DESTROYED"; + } + + return "<unknown>"; +} + +enum smbdirect_keepalive_status { + SMBDIRECT_KEEPALIVE_NONE, + SMBDIRECT_KEEPALIVE_PENDING, + SMBDIRECT_KEEPALIVE_SENT +}; + struct smbdirect_socket { enum smbdirect_socket_status status; + wait_queue_head_t status_wait; + int first_error; + + /* + * This points to the workqueue to + * be used for this socket. + * It can be per socket (on the client) + * or point to a global workqueue (on the server) + */ + struct workqueue_struct *workqueue; + + struct work_struct disconnect_work; /* RDMA related */ struct { struct rdma_cm_id *cm_id; + /* + * This is for iWarp MPA v1 + */ + bool legacy_iwarp; } rdma; /* IB verbs related */ @@ -40,6 +120,15 @@ struct smbdirect_socket { struct smbdirect_socket_parameters parameters; /* + * The state for keepalive and timeout handling + */ + struct { + enum smbdirect_keepalive_status keepalive; + struct work_struct immediate_work; + struct delayed_work timer_work; + } idle; + + /* * The state for posted send buffers */ struct { @@ -51,6 +140,29 @@ struct smbdirect_socket { struct kmem_cache *cache; mempool_t *pool; } mem; + + /* + * The credit state for the send side + */ + struct { + atomic_t count; + wait_queue_head_t wait_queue; + } credits; + + /* + * The state about posted/pending sends + */ + struct { + atomic_t count; + /* + * woken when count is decremented + */ + wait_queue_head_t dec_wait_queue; + /* + * woken when count reached zero + */ + wait_queue_head_t zero_wait_queue; + } pending; } send_io; /* @@ -85,6 +197,23 @@ struct smbdirect_socket { } free; /* + * The state for posted recv_io messages + * and the refill work struct. + */ + struct { + atomic_t count; + struct work_struct refill_work; + } posted; + + /* + * The credit state for the recv side + */ + struct { + u16 target; + atomic_t count; + } credits; + + /* * The list of arrived non-empty smbdirect_recv_io * structures * @@ -110,8 +239,137 @@ struct smbdirect_socket { bool full_packet_received; } reassembly; } recv_io; + + /* + * The state for Memory registrations on the client + */ + struct { + enum ib_mr_type type; + + /* + * The list of free smbdirect_mr_io + * structures + */ + struct { + struct list_head list; + spinlock_t lock; + } all; + + /* + * The number of available MRs ready for memory registration + */ + struct { + atomic_t count; + wait_queue_head_t wait_queue; + } ready; + + /* + * The number of used MRs + */ + struct { + atomic_t count; + } used; + + struct work_struct recovery_work; + + /* Used by transport to wait until all MRs are returned */ + struct { + wait_queue_head_t wait_queue; + } cleanup; + } mr_io; + + /* + * The state for RDMA read/write requests on the server + */ + struct { + /* + * The credit state for the send side + */ + struct { + /* + * The maximum number of rw credits + */ + size_t max; + /* + * The number of pages per credit + */ + size_t num_pages; + atomic_t count; + wait_queue_head_t wait_queue; + } credits; + } rw_io; + + /* + * For debug purposes + */ + struct { + u64 get_receive_buffer; + u64 put_receive_buffer; + u64 enqueue_reassembly_queue; + u64 dequeue_reassembly_queue; + u64 send_empty; + } statistics; }; +static void __smbdirect_socket_disabled_work(struct work_struct *work) +{ + /* + * Should never be called as disable_[delayed_]work_sync() was used. 
+ */ + WARN_ON_ONCE(1); +} + +static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) +{ + /* + * This also sets status = SMBDIRECT_SOCKET_CREATED + */ + BUILD_BUG_ON(SMBDIRECT_SOCKET_CREATED != 0); + memset(sc, 0, sizeof(*sc)); + + init_waitqueue_head(&sc->status_wait); + + INIT_WORK(&sc->disconnect_work, __smbdirect_socket_disabled_work); + disable_work_sync(&sc->disconnect_work); + + INIT_WORK(&sc->idle.immediate_work, __smbdirect_socket_disabled_work); + disable_work_sync(&sc->idle.immediate_work); + INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work); + disable_delayed_work_sync(&sc->idle.timer_work); + + atomic_set(&sc->send_io.credits.count, 0); + init_waitqueue_head(&sc->send_io.credits.wait_queue); + + atomic_set(&sc->send_io.pending.count, 0); + init_waitqueue_head(&sc->send_io.pending.dec_wait_queue); + init_waitqueue_head(&sc->send_io.pending.zero_wait_queue); + + INIT_LIST_HEAD(&sc->recv_io.free.list); + spin_lock_init(&sc->recv_io.free.lock); + + atomic_set(&sc->recv_io.posted.count, 0); + INIT_WORK(&sc->recv_io.posted.refill_work, __smbdirect_socket_disabled_work); + disable_work_sync(&sc->recv_io.posted.refill_work); + + atomic_set(&sc->recv_io.credits.count, 0); + + INIT_LIST_HEAD(&sc->recv_io.reassembly.list); + spin_lock_init(&sc->recv_io.reassembly.lock); + init_waitqueue_head(&sc->recv_io.reassembly.wait_queue); + + atomic_set(&sc->rw_io.credits.count, 0); + init_waitqueue_head(&sc->rw_io.credits.wait_queue); + + spin_lock_init(&sc->mr_io.all.lock); + INIT_LIST_HEAD(&sc->mr_io.all.list); + atomic_set(&sc->mr_io.ready.count, 0); + init_waitqueue_head(&sc->mr_io.ready.wait_queue); + atomic_set(&sc->mr_io.used.count, 0); + INIT_WORK(&sc->mr_io.recovery_work, __smbdirect_socket_disabled_work); + disable_work_sync(&sc->mr_io.recovery_work); + init_waitqueue_head(&sc->mr_io.cleanup.wait_queue); +} + struct smbdirect_send_io { struct smbdirect_socket *socket; struct ib_cqe cqe; @@ -136,6 +394,23 @@ struct smbdirect_send_io { u8 packet[]; }; +struct smbdirect_send_batch { + /* + * List of smbdirect_send_io messages + */ + struct list_head msg_list; + /* + * Number of list entries + */ + size_t wr_cnt; + + /* + * Possible remote key invalidation state + */ + bool need_invalidate_rkey; + u32 remote_key; +}; + struct smbdirect_recv_io { struct smbdirect_socket *socket; struct ib_cqe cqe; @@ -158,4 +433,44 @@ struct smbdirect_recv_io { u8 packet[]; }; +enum smbdirect_mr_state { + SMBDIRECT_MR_READY, + SMBDIRECT_MR_REGISTERED, + SMBDIRECT_MR_INVALIDATED, + SMBDIRECT_MR_ERROR +}; + +struct smbdirect_mr_io { + struct smbdirect_socket *socket; + struct ib_cqe cqe; + + struct list_head list; + + enum smbdirect_mr_state state; + struct ib_mr *mr; + struct sg_table sgt; + enum dma_data_direction dir; + union { + struct ib_reg_wr wr; + struct ib_send_wr inv_wr; + }; + + bool need_invalidate; + struct completion invalidate_done; +}; + +struct smbdirect_rw_io { + struct smbdirect_socket *socket; + struct ib_cqe cqe; + + struct list_head list; + + int error; + struct completion *completion; + + struct rdma_rw_ctx rdma_ctx; + struct sg_table sgt; + struct scatterlist sg_list[]; +}; + #endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ */ diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 67c4f73398df..91a934411134 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -243,7 +243,7 @@ int ksmbd_conn_write(struct ksmbd_work *work) int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, unsigned 
int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { int ret = -EINVAL; @@ -257,7 +257,7 @@ int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { int ret = -EINVAL; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 2aa8084bb593..07b43634262a 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -19,6 +19,8 @@ #include "smb_common.h" #include "ksmbd_work.h" +struct smbdirect_buffer_descriptor_v1; + #define KSMBD_SOCKET_BACKLOG 16 enum { @@ -133,11 +135,11 @@ struct ksmbd_transport_ops { unsigned int remote_key); int (*rdma_read)(struct ksmbd_transport *t, void *buf, unsigned int len, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len); int (*rdma_write)(struct ksmbd_transport *t, void *buf, unsigned int len, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len); void (*free_transport)(struct ksmbd_transport *kt); }; @@ -163,11 +165,11 @@ bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); int ksmbd_conn_write(struct ksmbd_work *work); int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len); int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len); void ksmbd_conn_enqueue_request(struct ksmbd_work *work); void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c index 72b00ca6e455..4a71f46d7020 100644 --- a/fs/smb/server/ksmbd_work.c +++ b/fs/smb/server/ksmbd_work.c @@ -78,7 +78,7 @@ int ksmbd_work_pool_init(void) int ksmbd_workqueue_init(void) { - ksmbd_wq = alloc_workqueue("ksmbd-io", 0, 0); + ksmbd_wq = alloc_workqueue("ksmbd-io", WQ_PERCPU, 0); if (!ksmbd_wq) return -ENOMEM; return 0; diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 8c9c49c3a0a4..40420544cc25 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -365,6 +365,7 @@ static void server_ctrl_handle_init(struct server_ctrl_struct *ctrl) return; } + pr_info("running\n"); WRITE_ONCE(server_conf.state, SERVER_STATE_RUNNING); } diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 0d92ce49aed7..0c069eff80b7 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -23,6 +23,7 @@ #include "asn1.h" #include "connection.h" #include "transport_ipc.h" +#include "../common/smbdirect/smbdirect.h" #include "transport_rdma.h" #include "vfs.h" #include "vfs_cache.h" @@ -2951,18 +2952,19 @@ int smb2_open(struct ksmbd_work *work) } ksmbd_debug(SMB, "converted name = %s\n", name); - if (strchr(name, ':')) { - if (!test_share_config_flag(work->tcon->share_conf, - KSMBD_SHARE_FLAG_STREAMS)) { - rc = -EBADF; - goto err_out2; - } - rc = parse_stream_name(name, &stream_name, &s_type); - if (rc < 0) - goto err_out2; - } if (posix_ctxt == false) { + if (strchr(name, ':')) { + if (!test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_STREAMS)) { + rc = -EBADF; + goto err_out2; + } + rc = parse_stream_name(name, &stream_name, &s_type); + if (rc < 0) + 
goto err_out2; + } + rc = ksmbd_validate_filename(name); if (rc < 0) goto err_out2; @@ -3443,6 +3445,8 @@ int smb2_open(struct ksmbd_work *work) fp->attrib_only = !(req->DesiredAccess & ~(FILE_READ_ATTRIBUTES_LE | FILE_WRITE_ATTRIBUTES_LE | FILE_SYNCHRONIZE_LE)); + fp->is_posix_ctxt = posix_ctxt; + /* fp should be searchable through ksmbd_inode.m_fp_list * after daccess, saccess, attrib_only, and stream are * initialized. @@ -5988,7 +5992,7 @@ static int smb2_rename(struct ksmbd_work *work, if (IS_ERR(new_name)) return PTR_ERR(new_name); - if (strchr(new_name, ':')) { + if (fp->is_posix_ctxt == false && strchr(new_name, ':')) { int s_type; char *xattr_stream_name, *stream_name = NULL; size_t xattr_stream_size; @@ -6662,7 +6666,7 @@ out: } static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, __le32 Channel, __le16 ChannelInfoLength) { @@ -6698,7 +6702,7 @@ static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, int err; err = ksmbd_conn_rdma_write(work->conn, data_buf, length, - (struct smb2_buffer_desc_v1 *) + (struct smbdirect_buffer_descriptor_v1 *) ((char *)req + le16_to_cpu(req->ReadChannelInfoOffset)), le16_to_cpu(req->ReadChannelInfoLength)); if (err) @@ -6758,7 +6762,11 @@ int smb2_read(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || req->Channel == SMB2_CHANNEL_RDMA_V1) { is_rdma_channel = true; - max_read_size = get_smbd_max_read_write_size(); + max_read_size = get_smbd_max_read_write_size(work->conn->transport); + if (max_read_size == 0) { + err = -EINVAL; + goto out; + } } if (is_rdma_channel == true) { @@ -6769,7 +6777,7 @@ int smb2_read(struct ksmbd_work *work) goto out; } err = smb2_set_remote_key_for_rdma(work, - (struct smb2_buffer_desc_v1 *) + (struct smbdirect_buffer_descriptor_v1 *) ((char *)req + ch_offset), req->Channel, req->ReadChannelInfoLength); @@ -6964,7 +6972,7 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, return -ENOMEM; ret = ksmbd_conn_rdma_read(work->conn, data_buf, length, - (struct smb2_buffer_desc_v1 *) + (struct smbdirect_buffer_descriptor_v1 *) ((char *)req + le16_to_cpu(req->WriteChannelInfoOffset)), le16_to_cpu(req->WriteChannelInfoLength)); if (ret < 0) { @@ -7016,7 +7024,11 @@ int smb2_write(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1 || req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { is_rdma_channel = true; - max_write_size = get_smbd_max_read_write_size(); + max_write_size = get_smbd_max_read_write_size(work->conn->transport); + if (max_write_size == 0) { + err = -EINVAL; + goto out; + } length = le32_to_cpu(req->RemainingBytes); } @@ -7029,7 +7041,7 @@ int smb2_write(struct ksmbd_work *work) goto out; } err = smb2_set_remote_key_for_rdma(work, - (struct smb2_buffer_desc_v1 *) + (struct smbdirect_buffer_descriptor_v1 *) ((char *)req + ch_offset), req->Channel, req->WriteChannelInfoLength); diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h index 16ae8a10490b..5163d5241b90 100644 --- a/fs/smb/server/smb2pdu.h +++ b/fs/smb/server/smb2pdu.h @@ -136,12 +136,6 @@ struct create_posix_rsp { u8 SidBuffer[44]; } __packed; -struct smb2_buffer_desc_v1 { - __le64 offset; - __le32 token; - __le32 length; -} __packed; - #define SMB2_0_IOCTL_IS_FSCTL 0x00000001 struct smb_sockaddr_in { diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 5466aa8c39b1..9e644a0daf1c 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c 
@@ -23,18 +23,24 @@ #include "connection.h" #include "smb_common.h" #include "../common/smb2status.h" +#include "../common/smbdirect/smbdirect.h" +#include "../common/smbdirect/smbdirect_pdu.h" +#include "../common/smbdirect/smbdirect_socket.h" #include "transport_rdma.h" #define SMB_DIRECT_PORT_IWARP 5445 #define SMB_DIRECT_PORT_INFINIBAND 445 -#define SMB_DIRECT_VERSION_LE cpu_to_le16(0x0100) +#define SMB_DIRECT_VERSION_LE cpu_to_le16(SMBDIRECT_V1) -/* SMB_DIRECT negotiation timeout in seconds */ -#define SMB_DIRECT_NEGOTIATE_TIMEOUT 120 +/* SMB_DIRECT negotiation timeout (for the server) in seconds */ +#define SMB_DIRECT_NEGOTIATE_TIMEOUT 5 -#define SMB_DIRECT_MAX_SEND_SGES 6 -#define SMB_DIRECT_MAX_RECV_SGES 1 +/* The interval to send a keepalive message to the peer in seconds */ +#define SMB_DIRECT_KEEPALIVE_SEND_INTERVAL 120 + +/* The timeout to wait for a keepalive message from peer in seconds */ +#define SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT 5 /* * Default maximum number of RDMA read/write outstanding on this connection @@ -87,131 +93,38 @@ static struct smb_direct_listener { static struct workqueue_struct *smb_direct_wq; -enum smb_direct_status { - SMB_DIRECT_CS_NEW = 0, - SMB_DIRECT_CS_CONNECTED, - SMB_DIRECT_CS_DISCONNECTING, - SMB_DIRECT_CS_DISCONNECTED, -}; - struct smb_direct_transport { struct ksmbd_transport transport; - enum smb_direct_status status; - bool full_packet_received; - wait_queue_head_t wait_status; - - struct rdma_cm_id *cm_id; - struct ib_cq *send_cq; - struct ib_cq *recv_cq; - struct ib_pd *pd; - struct ib_qp *qp; - - int max_send_size; - int max_recv_size; - int max_fragmented_send_size; - int max_fragmented_recv_size; - int max_rdma_rw_size; - - spinlock_t reassembly_queue_lock; - struct list_head reassembly_queue; - int reassembly_data_length; - int reassembly_queue_length; - int first_entry_offset; - wait_queue_head_t wait_reassembly_queue; - - spinlock_t receive_credit_lock; - int recv_credits; - int count_avail_recvmsg; - int recv_credit_max; - int recv_credit_target; - - spinlock_t recvmsg_queue_lock; - struct list_head recvmsg_queue; - - int send_credit_target; - atomic_t send_credits; - spinlock_t lock_new_recv_credits; - int new_recv_credits; - int max_rw_credits; - int pages_per_rw_credit; - atomic_t rw_credits; - - wait_queue_head_t wait_send_credits; - wait_queue_head_t wait_rw_credits; - - mempool_t *sendmsg_mempool; - struct kmem_cache *sendmsg_cache; - mempool_t *recvmsg_mempool; - struct kmem_cache *recvmsg_cache; - - wait_queue_head_t wait_send_pending; - atomic_t send_pending; - - struct delayed_work post_recv_credits_work; - struct work_struct send_immediate_work; - struct work_struct disconnect_work; - - bool negotiation_requested; + struct smbdirect_socket socket; }; -#define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport)) -#define SMBD_TRANS(t) ((struct smb_direct_transport *)container_of(t, \ +#define KSMBD_TRANS(t) (&(t)->transport) +#define SMBD_TRANS(t) (container_of(t, \ struct smb_direct_transport, transport)) -enum { - SMB_DIRECT_MSG_NEGOTIATE_REQ = 0, - SMB_DIRECT_MSG_DATA_TRANSFER -}; static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops; -struct smb_direct_send_ctx { - struct list_head msg_list; - int wr_cnt; - bool need_invalidate_rkey; - unsigned int remote_key; -}; - -struct smb_direct_sendmsg { - struct smb_direct_transport *transport; - struct ib_send_wr wr; - struct list_head list; - int num_sge; - struct ib_sge sge[SMB_DIRECT_MAX_SEND_SGES]; - struct ib_cqe cqe; - u8 packet[]; -}; - -struct 
smb_direct_recvmsg { - struct smb_direct_transport *transport; - struct list_head list; - int type; - struct ib_sge sge; - struct ib_cqe cqe; - bool first_segment; - u8 packet[]; -}; - -struct smb_direct_rdma_rw_msg { - struct smb_direct_transport *t; - struct ib_cqe cqe; - int status; - struct completion *completion; - struct list_head list; - struct rdma_rw_ctx rw_ctx; - struct sg_table sgt; - struct scatterlist sg_list[]; -}; - void init_smbd_max_io_size(unsigned int sz) { sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE); smb_direct_max_read_write_size = sz; } -unsigned int get_smbd_max_read_write_size(void) +unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { - return smb_direct_max_read_write_size; + struct smb_direct_transport *t; + struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; + + if (kt->ops != &ksmbd_smb_direct_transport_ops) + return 0; + + t = SMBD_TRANS(kt); + sc = &t->socket; + sp = &sc->parameters; + + return sp->max_read_write_size; } static inline int get_buf_page_count(void *buf, int size) @@ -220,71 +133,65 @@ static inline int get_buf_page_count(void *buf, int size) (uintptr_t)buf / PAGE_SIZE; } -static void smb_direct_destroy_pools(struct smb_direct_transport *transport); +static void smb_direct_destroy_pools(struct smbdirect_socket *sc); static void smb_direct_post_recv_credits(struct work_struct *work); -static int smb_direct_post_send_data(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, +static int smb_direct_post_send_data(struct smbdirect_socket *sc, + struct smbdirect_send_batch *send_ctx, struct kvec *iov, int niov, int remaining_data_length); -static inline struct smb_direct_transport * -smb_trans_direct_transfort(struct ksmbd_transport *t) -{ - return container_of(t, struct smb_direct_transport, transport); -} - static inline void -*smb_direct_recvmsg_payload(struct smb_direct_recvmsg *recvmsg) +*smbdirect_recv_io_payload(struct smbdirect_recv_io *recvmsg) { return (void *)recvmsg->packet; } -static inline bool is_receive_credit_post_required(int receive_credits, - int avail_recvmsg_count) -{ - return receive_credits <= (smb_direct_receive_credit_max >> 3) && - avail_recvmsg_count >= (receive_credits >> 2); -} - static struct -smb_direct_recvmsg *get_free_recvmsg(struct smb_direct_transport *t) +smbdirect_recv_io *get_free_recvmsg(struct smbdirect_socket *sc) { - struct smb_direct_recvmsg *recvmsg = NULL; + struct smbdirect_recv_io *recvmsg = NULL; + unsigned long flags; - spin_lock(&t->recvmsg_queue_lock); - if (!list_empty(&t->recvmsg_queue)) { - recvmsg = list_first_entry(&t->recvmsg_queue, - struct smb_direct_recvmsg, + spin_lock_irqsave(&sc->recv_io.free.lock, flags); + if (!list_empty(&sc->recv_io.free.list)) { + recvmsg = list_first_entry(&sc->recv_io.free.list, + struct smbdirect_recv_io, list); list_del(&recvmsg->list); } - spin_unlock(&t->recvmsg_queue_lock); + spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); return recvmsg; } -static void put_recvmsg(struct smb_direct_transport *t, - struct smb_direct_recvmsg *recvmsg) +static void put_recvmsg(struct smbdirect_socket *sc, + struct smbdirect_recv_io *recvmsg) { + unsigned long flags; + if (likely(recvmsg->sge.length != 0)) { - ib_dma_unmap_single(t->cm_id->device, + ib_dma_unmap_single(sc->ib.dev, recvmsg->sge.addr, recvmsg->sge.length, DMA_FROM_DEVICE); recvmsg->sge.length = 0; } - spin_lock(&t->recvmsg_queue_lock); - list_add(&recvmsg->list, &t->recvmsg_queue); - spin_unlock(&t->recvmsg_queue_lock); + 
spin_lock_irqsave(&sc->recv_io.free.lock, flags); + list_add(&recvmsg->list, &sc->recv_io.free.list); + spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); + + queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); } -static void enqueue_reassembly(struct smb_direct_transport *t, - struct smb_direct_recvmsg *recvmsg, +static void enqueue_reassembly(struct smbdirect_socket *sc, + struct smbdirect_recv_io *recvmsg, int data_length) { - spin_lock(&t->reassembly_queue_lock); - list_add_tail(&recvmsg->list, &t->reassembly_queue); - t->reassembly_queue_length++; + unsigned long flags; + + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); + list_add_tail(&recvmsg->list, &sc->recv_io.reassembly.list); + sc->recv_io.reassembly.queue_length++; /* * Make sure reassembly_data_length is updated after list and * reassembly_queue_length are updated. On the dequeue side @@ -292,85 +199,228 @@ static void enqueue_reassembly(struct smb_direct_transport *t, * if reassembly_queue_length and list is up to date */ virt_wmb(); - t->reassembly_data_length += data_length; - spin_unlock(&t->reassembly_queue_lock); + sc->recv_io.reassembly.data_length += data_length; + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); } -static struct smb_direct_recvmsg *get_first_reassembly(struct smb_direct_transport *t) +static struct smbdirect_recv_io *get_first_reassembly(struct smbdirect_socket *sc) { - if (!list_empty(&t->reassembly_queue)) - return list_first_entry(&t->reassembly_queue, - struct smb_direct_recvmsg, list); + if (!list_empty(&sc->recv_io.reassembly.list)) + return list_first_entry(&sc->recv_io.reassembly.list, + struct smbdirect_recv_io, list); else return NULL; } +static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc) +{ + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. 
+ */ + wake_up_all(&sc->status_wait); + wake_up_all(&sc->send_io.credits.wait_queue); + wake_up_all(&sc->send_io.pending.zero_wait_queue); + wake_up_all(&sc->recv_io.reassembly.wait_queue); + wake_up_all(&sc->rw_io.credits.wait_queue); +} + static void smb_direct_disconnect_rdma_work(struct work_struct *work) { - struct smb_direct_transport *t = - container_of(work, struct smb_direct_transport, - disconnect_work); + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, disconnect_work); - if (t->status == SMB_DIRECT_CS_CONNECTED) { - t->status = SMB_DIRECT_CS_DISCONNECTING; - rdma_disconnect(t->cm_id); + /* + * make sure this and other work is not queued again + * but here we don't block and avoid + * disable[_delayed]_work_sync() + */ + disable_work(&sc->disconnect_work); + disable_work(&sc->recv_io.posted.refill_work); + disable_delayed_work(&sc->idle.timer_work); + disable_work(&sc->idle.immediate_work); + + if (sc->first_error == 0) + sc->first_error = -ECONNABORTED; + + switch (sc->status) { + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + case SMBDIRECT_SOCKET_CONNECTED: + case SMBDIRECT_SOCKET_ERROR: + sc->status = SMBDIRECT_SOCKET_DISCONNECTING; + rdma_disconnect(sc->rdma.cm_id); + break; + + case SMBDIRECT_SOCKET_CREATED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + /* + * rdma_accept() never reached + * RDMA_CM_EVENT_ESTABLISHED + */ + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + break; + + case SMBDIRECT_SOCKET_DISCONNECTING: + case SMBDIRECT_SOCKET_DISCONNECTED: + case SMBDIRECT_SOCKET_DESTROYED: + break; } + + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. 
+ */
+ smb_direct_disconnect_wake_up_all(sc);
 }
 
 static void
-smb_direct_disconnect_rdma_connection(struct smb_direct_transport *t)
+smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc)
 {
- if (t->status == SMB_DIRECT_CS_CONNECTED)
- queue_work(smb_direct_wq, &t->disconnect_work);
+ /*
+ * Make sure the other work items (apart from disconnect_work)
+ * are not queued again; we must not block here, so avoid
+ * disable[_delayed]_work_sync().
+ */
+ disable_work(&sc->recv_io.posted.refill_work);
+ disable_work(&sc->idle.immediate_work);
+ disable_delayed_work(&sc->idle.timer_work);
+
+ if (sc->first_error == 0)
+ sc->first_error = -ECONNABORTED;
+
+ switch (sc->status) {
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+ case SMBDIRECT_SOCKET_ERROR:
+ case SMBDIRECT_SOCKET_DISCONNECTING:
+ case SMBDIRECT_SOCKET_DISCONNECTED:
+ case SMBDIRECT_SOCKET_DESTROYED:
+ /*
+ * Keep the current error status
+ */
+ break;
+
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_CREATED:
+ case SMBDIRECT_SOCKET_CONNECTED:
+ sc->status = SMBDIRECT_SOCKET_ERROR;
+ break;
+ }
+
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */ + smb_direct_disconnect_wake_up_all(sc); + + queue_work(sc->workqueue, &sc->disconnect_work); } static void smb_direct_send_immediate_work(struct work_struct *work) { - struct smb_direct_transport *t = container_of(work, - struct smb_direct_transport, send_immediate_work); + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, idle.immediate_work); + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + return; + + smb_direct_post_send_data(sc, NULL, NULL, 0, 0); +} + +static void smb_direct_idle_connection_timer(struct work_struct *work) +{ + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, idle.timer_work.work); + struct smbdirect_socket_parameters *sp = &sc->parameters; + + if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) { + smb_direct_disconnect_rdma_connection(sc); + return; + } - if (t->status != SMB_DIRECT_CS_CONNECTED) + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return; - smb_direct_post_send_data(t, NULL, NULL, 0, 0); + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); + queue_work(sc->workqueue, &sc->idle.immediate_work); } static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) { struct smb_direct_transport *t; + struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; struct ksmbd_conn *conn; t = kzalloc(sizeof(*t), KSMBD_DEFAULT_GFP); if (!t) return NULL; + sc = &t->socket; + smbdirect_socket_init(sc); + sp = &sc->parameters; - t->cm_id = cm_id; - cm_id->context = t; - - t->status = SMB_DIRECT_CS_NEW; - init_waitqueue_head(&t->wait_status); + sc->workqueue = smb_direct_wq; - spin_lock_init(&t->reassembly_queue_lock); - INIT_LIST_HEAD(&t->reassembly_queue); - t->reassembly_data_length = 0; - t->reassembly_queue_length = 0; - init_waitqueue_head(&t->wait_reassembly_queue); - init_waitqueue_head(&t->wait_send_credits); - init_waitqueue_head(&t->wait_rw_credits); + INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work); - spin_lock_init(&t->receive_credit_lock); - spin_lock_init(&t->recvmsg_queue_lock); - INIT_LIST_HEAD(&t->recvmsg_queue); + sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000; + sp->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH; + sp->responder_resources = 1; + sp->recv_credit_max = smb_direct_receive_credit_max; + sp->send_credit_target = smb_direct_send_credit_target; + sp->max_send_size = smb_direct_max_send_size; + sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; + sp->max_recv_size = smb_direct_max_receive_size; + sp->max_read_write_size = smb_direct_max_read_write_size; + sp->keepalive_interval_msec = SMB_DIRECT_KEEPALIVE_SEND_INTERVAL * 1000; + sp->keepalive_timeout_msec = SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT * 1000; - init_waitqueue_head(&t->wait_send_pending); - atomic_set(&t->send_pending, 0); + sc->rdma.cm_id = cm_id; + cm_id->context = sc; - spin_lock_init(&t->lock_new_recv_credits); + sc->ib.dev = sc->rdma.cm_id->device; - INIT_DELAYED_WORK(&t->post_recv_credits_work, - smb_direct_post_recv_credits); - INIT_WORK(&t->send_immediate_work, smb_direct_send_immediate_work); - INIT_WORK(&t->disconnect_work, smb_direct_disconnect_rdma_work); + INIT_WORK(&sc->recv_io.posted.refill_work, + smb_direct_post_recv_credits); + INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work); + 
INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer); conn = ksmbd_conn_alloc(); if (!conn) @@ -391,89 +441,104 @@ static void smb_direct_free_transport(struct ksmbd_transport *kt) static void free_transport(struct smb_direct_transport *t) { - struct smb_direct_recvmsg *recvmsg; + struct smbdirect_socket *sc = &t->socket; + struct smbdirect_recv_io *recvmsg; - wake_up_interruptible(&t->wait_send_credits); + disable_work_sync(&sc->disconnect_work); + if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) { + smb_direct_disconnect_rdma_work(&sc->disconnect_work); + wait_event_interruptible(sc->status_wait, + sc->status == SMBDIRECT_SOCKET_DISCONNECTED); + } - ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n"); - wait_event(t->wait_send_pending, - atomic_read(&t->send_pending) == 0); + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. + * + * Most likely this was already called via + * smb_direct_disconnect_rdma_work(), but call it again... + */ + smb_direct_disconnect_wake_up_all(sc); - cancel_work_sync(&t->disconnect_work); - cancel_delayed_work_sync(&t->post_recv_credits_work); - cancel_work_sync(&t->send_immediate_work); + disable_work_sync(&sc->recv_io.posted.refill_work); + disable_delayed_work_sync(&sc->idle.timer_work); + disable_work_sync(&sc->idle.immediate_work); - if (t->qp) { - ib_drain_qp(t->qp); - ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs); - t->qp = NULL; - rdma_destroy_qp(t->cm_id); + if (sc->ib.qp) { + ib_drain_qp(sc->ib.qp); + ib_mr_pool_destroy(sc->ib.qp, &sc->ib.qp->rdma_mrs); + sc->ib.qp = NULL; + rdma_destroy_qp(sc->rdma.cm_id); } ksmbd_debug(RDMA, "drain the reassembly queue\n"); do { - spin_lock(&t->reassembly_queue_lock); - recvmsg = get_first_reassembly(t); + unsigned long flags; + + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); + recvmsg = get_first_reassembly(sc); if (recvmsg) { list_del(&recvmsg->list); - spin_unlock(&t->reassembly_queue_lock); - put_recvmsg(t, recvmsg); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); + put_recvmsg(sc, recvmsg); } else { - spin_unlock(&t->reassembly_queue_lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); } } while (recvmsg); - t->reassembly_data_length = 0; - - if (t->send_cq) - ib_free_cq(t->send_cq); - if (t->recv_cq) - ib_free_cq(t->recv_cq); - if (t->pd) - ib_dealloc_pd(t->pd); - if (t->cm_id) - rdma_destroy_id(t->cm_id); - - smb_direct_destroy_pools(t); + sc->recv_io.reassembly.data_length = 0; + + if (sc->ib.send_cq) + ib_free_cq(sc->ib.send_cq); + if (sc->ib.recv_cq) + ib_free_cq(sc->ib.recv_cq); + if (sc->ib.pd) + ib_dealloc_pd(sc->ib.pd); + if (sc->rdma.cm_id) + rdma_destroy_id(sc->rdma.cm_id); + + smb_direct_destroy_pools(sc); ksmbd_conn_free(KSMBD_TRANS(t)->conn); } -static struct smb_direct_sendmsg -*smb_direct_alloc_sendmsg(struct smb_direct_transport *t) +static struct smbdirect_send_io +*smb_direct_alloc_sendmsg(struct smbdirect_socket *sc) { - struct smb_direct_sendmsg *msg; + struct smbdirect_send_io *msg; - msg = mempool_alloc(t->sendmsg_mempool, KSMBD_DEFAULT_GFP); + msg = mempool_alloc(sc->send_io.mem.pool, KSMBD_DEFAULT_GFP); if (!msg) return ERR_PTR(-ENOMEM); - msg->transport = t; - INIT_LIST_HEAD(&msg->list); + msg->socket = sc; + INIT_LIST_HEAD(&msg->sibling_list); msg->num_sge = 0; return msg; } -static void smb_direct_free_sendmsg(struct smb_direct_transport *t, - struct smb_direct_sendmsg *msg) +static void smb_direct_free_sendmsg(struct smbdirect_socket *sc, + struct smbdirect_send_io 
*msg) { int i; if (msg->num_sge > 0) { - ib_dma_unmap_single(t->cm_id->device, + ib_dma_unmap_single(sc->ib.dev, msg->sge[0].addr, msg->sge[0].length, DMA_TO_DEVICE); for (i = 1; i < msg->num_sge; i++) - ib_dma_unmap_page(t->cm_id->device, + ib_dma_unmap_page(sc->ib.dev, msg->sge[i].addr, msg->sge[i].length, DMA_TO_DEVICE); } - mempool_free(msg, t->sendmsg_mempool); + mempool_free(msg, sc->send_io.mem.pool); } -static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) +static int smb_direct_check_recvmsg(struct smbdirect_recv_io *recvmsg) { - switch (recvmsg->type) { - case SMB_DIRECT_MSG_DATA_TRANSFER: { - struct smb_direct_data_transfer *req = - (struct smb_direct_data_transfer *)recvmsg->packet; + struct smbdirect_socket *sc = recvmsg->socket; + + switch (sc->recv_io.expected) { + case SMBDIRECT_EXPECT_DATA_TRANSFER: { + struct smbdirect_data_transfer *req = + (struct smbdirect_data_transfer *)recvmsg->packet; struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet + le32_to_cpu(req->data_offset)); ksmbd_debug(RDMA, @@ -482,11 +547,11 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) le16_to_cpu(req->credits_requested), req->data_length, req->remaining_data_length, hdr->ProtocolId, hdr->Command); - break; + return 0; } - case SMB_DIRECT_MSG_NEGOTIATE_REQ: { - struct smb_direct_negotiate_req *req = - (struct smb_direct_negotiate_req *)recvmsg->packet; + case SMBDIRECT_EXPECT_NEGOTIATE_REQ: { + struct smbdirect_negotiate_req *req = + (struct smbdirect_negotiate_req *)recvmsg->packet; ksmbd_debug(RDMA, "MinVersion: %u, MaxVersion: %u, CreditRequested: %u, MaxSendSize: %u, MaxRecvSize: %u, MaxFragmentedSize: %u\n", le16_to_cpu(req->min_version), @@ -504,29 +569,34 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) 128 * 1024) return -ECONNABORTED; - break; + return 0; } - default: - return -EINVAL; + case SMBDIRECT_EXPECT_NEGOTIATE_REP: + /* client only */ + break; } - return 0; + + /* This is an internal error */ + return -EINVAL; } static void recv_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smb_direct_recvmsg *recvmsg; - struct smb_direct_transport *t; + struct smbdirect_recv_io *recvmsg; + struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; - recvmsg = container_of(wc->wr_cqe, struct smb_direct_recvmsg, cqe); - t = recvmsg->transport; + recvmsg = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); + sc = recvmsg->socket; + sp = &sc->parameters; if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); if (wc->status != IB_WC_WR_FLUSH_ERR) { pr_err("Recv error. status='%s (%d)' opcode=%d\n", ib_wc_status_msg(wc->status), wc->status, wc->opcode); - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); } return; } @@ -538,108 +608,128 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) ib_dma_sync_single_for_cpu(wc->qp->device, recvmsg->sge.addr, recvmsg->sge.length, DMA_FROM_DEVICE); - switch (recvmsg->type) { - case SMB_DIRECT_MSG_NEGOTIATE_REQ: - if (wc->byte_len < sizeof(struct smb_direct_negotiate_req)) { - put_recvmsg(t, recvmsg); - smb_direct_disconnect_rdma_connection(t); + /* + * Reset timer to the keepalive interval in + * order to trigger our next keepalive message. 
+ */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_interval_msec)); + + switch (sc->recv_io.expected) { + case SMBDIRECT_EXPECT_NEGOTIATE_REQ: + if (wc->byte_len < sizeof(struct smbdirect_negotiate_req)) { + put_recvmsg(sc, recvmsg); + smb_direct_disconnect_rdma_connection(sc); return; } - t->negotiation_requested = true; - t->full_packet_received = true; - t->status = SMB_DIRECT_CS_CONNECTED; - enqueue_reassembly(t, recvmsg, 0); - wake_up_interruptible(&t->wait_status); + sc->recv_io.reassembly.full_packet_received = true; + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED); + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; + enqueue_reassembly(sc, recvmsg, 0); + wake_up(&sc->status_wait); return; - case SMB_DIRECT_MSG_DATA_TRANSFER: { - struct smb_direct_data_transfer *data_transfer = - (struct smb_direct_data_transfer *)recvmsg->packet; - unsigned int data_length; - int avail_recvmsg_count, receive_credits; + case SMBDIRECT_EXPECT_DATA_TRANSFER: { + struct smbdirect_data_transfer *data_transfer = + (struct smbdirect_data_transfer *)recvmsg->packet; + u32 remaining_data_length, data_offset, data_length; + u16 old_recv_credit_target; if (wc->byte_len < - offsetof(struct smb_direct_data_transfer, padding)) { - put_recvmsg(t, recvmsg); - smb_direct_disconnect_rdma_connection(t); + offsetof(struct smbdirect_data_transfer, padding)) { + put_recvmsg(sc, recvmsg); + smb_direct_disconnect_rdma_connection(sc); return; } + remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); data_length = le32_to_cpu(data_transfer->data_length); - if (data_length) { - if (wc->byte_len < sizeof(struct smb_direct_data_transfer) + - (u64)data_length) { - put_recvmsg(t, recvmsg); - smb_direct_disconnect_rdma_connection(t); - return; - } + data_offset = le32_to_cpu(data_transfer->data_offset); + if (wc->byte_len < data_offset || + wc->byte_len < (u64)data_offset + data_length) { + put_recvmsg(sc, recvmsg); + smb_direct_disconnect_rdma_connection(sc); + return; + } + if (remaining_data_length > sp->max_fragmented_recv_size || + data_length > sp->max_fragmented_recv_size || + (u64)remaining_data_length + (u64)data_length > + (u64)sp->max_fragmented_recv_size) { + put_recvmsg(sc, recvmsg); + smb_direct_disconnect_rdma_connection(sc); + return; + } - if (t->full_packet_received) + if (data_length) { + if (sc->recv_io.reassembly.full_packet_received) recvmsg->first_segment = true; if (le32_to_cpu(data_transfer->remaining_data_length)) - t->full_packet_received = false; + sc->recv_io.reassembly.full_packet_received = false; else - t->full_packet_received = true; - - spin_lock(&t->receive_credit_lock); - receive_credits = --(t->recv_credits); - avail_recvmsg_count = t->count_avail_recvmsg; - spin_unlock(&t->receive_credit_lock); - } else { - spin_lock(&t->receive_credit_lock); - receive_credits = --(t->recv_credits); - avail_recvmsg_count = ++(t->count_avail_recvmsg); - spin_unlock(&t->receive_credit_lock); + sc->recv_io.reassembly.full_packet_received = true; } - t->recv_credit_target = + atomic_dec(&sc->recv_io.posted.count); + atomic_dec(&sc->recv_io.credits.count); + + old_recv_credit_target = sc->recv_io.credits.target; + sc->recv_io.credits.target = le16_to_cpu(data_transfer->credits_requested); + sc->recv_io.credits.target = + min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); + sc->recv_io.credits.target = + max_t(u16, sc->recv_io.credits.target, 1); 
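 /*
  * Editor's illustration (not part of the patch): the bookkeeping above
  * reduces to two atomic counters plus a clamped target. A minimal
  * standalone sketch of the clamp, assuming kernel-style C and the
  * min_t()/max_t() helpers (the helper name is hypothetical):
  *
  *	static u16 clamp_credit_target(u16 requested, u16 credit_max)
  *	{
  *		// never grant more than the configured maximum,
  *		// but always keep at least one credit outstanding
  *		return max_t(u16, min_t(u16, requested, credit_max), 1);
  *	}
  *
  * i.e. sc->recv_io.credits.target always ends up in [1, recv_credit_max].
  */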
atomic_add(le16_to_cpu(data_transfer->credits_granted), - &t->send_credits); + &sc->send_io.credits.count); if (le16_to_cpu(data_transfer->flags) & - SMB_DIRECT_RESPONSE_REQUESTED) - queue_work(smb_direct_wq, &t->send_immediate_work); - - if (atomic_read(&t->send_credits) > 0) - wake_up_interruptible(&t->wait_send_credits); + SMBDIRECT_FLAG_RESPONSE_REQUESTED) + queue_work(sc->workqueue, &sc->idle.immediate_work); - if (is_receive_credit_post_required(receive_credits, avail_recvmsg_count)) - mod_delayed_work(smb_direct_wq, - &t->post_recv_credits_work, 0); + if (atomic_read(&sc->send_io.credits.count) > 0) + wake_up(&sc->send_io.credits.wait_queue); if (data_length) { - enqueue_reassembly(t, recvmsg, (int)data_length); - wake_up_interruptible(&t->wait_reassembly_queue); + if (sc->recv_io.credits.target > old_recv_credit_target) + queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); + + enqueue_reassembly(sc, recvmsg, (int)data_length); + wake_up(&sc->recv_io.reassembly.wait_queue); } else - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); return; } + case SMBDIRECT_EXPECT_NEGOTIATE_REP: + /* client only */ + break; } /* * This is an internal error! */ - WARN_ON_ONCE(recvmsg->type != SMB_DIRECT_MSG_DATA_TRANSFER); - put_recvmsg(t, recvmsg); - smb_direct_disconnect_rdma_connection(t); + WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER); + put_recvmsg(sc, recvmsg); + smb_direct_disconnect_rdma_connection(sc); } -static int smb_direct_post_recv(struct smb_direct_transport *t, - struct smb_direct_recvmsg *recvmsg) +static int smb_direct_post_recv(struct smbdirect_socket *sc, + struct smbdirect_recv_io *recvmsg) { + struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_recv_wr wr; int ret; - recvmsg->sge.addr = ib_dma_map_single(t->cm_id->device, - recvmsg->packet, t->max_recv_size, + recvmsg->sge.addr = ib_dma_map_single(sc->ib.dev, + recvmsg->packet, + sp->max_recv_size, DMA_FROM_DEVICE); - ret = ib_dma_mapping_error(t->cm_id->device, recvmsg->sge.addr); + ret = ib_dma_mapping_error(sc->ib.dev, recvmsg->sge.addr); if (ret) return ret; - recvmsg->sge.length = t->max_recv_size; - recvmsg->sge.lkey = t->pd->local_dma_lkey; + recvmsg->sge.length = sp->max_recv_size; + recvmsg->sge.lkey = sc->ib.pd->local_dma_lkey; recvmsg->cqe.done = recv_done; wr.wr_cqe = &recvmsg->cqe; @@ -647,14 +737,14 @@ static int smb_direct_post_recv(struct smb_direct_transport *t, wr.sg_list = &recvmsg->sge; wr.num_sge = 1; - ret = ib_post_recv(t->qp, &wr, NULL); + ret = ib_post_recv(sc->ib.qp, &wr, NULL); if (ret) { pr_err("Can't post recv: %d\n", ret); - ib_dma_unmap_single(t->cm_id->device, + ib_dma_unmap_single(sc->ib.dev, recvmsg->sge.addr, recvmsg->sge.length, DMA_FROM_DEVICE); recvmsg->sge.length = 0; - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); return ret; } return ret; @@ -663,15 +753,16 @@ static int smb_direct_post_recv(struct smb_direct_transport *t, static int smb_direct_read(struct ksmbd_transport *t, char *buf, unsigned int size, int unused) { - struct smb_direct_recvmsg *recvmsg; - struct smb_direct_data_transfer *data_transfer; + struct smbdirect_recv_io *recvmsg; + struct smbdirect_data_transfer *data_transfer; int to_copy, to_read, data_read, offset; u32 data_length, remaining_data_length, data_offset; int rc; - struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_transport *st = SMBD_TRANS(t); + struct smbdirect_socket *sc = &st->socket; again: - if (st->status != 
SMB_DIRECT_CS_CONNECTED) { + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { pr_err("disconnected\n"); return -ENOTCONN; } @@ -681,9 +772,10 @@ again: * the only one reading from the front of the queue. The transport * may add more entries to the back of the queue at the same time */ - if (st->reassembly_data_length >= size) { + if (sc->recv_io.reassembly.data_length >= size) { int queue_length; int queue_removed = 0; + unsigned long flags; /* * Need to make sure reassembly_data_length is read before @@ -693,13 +785,13 @@ again: * updated in SOFTIRQ as more data is received */ virt_rmb(); - queue_length = st->reassembly_queue_length; + queue_length = sc->recv_io.reassembly.queue_length; data_read = 0; to_read = size; - offset = st->first_entry_offset; + offset = sc->recv_io.reassembly.first_entry_offset; while (data_read < size) { - recvmsg = get_first_reassembly(st); - data_transfer = smb_direct_recvmsg_payload(recvmsg); + recvmsg = get_first_reassembly(sc); + data_transfer = smbdirect_recv_io_payload(recvmsg); data_length = le32_to_cpu(data_transfer->data_length); remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); @@ -739,12 +831,12 @@ again: if (queue_length) { list_del(&recvmsg->list); } else { - spin_lock_irq(&st->reassembly_queue_lock); + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); list_del(&recvmsg->list); - spin_unlock_irq(&st->reassembly_queue_lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); } queue_removed++; - put_recvmsg(st, recvmsg); + put_recvmsg(sc, recvmsg); offset = 0; } else { offset += to_copy; @@ -754,34 +846,24 @@ again: data_read += to_copy; } - spin_lock_irq(&st->reassembly_queue_lock); - st->reassembly_data_length -= data_read; - st->reassembly_queue_length -= queue_removed; - spin_unlock_irq(&st->reassembly_queue_lock); - - spin_lock(&st->receive_credit_lock); - st->count_avail_recvmsg += queue_removed; - if (is_receive_credit_post_required(st->recv_credits, st->count_avail_recvmsg)) { - spin_unlock(&st->receive_credit_lock); - mod_delayed_work(smb_direct_wq, - &st->post_recv_credits_work, 0); - } else { - spin_unlock(&st->receive_credit_lock); - } + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); + sc->recv_io.reassembly.data_length -= data_read; + sc->recv_io.reassembly.queue_length -= queue_removed; + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); - st->first_entry_offset = offset; + sc->recv_io.reassembly.first_entry_offset = offset; ksmbd_debug(RDMA, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", - data_read, st->reassembly_data_length, - st->first_entry_offset); + data_read, sc->recv_io.reassembly.data_length, + sc->recv_io.reassembly.first_entry_offset); read_rfc1002_done: return data_read; } ksmbd_debug(RDMA, "wait_event on more data\n"); - rc = wait_event_interruptible(st->wait_reassembly_queue, - st->reassembly_data_length >= size || - st->status != SMB_DIRECT_CS_CONNECTED); + rc = wait_event_interruptible(sc->recv_io.reassembly.wait_queue, + sc->recv_io.reassembly.data_length >= size || + sc->status != SMBDIRECT_SOCKET_CONNECTED); if (rc) return -EINTR; @@ -790,56 +872,44 @@ read_rfc1002_done: static void smb_direct_post_recv_credits(struct work_struct *work) { - struct smb_direct_transport *t = container_of(work, - struct smb_direct_transport, post_recv_credits_work.work); - struct smb_direct_recvmsg *recvmsg; - int receive_credits, credits = 0; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, 
recv_io.posted.refill_work); + struct smbdirect_recv_io *recvmsg; + int credits = 0; int ret; - spin_lock(&t->receive_credit_lock); - receive_credits = t->recv_credits; - spin_unlock(&t->receive_credit_lock); - - if (receive_credits < t->recv_credit_target) { + if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target) { while (true) { - recvmsg = get_free_recvmsg(t); + recvmsg = get_free_recvmsg(sc); if (!recvmsg) break; - recvmsg->type = SMB_DIRECT_MSG_DATA_TRANSFER; recvmsg->first_segment = false; - ret = smb_direct_post_recv(t, recvmsg); + ret = smb_direct_post_recv(sc, recvmsg); if (ret) { pr_err("Can't post recv: %d\n", ret); - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); break; } credits++; + + atomic_inc(&sc->recv_io.posted.count); } } - spin_lock(&t->receive_credit_lock); - t->recv_credits += credits; - t->count_avail_recvmsg -= credits; - spin_unlock(&t->receive_credit_lock); - - spin_lock(&t->lock_new_recv_credits); - t->new_recv_credits += credits; - spin_unlock(&t->lock_new_recv_credits); - if (credits) - queue_work(smb_direct_wq, &t->send_immediate_work); + queue_work(sc->workqueue, &sc->idle.immediate_work); } static void send_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smb_direct_sendmsg *sendmsg, *sibling; - struct smb_direct_transport *t; + struct smbdirect_send_io *sendmsg, *sibling; + struct smbdirect_socket *sc; struct list_head *pos, *prev, *end; - sendmsg = container_of(wc->wr_cqe, struct smb_direct_sendmsg, cqe); - t = sendmsg->transport; + sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); + sc = sendmsg->socket; ksmbd_debug(RDMA, "Send completed. status='%s (%d)', opcode=%d\n", ib_wc_status_msg(wc->status), wc->status, @@ -849,55 +919,78 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) pr_err("Send error. status='%s (%d)', opcode=%d\n", ib_wc_status_msg(wc->status), wc->status, wc->opcode); - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); } - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); /* iterate and free the list of messages in reverse. the list's head * is invalid. 
*/ - for (pos = &sendmsg->list, prev = pos->prev, end = sendmsg->list.next; + for (pos = &sendmsg->sibling_list, prev = pos->prev, end = sendmsg->sibling_list.next; prev != end; pos = prev, prev = prev->prev) { - sibling = container_of(pos, struct smb_direct_sendmsg, list); - smb_direct_free_sendmsg(t, sibling); + sibling = container_of(pos, struct smbdirect_send_io, sibling_list); + smb_direct_free_sendmsg(sc, sibling); } - sibling = container_of(pos, struct smb_direct_sendmsg, list); - smb_direct_free_sendmsg(t, sibling); + sibling = container_of(pos, struct smbdirect_send_io, sibling_list); + smb_direct_free_sendmsg(sc, sibling); } -static int manage_credits_prior_sending(struct smb_direct_transport *t) +static int manage_credits_prior_sending(struct smbdirect_socket *sc) { int new_credits; - spin_lock(&t->lock_new_recv_credits); - new_credits = t->new_recv_credits; - t->new_recv_credits = 0; - spin_unlock(&t->lock_new_recv_credits); + if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target) + return 0; + + new_credits = atomic_read(&sc->recv_io.posted.count); + if (new_credits == 0) + return 0; + + new_credits -= atomic_read(&sc->recv_io.credits.count); + if (new_credits <= 0) + return 0; + atomic_add(new_credits, &sc->recv_io.credits.count); return new_credits; } -static int smb_direct_post_send(struct smb_direct_transport *t, +static int manage_keep_alive_before_sending(struct smbdirect_socket *sc) +{ + struct smbdirect_socket_parameters *sp = &sc->parameters; + + if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT; + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); + return 1; + } + return 0; +} + +static int smb_direct_post_send(struct smbdirect_socket *sc, struct ib_send_wr *wr) { int ret; - atomic_inc(&t->send_pending); - ret = ib_post_send(t->qp, wr, NULL); + atomic_inc(&sc->send_io.pending.count); + ret = ib_post_send(sc->ib.qp, wr, NULL); if (ret) { pr_err("failed to post send: %d\n", ret); - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); - smb_direct_disconnect_rdma_connection(t); + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); + smb_direct_disconnect_rdma_connection(sc); } return ret; } -static void smb_direct_send_ctx_init(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, +static void smb_direct_send_ctx_init(struct smbdirect_send_batch *send_ctx, bool need_invalidate_rkey, unsigned int remote_key) { @@ -907,47 +1000,50 @@ static void smb_direct_send_ctx_init(struct smb_direct_transport *t, send_ctx->remote_key = remote_key; } -static int smb_direct_flush_send_list(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, +static int smb_direct_flush_send_list(struct smbdirect_socket *sc, + struct smbdirect_send_batch *send_ctx, bool is_last) { - struct smb_direct_sendmsg *first, *last; + struct smbdirect_send_io *first, *last; int ret; if (list_empty(&send_ctx->msg_list)) return 0; first = list_first_entry(&send_ctx->msg_list, - struct smb_direct_sendmsg, - list); + struct smbdirect_send_io, + sibling_list); last = list_last_entry(&send_ctx->msg_list, - struct smb_direct_sendmsg, - list); + struct smbdirect_send_io, + sibling_list); + + if (send_ctx->need_invalidate_rkey) { + first->wr.opcode = IB_WR_SEND_WITH_INV; + 
first->wr.ex.invalidate_rkey = send_ctx->remote_key; + send_ctx->need_invalidate_rkey = false; + send_ctx->remote_key = 0; + } last->wr.send_flags = IB_SEND_SIGNALED; last->wr.wr_cqe = &last->cqe; - if (is_last && send_ctx->need_invalidate_rkey) { - last->wr.opcode = IB_WR_SEND_WITH_INV; - last->wr.ex.invalidate_rkey = send_ctx->remote_key; - } - ret = smb_direct_post_send(t, &first->wr); + ret = smb_direct_post_send(sc, &first->wr); if (!ret) { - smb_direct_send_ctx_init(t, send_ctx, + smb_direct_send_ctx_init(send_ctx, send_ctx->need_invalidate_rkey, send_ctx->remote_key); } else { - atomic_add(send_ctx->wr_cnt, &t->send_credits); - wake_up(&t->wait_send_credits); + atomic_add(send_ctx->wr_cnt, &sc->send_io.credits.count); + wake_up(&sc->send_io.credits.wait_queue); list_for_each_entry_safe(first, last, &send_ctx->msg_list, - list) { - smb_direct_free_sendmsg(t, first); + sibling_list) { + smb_direct_free_sendmsg(sc, first); } } return ret; } -static int wait_for_credits(struct smb_direct_transport *t, +static int wait_for_credits(struct smbdirect_socket *sc, wait_queue_head_t *waitq, atomic_t *total_credits, int needed) { @@ -960,61 +1056,68 @@ static int wait_for_credits(struct smb_direct_transport *t, atomic_add(needed, total_credits); ret = wait_event_interruptible(*waitq, atomic_read(total_credits) >= needed || - t->status != SMB_DIRECT_CS_CONNECTED); + sc->status != SMBDIRECT_SOCKET_CONNECTED); - if (t->status != SMB_DIRECT_CS_CONNECTED) + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return -ENOTCONN; else if (ret < 0) return ret; } while (true); } -static int wait_for_send_credits(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx) +static int wait_for_send_credits(struct smbdirect_socket *sc, + struct smbdirect_send_batch *send_ctx) { int ret; if (send_ctx && - (send_ctx->wr_cnt >= 16 || atomic_read(&t->send_credits) <= 1)) { - ret = smb_direct_flush_send_list(t, send_ctx, false); + (send_ctx->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) { + ret = smb_direct_flush_send_list(sc, send_ctx, false); if (ret) return ret; } - return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1); + return wait_for_credits(sc, &sc->send_io.credits.wait_queue, &sc->send_io.credits.count, 1); } -static int wait_for_rw_credits(struct smb_direct_transport *t, int credits) +static int wait_for_rw_credits(struct smbdirect_socket *sc, int credits) { - return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits); + return wait_for_credits(sc, + &sc->rw_io.credits.wait_queue, + &sc->rw_io.credits.count, + credits); } -static int calc_rw_credits(struct smb_direct_transport *t, +static int calc_rw_credits(struct smbdirect_socket *sc, char *buf, unsigned int len) { return DIV_ROUND_UP(get_buf_page_count(buf, len), - t->pages_per_rw_credit); + sc->rw_io.credits.num_pages); } -static int smb_direct_create_header(struct smb_direct_transport *t, +static int smb_direct_create_header(struct smbdirect_socket *sc, int size, int remaining_data_length, - struct smb_direct_sendmsg **sendmsg_out) + struct smbdirect_send_io **sendmsg_out) { - struct smb_direct_sendmsg *sendmsg; - struct smb_direct_data_transfer *packet; + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_send_io *sendmsg; + struct smbdirect_data_transfer *packet; int header_length; int ret; - sendmsg = smb_direct_alloc_sendmsg(t); + sendmsg = smb_direct_alloc_sendmsg(sc); if (IS_ERR(sendmsg)) return PTR_ERR(sendmsg); /* Fill in the packet header */ - packet = (struct 
smb_direct_data_transfer *)sendmsg->packet; - packet->credits_requested = cpu_to_le16(t->send_credit_target); - packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(t)); + packet = (struct smbdirect_data_transfer *)sendmsg->packet; + packet->credits_requested = cpu_to_le16(sp->send_credit_target); + packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc)); packet->flags = 0; + if (manage_keep_alive_before_sending(sc)) + packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); + packet->reserved = 0; if (!size) packet->data_offset = 0; @@ -1033,25 +1136,25 @@ static int smb_direct_create_header(struct smb_direct_transport *t, le32_to_cpu(packet->remaining_data_length)); /* Map the packet to DMA */ - header_length = sizeof(struct smb_direct_data_transfer); + header_length = sizeof(struct smbdirect_data_transfer); /* If this is a packet without payload, don't send padding */ if (!size) header_length = - offsetof(struct smb_direct_data_transfer, padding); + offsetof(struct smbdirect_data_transfer, padding); - sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device, + sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, (void *)packet, header_length, DMA_TO_DEVICE); - ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr); + ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); if (ret) { - smb_direct_free_sendmsg(t, sendmsg); + smb_direct_free_sendmsg(sc, sendmsg); return ret; } sendmsg->num_sge = 1; sendmsg->sge[0].length = header_length; - sendmsg->sge[0].lkey = t->pd->local_dma_lkey; + sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey; *sendmsg_out = sendmsg; return 0; @@ -1101,14 +1204,14 @@ static int get_mapped_sg_list(struct ib_device *device, void *buf, int size, return ib_dma_map_sg(device, sg_list, npages, dir); } -static int post_sendmsg(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, - struct smb_direct_sendmsg *msg) +static int post_sendmsg(struct smbdirect_socket *sc, + struct smbdirect_send_batch *send_ctx, + struct smbdirect_send_io *msg) { int i; for (i = 0; i < msg->num_sge; i++) - ib_dma_sync_single_for_device(t->cm_id->device, + ib_dma_sync_single_for_device(sc->ib.dev, msg->sge[i].addr, msg->sge[i].length, DMA_TO_DEVICE); @@ -1122,34 +1225,34 @@ static int post_sendmsg(struct smb_direct_transport *t, msg->wr.wr_cqe = NULL; msg->wr.send_flags = 0; if (!list_empty(&send_ctx->msg_list)) { - struct smb_direct_sendmsg *last; + struct smbdirect_send_io *last; last = list_last_entry(&send_ctx->msg_list, - struct smb_direct_sendmsg, - list); + struct smbdirect_send_io, + sibling_list); last->wr.next = &msg->wr; } - list_add_tail(&msg->list, &send_ctx->msg_list); + list_add_tail(&msg->sibling_list, &send_ctx->msg_list); send_ctx->wr_cnt++; return 0; } msg->wr.wr_cqe = &msg->cqe; msg->wr.send_flags = IB_SEND_SIGNALED; - return smb_direct_post_send(t, &msg->wr); + return smb_direct_post_send(sc, &msg->wr); } -static int smb_direct_post_send_data(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, +static int smb_direct_post_send_data(struct smbdirect_socket *sc, + struct smbdirect_send_batch *send_ctx, struct kvec *iov, int niov, int remaining_data_length) { int i, j, ret; - struct smb_direct_sendmsg *msg; + struct smbdirect_send_io *msg; int data_length; - struct scatterlist sg[SMB_DIRECT_MAX_SEND_SGES - 1]; + struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1]; - ret = wait_for_send_credits(t, send_ctx); + ret = wait_for_send_credits(sc, send_ctx); if (ret) return ret; @@ -1157,10 
+1260,10 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, for (i = 0; i < niov; i++) data_length += iov[i].iov_len; - ret = smb_direct_create_header(t, data_length, remaining_data_length, + ret = smb_direct_create_header(sc, data_length, remaining_data_length, &msg); if (ret) { - atomic_inc(&t->send_credits); + atomic_inc(&sc->send_io.credits.count); return ret; } @@ -1168,19 +1271,19 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, struct ib_sge *sge; int sg_cnt; - sg_init_table(sg, SMB_DIRECT_MAX_SEND_SGES - 1); - sg_cnt = get_mapped_sg_list(t->cm_id->device, + sg_init_table(sg, SMBDIRECT_SEND_IO_MAX_SGE - 1); + sg_cnt = get_mapped_sg_list(sc->ib.dev, iov[i].iov_base, iov[i].iov_len, - sg, SMB_DIRECT_MAX_SEND_SGES - 1, + sg, SMBDIRECT_SEND_IO_MAX_SGE - 1, DMA_TO_DEVICE); if (sg_cnt <= 0) { pr_err("failed to map buffer\n"); ret = -ENOMEM; goto err; - } else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES) { + } else if (sg_cnt + msg->num_sge > SMBDIRECT_SEND_IO_MAX_SGE) { pr_err("buffer not fitted into sges\n"); ret = -E2BIG; - ib_dma_unmap_sg(t->cm_id->device, sg, sg_cnt, + ib_dma_unmap_sg(sc->ib.dev, sg, sg_cnt, DMA_TO_DEVICE); goto err; } @@ -1189,18 +1292,18 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, sge = &msg->sge[msg->num_sge]; sge->addr = sg_dma_address(&sg[j]); sge->length = sg_dma_len(&sg[j]); - sge->lkey = t->pd->local_dma_lkey; + sge->lkey = sc->ib.pd->local_dma_lkey; msg->num_sge++; } } - ret = post_sendmsg(t, send_ctx, msg); + ret = post_sendmsg(sc, send_ctx, msg); if (ret) goto err; return 0; err: - smb_direct_free_sendmsg(t, msg); - atomic_inc(&t->send_credits); + smb_direct_free_sendmsg(sc, msg); + atomic_inc(&sc->send_io.credits.count); return ret; } @@ -1208,79 +1311,133 @@ static int smb_direct_writev(struct ksmbd_transport *t, struct kvec *iov, int niovs, int buflen, bool need_invalidate, unsigned int remote_key) { - struct smb_direct_transport *st = smb_trans_direct_transfort(t); - int remaining_data_length; - int start, i, j; - int max_iov_size = st->max_send_size - - sizeof(struct smb_direct_data_transfer); + struct smb_direct_transport *st = SMBD_TRANS(t); + struct smbdirect_socket *sc = &st->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; + size_t remaining_data_length; + size_t iov_idx; + size_t iov_ofs; + size_t max_iov_size = sp->max_send_size - + sizeof(struct smbdirect_data_transfer); int ret; - struct kvec vec; - struct smb_direct_send_ctx send_ctx; + struct smbdirect_send_batch send_ctx; + int error = 0; - if (st->status != SMB_DIRECT_CS_CONNECTED) + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return -ENOTCONN; //FIXME: skip RFC1002 header.. 
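 /*
  * Editor's illustration (not part of the patch): the rewritten loop
  * below replaces the old start/i index pair with a byte cursor
  * (iov_idx, iov_ofs) into the kvec array. A minimal standalone sketch
  * of how such a cursor advances across kvec boundaries (hypothetical
  * helper, kernel-style C):
  *
  *	static void cursor_advance(const struct kvec *iov, size_t niovs,
  *				   size_t *idx, size_t *ofs, size_t len)
  *	{
  *		while (len && *idx < niovs) {
  *			size_t n = min(len, iov[*idx].iov_len - *ofs);
  *
  *			*ofs += n;	// consume within the current kvec
  *			len -= n;
  *			if (*ofs >= iov[*idx].iov_len) {
  *				(*idx)++;	// step to the next kvec
  *				*ofs = 0;
  *			}
  *		}
  *	}
  */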
+ if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4))
+ return -EINVAL;
 buflen -= 4;
+ iov_idx = 1;
+ iov_ofs = 0;
 remaining_data_length = buflen;
 ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen);
 
- smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key);
- start = i = 1;
- buflen = 0;
- while (true) {
- buflen += iov[i].iov_len;
- if (buflen > max_iov_size) {
- if (i > start) {
- remaining_data_length -=
- (buflen - iov[i].iov_len);
- ret = smb_direct_post_send_data(st, &send_ctx,
- &iov[start], i - start,
- remaining_data_length);
- if (ret)
+ smb_direct_send_ctx_init(&send_ctx, need_invalidate, remote_key);
+ while (remaining_data_length) {
+ struct kvec vecs[SMBDIRECT_SEND_IO_MAX_SGE - 1]; /* minus smbdirect hdr */
+ size_t possible_bytes = max_iov_size;
+ size_t possible_vecs;
+ size_t bytes = 0;
+ size_t nvecs = 0;
+
+ /*
+ * For the last message remaining_data_length should
+ * have been 0 already!
+ */
+ if (WARN_ON_ONCE(iov_idx >= niovs)) {
+ error = -EINVAL;
+ goto done;
+ }
+
+ /*
+ * We have 2 factors which limit the arguments we pass
+ * to smb_direct_post_send_data():
+ *
+ * 1. The number of supported sges for the send,
+ * while one is reserved for the smbdirect header.
+ * And we currently need one SGE per page.
+ * 2. The number of negotiated payload bytes per send.
+ */
+ possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx);
+
+ while (iov_idx < niovs && possible_vecs && possible_bytes) {
+ struct kvec *v = &vecs[nvecs];
+ int page_count;
+
+ v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs;
+ v->iov_len = min_t(size_t,
+ iov[iov_idx].iov_len - iov_ofs,
+ possible_bytes);
+ page_count = get_buf_page_count(v->iov_base, v->iov_len);
+ if (page_count > possible_vecs) {
+ /*
+ * If the number of pages in the buffer
+ * is too large (because we currently require
+ * one SGE per page), we need to limit the
+ * length.
+ *
+ * We know possible_vecs is at least 1,
+ * so we always keep the first page.
+ *
+ * We need to calculate the number of extra
+ * pages (epages) we can also keep.
+ *
+ * We calculate the number of bytes in the
+ * first page (fplen); this should never be
+ * larger than v->iov_len because page_count is
+ * at least 2, but adding a limitation feels
+ * better.
+ *
+ * Then we calculate the number of bytes (elen)
+ * we can keep for the extra pages.
+ */
+ size_t epages = possible_vecs - 1;
+ size_t fpofs = offset_in_page(v->iov_base);
+ size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len);
+ size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE);
+
+ v->iov_len = fplen + elen;
+ page_count = get_buf_page_count(v->iov_base, v->iov_len);
+ if (WARN_ON_ONCE(page_count > possible_vecs)) {
+ /*
+ * Something went wrong in the above
+ * logic...
+ */ + error = -EINVAL; goto done; - } else { - /* iov[start] is too big, break it */ - int nvec = (buflen + max_iov_size - 1) / - max_iov_size; - - for (j = 0; j < nvec; j++) { - vec.iov_base = - (char *)iov[start].iov_base + - j * max_iov_size; - vec.iov_len = - min_t(int, max_iov_size, - buflen - max_iov_size * j); - remaining_data_length -= vec.iov_len; - ret = smb_direct_post_send_data(st, &send_ctx, &vec, 1, - remaining_data_length); - if (ret) - goto done; } - i++; - if (i == niovs) - break; } - start = i; - buflen = 0; - } else { - i++; - if (i == niovs) { - /* send out all remaining vecs */ - remaining_data_length -= buflen; - ret = smb_direct_post_send_data(st, &send_ctx, - &iov[start], i - start, - remaining_data_length); - if (ret) - goto done; - break; + possible_vecs -= page_count; + nvecs += 1; + possible_bytes -= v->iov_len; + bytes += v->iov_len; + + iov_ofs += v->iov_len; + if (iov_ofs >= iov[iov_idx].iov_len) { + iov_idx += 1; + iov_ofs = 0; } } + + remaining_data_length -= bytes; + + ret = smb_direct_post_send_data(sc, &send_ctx, + vecs, nvecs, + remaining_data_length); + if (unlikely(ret)) { + error = ret; + goto done; + } } done: - ret = smb_direct_flush_send_list(st, &send_ctx, true); + ret = smb_direct_flush_send_list(sc, &send_ctx, true); + if (unlikely(!ret && error)) + ret = error; /* * As an optimization, we don't wait for individual I/O to finish @@ -1289,16 +1446,22 @@ done: * that means all the I/Os have been out and we are good to return */ - wait_event(st->wait_send_pending, - atomic_read(&st->send_pending) == 0); + wait_event(sc->send_io.pending.zero_wait_queue, + atomic_read(&sc->send_io.pending.count) == 0 || + sc->status != SMBDIRECT_SOCKET_CONNECTED); + if (sc->status != SMBDIRECT_SOCKET_CONNECTED && ret == 0) + ret = -ENOTCONN; + return ret; } static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t, - struct smb_direct_rdma_rw_msg *msg, + struct smbdirect_rw_io *msg, enum dma_data_direction dir) { - rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, + struct smbdirect_socket *sc = &t->socket; + + rdma_rw_ctx_destroy(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, msg->sgt.sgl, msg->sgt.nents, dir); sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); kfree(msg); @@ -1307,16 +1470,16 @@ static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t, static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, enum dma_data_direction dir) { - struct smb_direct_rdma_rw_msg *msg = container_of(wc->wr_cqe, - struct smb_direct_rdma_rw_msg, cqe); - struct smb_direct_transport *t = msg->t; + struct smbdirect_rw_io *msg = + container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe); + struct smbdirect_socket *sc = msg->socket; if (wc->status != IB_WC_SUCCESS) { - msg->status = -EIO; + msg->error = -EIO; pr_err("read/write error. 
opcode = %d, status = %s(%d)\n", wc->opcode, ib_wc_status_msg(wc->status), wc->status); if (wc->status != IB_WC_WR_FLUSH_ERR) - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); } complete(msg->completion); @@ -1334,11 +1497,13 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc) static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf, int buf_len, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len, bool is_read) { - struct smb_direct_rdma_rw_msg *msg, *next_msg; + struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_rw_io *msg, *next_msg; int i, ret; DECLARE_COMPLETION_ONSTACK(completion); struct ib_send_wr *first_wr; @@ -1347,10 +1512,10 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, int credits_needed; unsigned int desc_buf_len, desc_num = 0; - if (t->status != SMB_DIRECT_CS_CONNECTED) + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return -ENOTCONN; - if (buf_len > t->max_rdma_rw_size) + if (buf_len > sp->max_read_write_size) return -EINVAL; /* calculate needed credits */ @@ -1370,7 +1535,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, buf_len = 0; } - credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len); + credits_needed += calc_rw_credits(sc, desc_buf, desc_buf_len); desc_buf += desc_buf_len; buf_len -= desc_buf_len; desc_num++; @@ -1379,7 +1544,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", str_read_write(is_read), buf_len, credits_needed); - ret = wait_for_rw_credits(t, credits_needed); + ret = wait_for_rw_credits(sc, credits_needed); if (ret < 0) return ret; @@ -1395,7 +1560,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, desc_buf_len = le32_to_cpu(desc[i].length); - msg->t = t; + msg->socket = sc; msg->cqe.done = is_read ? read_done : write_done; msg->completion = &completion; @@ -1417,7 +1582,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, goto out; } - ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port, + ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, msg->sgt.sgl, get_buf_page_count(desc_buf, desc_buf_len), 0, @@ -1438,96 +1603,94 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, /* concatenate work requests of rdma_rw_ctxs */ first_wr = NULL; list_for_each_entry_reverse(msg, &msg_list, list) { - first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port, + first_wr = rdma_rw_ctx_wrs(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, &msg->cqe, first_wr); } - ret = ib_post_send(t->qp, first_wr, NULL); + ret = ib_post_send(sc->ib.qp, first_wr, NULL); if (ret) { pr_err("failed to post send wr for RDMA R/W: %d\n", ret); goto out; } - msg = list_last_entry(&msg_list, struct smb_direct_rdma_rw_msg, list); + msg = list_last_entry(&msg_list, struct smbdirect_rw_io, list); wait_for_completion(&completion); - ret = msg->status; + ret = msg->error; out: list_for_each_entry_safe(msg, next_msg, &msg_list, list) { list_del(&msg->list); smb_direct_free_rdma_rw_msg(t, msg, is_read ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); } - atomic_add(credits_needed, &t->rw_credits); - wake_up(&t->wait_rw_credits); + atomic_add(credits_needed, &sc->rw_io.credits.count); + wake_up(&sc->rw_io.credits.wait_queue); return ret; } static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { - return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, + return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen, desc, desc_len, false); } static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { - return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, + return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen, desc, desc_len, true); } static void smb_direct_disconnect(struct ksmbd_transport *t) { - struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_transport *st = SMBD_TRANS(t); + struct smbdirect_socket *sc = &st->socket; - ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", st->cm_id); + ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", sc->rdma.cm_id); - smb_direct_disconnect_rdma_work(&st->disconnect_work); - wait_event_interruptible(st->wait_status, - st->status == SMB_DIRECT_CS_DISCONNECTED); free_transport(st); } static void smb_direct_shutdown(struct ksmbd_transport *t) { - struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_transport *st = SMBD_TRANS(t); + struct smbdirect_socket *sc = &st->socket; - ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", st->cm_id); + ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", sc->rdma.cm_id); - smb_direct_disconnect_rdma_work(&st->disconnect_work); + smb_direct_disconnect_rdma_work(&sc->disconnect_work); } static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { - struct smb_direct_transport *t = cm_id->context; + struct smbdirect_socket *sc = cm_id->context; ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n", cm_id, rdma_event_msg(event->event), event->event); switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: { - t->status = SMB_DIRECT_CS_CONNECTED; - wake_up_interruptible(&t->wait_status); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; + wake_up(&sc->status_wait); break; } case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_DISCONNECTED: { - ib_drain_qp(t->qp); + ib_drain_qp(sc->ib.qp); - t->status = SMB_DIRECT_CS_DISCONNECTED; - wake_up_interruptible(&t->wait_status); - wake_up_interruptible(&t->wait_reassembly_queue); - wake_up(&t->wait_send_credits); + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + smb_direct_disconnect_rdma_work(&sc->disconnect_work); break; } case RDMA_CM_EVENT_CONNECT_ERROR: { - t->status = SMB_DIRECT_CS_DISCONNECTED; - wake_up_interruptible(&t->wait_status); + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + smb_direct_disconnect_rdma_work(&sc->disconnect_work); break; } default: @@ -1541,38 +1704,41 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, static void smb_direct_qpair_handler(struct ib_event *event, void *context) { - struct smb_direct_transport *t = context; + struct smbdirect_socket *sc = context; ksmbd_debug(RDMA, "Received QP event. 
cm_id=%p, event=%s (%d)\n", - t->cm_id, ib_event_msg(event->event), event->event); + sc->rdma.cm_id, ib_event_msg(event->event), event->event); switch (event->event) { case IB_EVENT_CQ_ERR: case IB_EVENT_QP_FATAL: - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); break; default: break; } } -static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, +static int smb_direct_send_negotiate_response(struct smbdirect_socket *sc, int failed) { - struct smb_direct_sendmsg *sendmsg; - struct smb_direct_negotiate_resp *resp; + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_send_io *sendmsg; + struct smbdirect_negotiate_resp *resp; int ret; - sendmsg = smb_direct_alloc_sendmsg(t); + sendmsg = smb_direct_alloc_sendmsg(sc); if (IS_ERR(sendmsg)) return -ENOMEM; - resp = (struct smb_direct_negotiate_resp *)sendmsg->packet; + resp = (struct smbdirect_negotiate_resp *)sendmsg->packet; if (failed) { memset(resp, 0, sizeof(*resp)); - resp->min_version = cpu_to_le16(0x0100); - resp->max_version = cpu_to_le16(0x0100); + resp->min_version = SMB_DIRECT_VERSION_LE; + resp->max_version = SMB_DIRECT_VERSION_LE; resp->status = STATUS_NOT_SUPPORTED; + + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; } else { resp->status = STATUS_SUCCESS; resp->min_version = SMB_DIRECT_VERSION_LE; @@ -1580,57 +1746,65 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, resp->negotiated_version = SMB_DIRECT_VERSION_LE; resp->reserved = 0; resp->credits_requested = - cpu_to_le16(t->send_credit_target); - resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(t)); - resp->max_readwrite_size = cpu_to_le32(t->max_rdma_rw_size); - resp->preferred_send_size = cpu_to_le32(t->max_send_size); - resp->max_receive_size = cpu_to_le32(t->max_recv_size); + cpu_to_le16(sp->send_credit_target); + resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc)); + resp->max_readwrite_size = cpu_to_le32(sp->max_read_write_size); + resp->preferred_send_size = cpu_to_le32(sp->max_send_size); + resp->max_receive_size = cpu_to_le32(sp->max_recv_size); resp->max_fragmented_size = - cpu_to_le32(t->max_fragmented_recv_size); + cpu_to_le32(sp->max_fragmented_recv_size); + + sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER; + sc->status = SMBDIRECT_SOCKET_CONNECTED; } - sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device, + sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, (void *)resp, sizeof(*resp), DMA_TO_DEVICE); - ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr); + ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); if (ret) { - smb_direct_free_sendmsg(t, sendmsg); + smb_direct_free_sendmsg(sc, sendmsg); return ret; } sendmsg->num_sge = 1; sendmsg->sge[0].length = sizeof(*resp); - sendmsg->sge[0].lkey = t->pd->local_dma_lkey; + sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey; - ret = post_sendmsg(t, NULL, sendmsg); + ret = post_sendmsg(sc, NULL, sendmsg); if (ret) { - smb_direct_free_sendmsg(t, sendmsg); + smb_direct_free_sendmsg(sc, sendmsg); return ret; } - wait_event(t->wait_send_pending, - atomic_read(&t->send_pending) == 0); + wait_event(sc->send_io.pending.zero_wait_queue, + atomic_read(&sc->send_io.pending.count) == 0 || + sc->status != SMBDIRECT_SOCKET_CONNECTED); + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + return -ENOTCONN; + return 0; } -static int smb_direct_accept_client(struct smb_direct_transport *t) +static int smb_direct_accept_client(struct smbdirect_socket 
*sc) { + struct smbdirect_socket_parameters *sp = &sc->parameters; struct rdma_conn_param conn_param; - struct ib_port_immutable port_immutable; - u32 ird_ord_hdr[2]; + __be32 ird_ord_hdr[2]; int ret; + /* + * smb_direct_handle_connect_request() + * already negotiated sp->initiator_depth + * and sp->responder_resources + */ memset(&conn_param, 0, sizeof(conn_param)); - conn_param.initiator_depth = min_t(u8, t->cm_id->device->attrs.max_qp_rd_atom, - SMB_DIRECT_CM_INITIATOR_DEPTH); - conn_param.responder_resources = 0; - - t->cm_id->device->ops.get_port_immutable(t->cm_id->device, - t->cm_id->port_num, - &port_immutable); - if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { - ird_ord_hdr[0] = conn_param.responder_resources; - ird_ord_hdr[1] = 1; + conn_param.initiator_depth = sp->initiator_depth; + conn_param.responder_resources = sp->responder_resources; + + if (sc->rdma.legacy_iwarp) { + ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); + ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); conn_param.private_data = ird_ord_hdr; conn_param.private_data_len = sizeof(ird_ord_hdr); } else { @@ -1641,7 +1815,17 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY; conn_param.flow_control = 0; - ret = rdma_accept(t->cm_id, &conn_param); + /* + * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING + * so that the timer will cause a disconnect. + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->negotiate_timeout_msec)); + + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; + ret = rdma_accept(sc->rdma.cm_id, &conn_param); if (ret) { pr_err("error at rdma_accept: %d\n", ret); return ret; @@ -1649,57 +1833,60 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) return 0; } -static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) +static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) { + struct smbdirect_recv_io *recvmsg; int ret; - struct smb_direct_recvmsg *recvmsg; - recvmsg = get_free_recvmsg(t); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; + + sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ; + + recvmsg = get_free_recvmsg(sc); if (!recvmsg) return -ENOMEM; - recvmsg->type = SMB_DIRECT_MSG_NEGOTIATE_REQ; - ret = smb_direct_post_recv(t, recvmsg); + ret = smb_direct_post_recv(sc, recvmsg); if (ret) { pr_err("Can't post recv: %d\n", ret); goto out_err; } - t->negotiation_requested = false; - ret = smb_direct_accept_client(t); + ret = smb_direct_accept_client(sc); if (ret) { pr_err("Can't accept client\n"); goto out_err; } - smb_direct_post_recv_credits(&t->post_recv_credits_work.work); + smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work); return 0; out_err: - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); return ret; } -static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t) +static unsigned int smb_direct_get_max_fr_pages(struct smbdirect_socket *sc) { return min_t(unsigned int, - t->cm_id->device->attrs.max_fast_reg_page_list_len, + sc->ib.dev->attrs.max_fast_reg_page_list_len, 256); } -static int smb_direct_init_params(struct smb_direct_transport *t, +static int smb_direct_init_params(struct smbdirect_socket *sc, struct ib_qp_cap *cap) { - struct ib_device *device = 
t->cm_id->device; + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct ib_device *device = sc->ib.dev; int max_send_sges, max_rw_wrs, max_send_wrs; unsigned int max_sge_per_wr, wrs_per_credit; /* need 3 more sge. because a SMB_DIRECT header, SMB2 header, * SMB2 response could be mapped. */ - t->max_send_size = smb_direct_max_send_size; - max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 3; - if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) { - pr_err("max_send_size %d is too large\n", t->max_send_size); + max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3; + if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) { + pr_err("max_send_size %d is too large\n", sp->max_send_size); return -EINVAL; } @@ -1710,10 +1897,9 @@ static int smb_direct_init_params(struct smb_direct_transport *t, * are needed for MR registration, RDMA R/W, local & remote * MR invalidation. */ - t->max_rdma_rw_size = smb_direct_max_read_write_size; - t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t); - t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size, - (t->pages_per_rw_credit - 1) * + sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(sc); + sc->rw_io.credits.max = DIV_ROUND_UP(sp->max_read_write_size, + (sc->rw_io.credits.num_pages - 1) * PAGE_SIZE); max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge, @@ -1721,233 +1907,244 @@ static int smb_direct_init_params(struct smb_direct_transport *t, max_sge_per_wr = max_t(unsigned int, max_sge_per_wr, max_send_sges); wrs_per_credit = max_t(unsigned int, 4, - DIV_ROUND_UP(t->pages_per_rw_credit, + DIV_ROUND_UP(sc->rw_io.credits.num_pages, max_sge_per_wr) + 1); - max_rw_wrs = t->max_rw_credits * wrs_per_credit; + max_rw_wrs = sc->rw_io.credits.max * wrs_per_credit; - max_send_wrs = smb_direct_send_credit_target + max_rw_wrs; + max_send_wrs = sp->send_credit_target + max_rw_wrs; if (max_send_wrs > device->attrs.max_cqe || max_send_wrs > device->attrs.max_qp_wr) { pr_err("consider lowering send_credit_target = %d\n", - smb_direct_send_credit_target); + sp->send_credit_target); pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", device->attrs.max_cqe, device->attrs.max_qp_wr); return -EINVAL; } - if (smb_direct_receive_credit_max > device->attrs.max_cqe || - smb_direct_receive_credit_max > device->attrs.max_qp_wr) { + if (sp->recv_credit_max > device->attrs.max_cqe || + sp->recv_credit_max > device->attrs.max_qp_wr) { pr_err("consider lowering receive_credit_max = %d\n", - smb_direct_receive_credit_max); + sp->recv_credit_max); pr_err("Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", device->attrs.max_cqe, device->attrs.max_qp_wr); return -EINVAL; } - if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) { + if (device->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE) { + pr_err("warning: device max_send_sge = %d too small\n", + device->attrs.max_send_sge); + return -EINVAL; + } + if (device->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) { pr_err("warning: device max_recv_sge = %d too small\n", device->attrs.max_recv_sge); return -EINVAL; } - t->recv_credits = 0; - t->count_avail_recvmsg = 0; - - t->recv_credit_max = smb_direct_receive_credit_max; - t->recv_credit_target = 10; - t->new_recv_credits = 0; - - t->send_credit_target = smb_direct_send_credit_target; - atomic_set(&t->send_credits, 0); - atomic_set(&t->rw_credits, t->max_rw_credits); + sc->recv_io.credits.target = 1; - t->max_send_size = smb_direct_max_send_size; - t->max_recv_size = smb_direct_max_receive_size; - 
t->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; + atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max); cap->max_send_wr = max_send_wrs; - cap->max_recv_wr = t->recv_credit_max; - cap->max_send_sge = max_sge_per_wr; - cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; + cap->max_recv_wr = sp->recv_credit_max; + cap->max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; + cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; cap->max_inline_data = 0; - cap->max_rdma_ctxs = t->max_rw_credits; + cap->max_rdma_ctxs = sc->rw_io.credits.max; return 0; } -static void smb_direct_destroy_pools(struct smb_direct_transport *t) +static void smb_direct_destroy_pools(struct smbdirect_socket *sc) { - struct smb_direct_recvmsg *recvmsg; + struct smbdirect_recv_io *recvmsg; - while ((recvmsg = get_free_recvmsg(t))) - mempool_free(recvmsg, t->recvmsg_mempool); + while ((recvmsg = get_free_recvmsg(sc))) + mempool_free(recvmsg, sc->recv_io.mem.pool); - mempool_destroy(t->recvmsg_mempool); - t->recvmsg_mempool = NULL; + mempool_destroy(sc->recv_io.mem.pool); + sc->recv_io.mem.pool = NULL; - kmem_cache_destroy(t->recvmsg_cache); - t->recvmsg_cache = NULL; + kmem_cache_destroy(sc->recv_io.mem.cache); + sc->recv_io.mem.cache = NULL; - mempool_destroy(t->sendmsg_mempool); - t->sendmsg_mempool = NULL; + mempool_destroy(sc->send_io.mem.pool); + sc->send_io.mem.pool = NULL; - kmem_cache_destroy(t->sendmsg_cache); - t->sendmsg_cache = NULL; + kmem_cache_destroy(sc->send_io.mem.cache); + sc->send_io.mem.cache = NULL; } -static int smb_direct_create_pools(struct smb_direct_transport *t) +static int smb_direct_create_pools(struct smbdirect_socket *sc) { + struct smbdirect_socket_parameters *sp = &sc->parameters; char name[80]; int i; - struct smb_direct_recvmsg *recvmsg; + struct smbdirect_recv_io *recvmsg; - snprintf(name, sizeof(name), "smb_direct_rqst_pool_%p", t); - t->sendmsg_cache = kmem_cache_create(name, - sizeof(struct smb_direct_sendmsg) + - sizeof(struct smb_direct_negotiate_resp), + snprintf(name, sizeof(name), "smbdirect_send_io_pool_%p", sc); + sc->send_io.mem.cache = kmem_cache_create(name, + sizeof(struct smbdirect_send_io) + + sizeof(struct smbdirect_negotiate_resp), 0, SLAB_HWCACHE_ALIGN, NULL); - if (!t->sendmsg_cache) + if (!sc->send_io.mem.cache) return -ENOMEM; - t->sendmsg_mempool = mempool_create(t->send_credit_target, + sc->send_io.mem.pool = mempool_create(sp->send_credit_target, mempool_alloc_slab, mempool_free_slab, - t->sendmsg_cache); - if (!t->sendmsg_mempool) + sc->send_io.mem.cache); + if (!sc->send_io.mem.pool) goto err; - snprintf(name, sizeof(name), "smb_direct_resp_%p", t); - t->recvmsg_cache = kmem_cache_create(name, - sizeof(struct smb_direct_recvmsg) + - t->max_recv_size, + snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", sc); + sc->recv_io.mem.cache = kmem_cache_create(name, + sizeof(struct smbdirect_recv_io) + + sp->max_recv_size, 0, SLAB_HWCACHE_ALIGN, NULL); - if (!t->recvmsg_cache) + if (!sc->recv_io.mem.cache) goto err; - t->recvmsg_mempool = - mempool_create(t->recv_credit_max, mempool_alloc_slab, - mempool_free_slab, t->recvmsg_cache); - if (!t->recvmsg_mempool) + sc->recv_io.mem.pool = + mempool_create(sp->recv_credit_max, mempool_alloc_slab, + mempool_free_slab, sc->recv_io.mem.cache); + if (!sc->recv_io.mem.pool) goto err; - INIT_LIST_HEAD(&t->recvmsg_queue); - - for (i = 0; i < t->recv_credit_max; i++) { - recvmsg = mempool_alloc(t->recvmsg_mempool, KSMBD_DEFAULT_GFP); + for (i = 0; i < sp->recv_credit_max; i++) { + recvmsg = 
mempool_alloc(sc->recv_io.mem.pool, KSMBD_DEFAULT_GFP); if (!recvmsg) goto err; - recvmsg->transport = t; + recvmsg->socket = sc; recvmsg->sge.length = 0; - list_add(&recvmsg->list, &t->recvmsg_queue); + list_add(&recvmsg->list, &sc->recv_io.free.list); } - t->count_avail_recvmsg = t->recv_credit_max; return 0; err: - smb_direct_destroy_pools(t); + smb_direct_destroy_pools(sc); return -ENOMEM; } -static int smb_direct_create_qpair(struct smb_direct_transport *t, +static int smb_direct_create_qpair(struct smbdirect_socket *sc, struct ib_qp_cap *cap) { + struct smbdirect_socket_parameters *sp = &sc->parameters; int ret; struct ib_qp_init_attr qp_attr; int pages_per_rw; - t->pd = ib_alloc_pd(t->cm_id->device, 0); - if (IS_ERR(t->pd)) { + sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); + if (IS_ERR(sc->ib.pd)) { pr_err("Can't create RDMA PD\n"); - ret = PTR_ERR(t->pd); - t->pd = NULL; + ret = PTR_ERR(sc->ib.pd); + sc->ib.pd = NULL; return ret; } - t->send_cq = ib_alloc_cq(t->cm_id->device, t, - smb_direct_send_credit_target + cap->max_rdma_ctxs, - 0, IB_POLL_WORKQUEUE); - if (IS_ERR(t->send_cq)) { + sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc, + sp->send_credit_target + + cap->max_rdma_ctxs, + IB_POLL_WORKQUEUE); + if (IS_ERR(sc->ib.send_cq)) { pr_err("Can't create RDMA send CQ\n"); - ret = PTR_ERR(t->send_cq); - t->send_cq = NULL; + ret = PTR_ERR(sc->ib.send_cq); + sc->ib.send_cq = NULL; goto err; } - t->recv_cq = ib_alloc_cq(t->cm_id->device, t, - t->recv_credit_max, 0, IB_POLL_WORKQUEUE); - if (IS_ERR(t->recv_cq)) { + sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc, + sp->recv_credit_max, + IB_POLL_WORKQUEUE); + if (IS_ERR(sc->ib.recv_cq)) { pr_err("Can't create RDMA recv CQ\n"); - ret = PTR_ERR(t->recv_cq); - t->recv_cq = NULL; + ret = PTR_ERR(sc->ib.recv_cq); + sc->ib.recv_cq = NULL; goto err; } memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.event_handler = smb_direct_qpair_handler; - qp_attr.qp_context = t; + qp_attr.qp_context = sc; qp_attr.cap = *cap; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; - qp_attr.send_cq = t->send_cq; - qp_attr.recv_cq = t->recv_cq; + qp_attr.send_cq = sc->ib.send_cq; + qp_attr.recv_cq = sc->ib.recv_cq; qp_attr.port_num = ~0; - ret = rdma_create_qp(t->cm_id, t->pd, &qp_attr); + ret = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr); if (ret) { pr_err("Can't create RDMA QP: %d\n", ret); goto err; } - t->qp = t->cm_id->qp; - t->cm_id->event_handler = smb_direct_cm_handler; + sc->ib.qp = sc->rdma.cm_id->qp; + sc->rdma.cm_id->event_handler = smb_direct_cm_handler; - pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; - if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) { - ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, - t->max_rw_credits, IB_MR_TYPE_MEM_REG, - t->pages_per_rw_credit, 0); + pages_per_rw = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE) + 1; + if (pages_per_rw > sc->ib.dev->attrs.max_sgl_rd) { + ret = ib_mr_pool_init(sc->ib.qp, &sc->ib.qp->rdma_mrs, + sc->rw_io.credits.max, IB_MR_TYPE_MEM_REG, + sc->rw_io.credits.num_pages, 0); if (ret) { - pr_err("failed to init mr pool count %d pages %d\n", - t->max_rw_credits, t->pages_per_rw_credit); + pr_err("failed to init mr pool count %zu pages %zu\n", + sc->rw_io.credits.max, sc->rw_io.credits.num_pages); goto err; } } return 0; err: - if (t->qp) { - t->qp = NULL; - rdma_destroy_qp(t->cm_id); + if (sc->ib.qp) { + sc->ib.qp = NULL; + rdma_destroy_qp(sc->rdma.cm_id); } - if (t->recv_cq) { - ib_destroy_cq(t->recv_cq); - t->recv_cq = NULL; + if (sc->ib.recv_cq) { + 
ib_destroy_cq(sc->ib.recv_cq); + sc->ib.recv_cq = NULL; } - if (t->send_cq) { - ib_destroy_cq(t->send_cq); - t->send_cq = NULL; + if (sc->ib.send_cq) { + ib_destroy_cq(sc->ib.send_cq); + sc->ib.send_cq = NULL; } - if (t->pd) { - ib_dealloc_pd(t->pd); - t->pd = NULL; + if (sc->ib.pd) { + ib_dealloc_pd(sc->ib.pd); + sc->ib.pd = NULL; } return ret; } static int smb_direct_prepare(struct ksmbd_transport *t) { - struct smb_direct_transport *st = smb_trans_direct_transfort(t); - struct smb_direct_recvmsg *recvmsg; - struct smb_direct_negotiate_req *req; + struct smb_direct_transport *st = SMBD_TRANS(t); + struct smbdirect_socket *sc = &st->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_recv_io *recvmsg; + struct smbdirect_negotiate_req *req; + unsigned long flags; int ret; + /* + * We are waiting to pass the following states: + * + * SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED + * SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING + * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED + * + * To finally get to SMBDIRECT_SOCKET_NEGOTIATE_RUNNING + * in order to continue below. + * + * Everything else is unexpected and an error. + */ ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); - ret = wait_event_interruptible_timeout(st->wait_status, - st->negotiation_requested || - st->status == SMB_DIRECT_CS_DISCONNECTED, - SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); - if (ret <= 0 || st->status == SMB_DIRECT_CS_DISCONNECTED) + ret = wait_event_interruptible_timeout(sc->status_wait, + sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED && + sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING && + sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED, + msecs_to_jiffies(sp->negotiate_timeout_msec)); + if (ret <= 0 || sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING) return ret < 0 ? 
ret : -ETIMEDOUT; - recvmsg = get_first_reassembly(st); + recvmsg = get_first_reassembly(sc); if (!recvmsg) return -ECONNABORTED; @@ -1955,51 +2152,54 @@ static int smb_direct_prepare(struct ksmbd_transport *t) if (ret == -ECONNABORTED) goto out; - req = (struct smb_direct_negotiate_req *)recvmsg->packet; - st->max_recv_size = min_t(int, st->max_recv_size, + req = (struct smbdirect_negotiate_req *)recvmsg->packet; + sp->max_recv_size = min_t(int, sp->max_recv_size, le32_to_cpu(req->preferred_send_size)); - st->max_send_size = min_t(int, st->max_send_size, + sp->max_send_size = min_t(int, sp->max_send_size, le32_to_cpu(req->max_receive_size)); - st->max_fragmented_send_size = + sp->max_fragmented_send_size = le32_to_cpu(req->max_fragmented_size); - st->max_fragmented_recv_size = - (st->recv_credit_max * st->max_recv_size) / 2; + sp->max_fragmented_recv_size = + (sp->recv_credit_max * sp->max_recv_size) / 2; + sc->recv_io.credits.target = le16_to_cpu(req->credits_requested); + sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); + sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1); - ret = smb_direct_send_negotiate_response(st, ret); + ret = smb_direct_send_negotiate_response(sc, ret); out: - spin_lock_irq(&st->reassembly_queue_lock); - st->reassembly_queue_length--; + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); + sc->recv_io.reassembly.queue_length--; list_del(&recvmsg->list); - spin_unlock_irq(&st->reassembly_queue_lock); - put_recvmsg(st, recvmsg); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); + put_recvmsg(sc, recvmsg); return ret; } -static int smb_direct_connect(struct smb_direct_transport *st) +static int smb_direct_connect(struct smbdirect_socket *sc) { - int ret; struct ib_qp_cap qp_cap; + int ret; - ret = smb_direct_init_params(st, &qp_cap); + ret = smb_direct_init_params(sc, &qp_cap); if (ret) { pr_err("Can't configure RDMA parameters\n"); return ret; } - ret = smb_direct_create_pools(st); + ret = smb_direct_create_pools(sc); if (ret) { pr_err("Can't init RDMA pool: %d\n", ret); return ret; } - ret = smb_direct_create_qpair(st, &qp_cap); + ret = smb_direct_create_qpair(sc, &qp_cap); if (ret) { pr_err("Can't accept RDMA client: %d\n", ret); return ret; } - ret = smb_direct_prepare_negotiation(st); + ret = smb_direct_prepare_negotiation(sc); if (ret) { pr_err("Can't negotiate: %d\n", ret); return ret; @@ -2016,10 +2216,15 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) return true; } -static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) +static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id, + struct rdma_cm_event *event) { struct smb_direct_transport *t; + struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; struct task_struct *handler; + u8 peer_initiator_depth; + u8 peer_responder_resources; int ret; if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { @@ -2032,8 +2237,71 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) t = alloc_transport(new_cm_id); if (!t) return -ENOMEM; + sc = &t->socket; + sp = &sc->parameters; + + peer_initiator_depth = event->param.conn.initiator_depth; + peer_responder_resources = event->param.conn.responder_resources; + if (rdma_protocol_iwarp(new_cm_id->device, new_cm_id->port_num) && + event->param.conn.private_data_len == 8) { + /* + * Legacy clients with only iWarp MPA v1 support + * need a private blob in order to negotiate + * the IRD/ORD values. 
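As an aside on the blob layout described in the comment above: a minimal, stand-alone sketch (user-space C; decode_ird_ord() is a hypothetical helper, not a kernel or ksmbd API) of decoding such an 8-byte MPA v1 private blob, two big-endian 32-bit values clamped to the u8 range that struct rdma_conn_param can carry:

#include <stdint.h>
#include <stddef.h>

static void decode_ird_ord(const uint8_t blob[8], uint8_t *ird, uint8_t *ord)
{
	uint32_t v[2];
	size_t i;

	/* Two network-order (big-endian) 32-bit words: IRD, then ORD. */
	for (i = 0; i < 2; i++)
		v[i] = ((uint32_t)blob[4 * i] << 24) |
		       ((uint32_t)blob[4 * i + 1] << 16) |
		       ((uint32_t)blob[4 * i + 2] << 8) |
		       (uint32_t)blob[4 * i + 3];

	/* Clamp, since rdma_conn_param stores these fields as u8. */
	*ird = v[0] > UINT8_MAX ? UINT8_MAX : (uint8_t)v[0];
	*ord = v[1] > UINT8_MAX ? UINT8_MAX : (uint8_t)v[1];
}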
+ */ + const __be32 *ird_ord_hdr = event->param.conn.private_data; + u32 ird32 = be32_to_cpu(ird_ord_hdr[0]); + u32 ord32 = be32_to_cpu(ird_ord_hdr[1]); + + /* + * cifs.ko sends the legacy IRD/ORD negotiation + * even if iWarp MPA v2 was used. + * + * Here we check that the values match and only + * mark the client as legacy if they don't match. + */ + if ((u32)event->param.conn.initiator_depth != ird32 || + (u32)event->param.conn.responder_resources != ord32) { + /* + * There are broken clients (old cifs.ko) + * using little endian and also + * struct rdma_conn_param only uses u8 + * for initiator_depth and responder_resources, + * so we truncate the value to U8_MAX. + * + * smb_direct_accept_client() will then + * do the real negotiation in order to + * select the minimum between client and + * server. + */ + ird32 = min_t(u32, ird32, U8_MAX); + ord32 = min_t(u32, ord32, U8_MAX); + + sc->rdma.legacy_iwarp = true; + peer_initiator_depth = (u8)ird32; + peer_responder_resources = (u8)ord32; + } + } + + /* + * First set what we as the server are able to support + */ + sp->initiator_depth = min_t(u8, sp->initiator_depth, + new_cm_id->device->attrs.max_qp_rd_atom); - ret = smb_direct_connect(t); + /* + * negotiate the value by using the minimum + * between client and server if the client provided + * non-zero values. + */ + if (peer_initiator_depth != 0) + sp->initiator_depth = min_t(u8, sp->initiator_depth, + peer_initiator_depth); + if (peer_responder_resources != 0) + sp->responder_resources = min_t(u8, sp->responder_resources, + peer_responder_resources); + + ret = smb_direct_connect(sc); if (ret) goto out_err; @@ -2057,7 +2325,7 @@ static int smb_direct_listen_handler(struct rdma_cm_id *cm_id, { switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: { - int ret = smb_direct_handle_connect_request(cm_id); + int ret = smb_direct_handle_connect_request(cm_id, event); if (ret) { pr_err("Can't create transport: %d\n", ret); @@ -2177,7 +2445,8 @@ int ksmbd_rdma_init(void) * for lack of credits */ smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq", - WQ_HIGHPRI | WQ_MEM_RECLAIM, 0); + WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!smb_direct_wq) return -ENOMEM; diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h index a2291b77488a..3f93c6a9f7e4 100644 --- a/fs/smb/server/transport_rdma.h +++ b/fs/smb/server/transport_rdma.h @@ -11,61 +11,20 @@ #define SMBD_MIN_IOSIZE (512 * 1024) #define SMBD_MAX_IOSIZE (16 * 1024 * 1024) -/* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */ -struct smb_direct_negotiate_req { - __le16 min_version; - __le16 max_version; - __le16 reserved; - __le16 credits_requested; - __le32 preferred_send_size; - __le32 max_receive_size; - __le32 max_fragmented_size; -} __packed; - -/* SMB DIRECT negotiation response packet [MS-SMBD] 2.2.2 */ -struct smb_direct_negotiate_resp { - __le16 min_version; - __le16 max_version; - __le16 negotiated_version; - __le16 reserved; - __le16 credits_requested; - __le16 credits_granted; - __le32 status; - __le32 max_readwrite_size; - __le32 preferred_send_size; - __le32 max_receive_size; - __le32 max_fragmented_size; -} __packed; - -#define SMB_DIRECT_RESPONSE_REQUESTED 0x0001 - -/* SMB DIRECT data transfer packet with payload [MS-SMBD] 2.2.3 */ -struct smb_direct_data_transfer { - __le16 credits_requested; - __le16 credits_granted; - __le16 flags; - __le16 reserved; - __le32 remaining_data_length; - __le32 data_offset; - __le32 data_length; - __le32 padding; - __u8 buffer[]; -} __packed; - #ifdef 
CONFIG_SMB_SERVER_SMBDIRECT int ksmbd_rdma_init(void); void ksmbd_rdma_stop_listening(void); void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); void init_smbd_max_io_size(unsigned int sz); -unsigned int get_smbd_max_read_write_size(void); +unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt); #else static inline int ksmbd_rdma_init(void) { return 0; } static inline void ksmbd_rdma_stop_listening(void) { } static inline void ksmbd_rdma_destroy(void) { } static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; } static inline void init_smbd_max_io_size(unsigned int sz) { } -static inline unsigned int get_smbd_max_read_write_size(void) { return 0; } +static inline unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { return 0; } #endif #endif /* __KSMBD_TRANSPORT_RDMA_H__ */ diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 04539037108c..1cfa688904b2 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -196,7 +196,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) pr_err("File(%s): creation failed (err:%d)\n", name, err); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return err; } @@ -237,7 +237,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) if (!err && dentry != d) ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); return err; @@ -669,7 +669,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, ksmbd_debug(VFS, "vfs_link failed err %d\n", err); out3: - done_path_create(&newpath, dentry); + end_creating_path(&newpath, dentry); out2: path_put(&oldpath); out1: @@ -770,10 +770,9 @@ retry: goto out4; } - rd.old_mnt_idmap = mnt_idmap(old_path->mnt), + rd.mnt_idmap = mnt_idmap(old_path->mnt), rd.old_parent = old_parent, rd.old_dentry = old_child, - rd.new_mnt_idmap = mnt_idmap(new_path.mnt), rd.new_parent = new_path.dentry, rd.new_dentry = new_dentry, rd.flags = flags, @@ -1326,7 +1325,7 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, if (!abs_name) return ERR_PTR(-ENOMEM); - dent = kern_path_create(AT_FDCWD, abs_name, path, flags); + dent = start_creating_path(AT_FDCWD, abs_name, path, flags); kfree(abs_name); return dent; } diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h index 0708155b5caf..78b506c5ef03 100644 --- a/fs/smb/server/vfs_cache.h +++ b/fs/smb/server/vfs_cache.h @@ -112,6 +112,8 @@ struct ksmbd_file { bool is_durable; bool is_persistent; bool is_resilient; + + bool is_posix_ctxt; }; static inline void set_ctx_actor(struct dir_context *ctx, diff --git a/fs/super.c b/fs/super.c index 7f876f32343a..f4fa0e93c463 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1716,49 +1716,6 @@ int get_tree_bdev(struct fs_context *fc, } EXPORT_SYMBOL(get_tree_bdev); -static int test_bdev_super(struct super_block *s, void *data) -{ - return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; -} - -struct dentry *mount_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - struct super_block *s; - int error; - dev_t dev; - - error = lookup_bdev(dev_name, &dev); - if (error) - return ERR_PTR(error); - - flags |= SB_NOSEC; - s = sget(fs_type, test_bdev_super, set_bdev_super, flags, 
&dev); - if (IS_ERR(s)) - return ERR_CAST(s); - - if (s->s_root) { - if ((flags ^ s->s_flags) & SB_RDONLY) { - deactivate_locked_super(s); - return ERR_PTR(-EBUSY); - } - } else { - error = setup_bdev_super(s, flags, NULL); - if (!error) - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - s->s_flags |= SB_ACTIVE; - } - - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_bdev); - void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; @@ -1773,26 +1730,6 @@ void kill_block_super(struct super_block *sb) EXPORT_SYMBOL(kill_block_super); #endif -struct dentry *mount_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - int error; - struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); - - if (IS_ERR(s)) - return ERR_CAST(s); - - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - s->s_flags |= SB_ACTIVE; - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_nodev); - /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. @@ -2314,17 +2251,20 @@ int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", - WQ_MEM_RECLAIM, 0, + WQ_MEM_RECLAIM | WQ_PERCPU, + 0, sb->s_id); if (!wq) return -ENOMEM; + + old = NULL; /* * This has to be atomic as more DIOs can race to create the workqueue */ - old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); - /* Someone created workqueue before us? Free ours... */ - if (old) + if (!try_cmpxchg(&sb->s_dio_done_wq, &old, wq)) { + /* Someone created workqueue before us? Free ours... 
*/ destroy_workqueue(wq); + } return 0; } EXPORT_SYMBOL_GPL(sb_init_dio_done_wq); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1ca143d2f22a..3825e780cc58 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -97,12 +97,9 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, count = size - pos; } - if (!battr->read && !battr->read_new) + if (!battr->read) return -EIO; - if (battr->read_new) - return battr->read_new(of->file, kobj, battr, buf, pos, count); - return battr->read(of->file, kobj, battr, buf, pos, count); } @@ -161,12 +158,9 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, if (!count) return 0; - if (!battr->write && !battr->write_new) + if (!battr->write) return -EIO; - if (battr->write_new) - return battr->write_new(of->file, kobj, battr, buf, pos, count); - return battr->write(of->file, kobj, battr, buf, pos, count); } @@ -335,19 +329,13 @@ int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, const struct kernfs_ops *ops; struct kernfs_node *kn; - if (battr->read && battr->read_new) - return -EINVAL; - - if (battr->write && battr->write_new) - return -EINVAL; - if (battr->mmap) ops = &sysfs_bin_kfops_mmap; - else if ((battr->read || battr->read_new) && (battr->write || battr->write_new)) + else if (battr->read && battr->write) ops = &sysfs_bin_kfops_rw; - else if (battr->read || battr->read_new) + else if (battr->read) ops = &sysfs_bin_kfops_ro; - else if (battr->write || battr->write_new) + else if (battr->write) ops = &sysfs_bin_kfops_wo; else ops = &sysfs_file_kfops_empty; diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index fb5ac358077b..0b14d004a095 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -88,6 +88,8 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, } const struct fscrypt_operations ubifs_crypt_operations = { + .inode_info_offs = (int)offsetof(struct ubifs_inode, i_crypt_info) - + (int)offsetof(struct ubifs_inode, vfs_inode), .legacy_key_prefix = "ubifs:", .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f3e3b2068608..46952a33c4e6 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -335,7 +335,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) static int ubifs_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); @@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode) goto out; dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); - ubifs_assert(c, !atomic_read(&inode->i_count)); + ubifs_assert(c, !icount_read(inode)); truncate_inode_pages_final(&inode->i_data); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 5db45c9e26ee..49e50431741c 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -365,6 +365,7 @@ struct ubifs_gced_idx_leb { * @read_in_a_row: number of consecutive pages read in a row (for bulk read) * @data_len: length of the data attached to the inode * @data: inode's data + * @i_crypt_info: inode's fscrypt information * * @ui_mutex exists for two main reasons. 
At first it prevents inodes from * being written back while UBIFS changing them, being in the middle of an VFS @@ -416,6 +417,9 @@ struct ubifs_inode { pgoff_t read_in_a_row; int data_len; void *data; +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; +#endif }; /** diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 503268cf4296..95ec42b84797 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -19,8 +19,7 @@ struct block_buffer { }; /* Hash a block, writing the result to the next level's pending block buffer. */ -static int hash_one_block(struct inode *inode, - const struct merkle_tree_params *params, +static int hash_one_block(const struct merkle_tree_params *params, struct block_buffer *cur) { struct block_buffer *next = cur + 1; @@ -36,8 +35,7 @@ static int hash_one_block(struct inode *inode, /* Zero-pad the block if it's shorter than the block size. */ memset(&cur->data[cur->filled], 0, params->block_size - cur->filled); - fsverity_hash_block(params, inode, cur->data, - &next->data[next->filled]); + fsverity_hash_block(params, cur->data, &next->data[next->filled]); next->filled += params->digest_size; cur->filled = 0; return 0; @@ -123,7 +121,7 @@ static int build_merkle_tree(struct file *filp, fsverity_err(inode, "Short read of file data"); goto out; } - err = hash_one_block(inode, params, &buffers[-1]); + err = hash_one_block(params, &buffers[-1]); if (err) goto out; for (level = 0; level < num_levels; level++) { @@ -134,7 +132,7 @@ static int build_merkle_tree(struct file *filp, } /* Next block at @level is full */ - err = hash_one_block(inode, params, &buffers[level]); + err = hash_one_block(params, &buffers[level]); if (err) goto out; err = write_merkle_tree_block(inode, @@ -154,7 +152,7 @@ static int build_merkle_tree(struct file *filp, /* Finish all nonempty pending tree blocks. */ for (level = 0; level < num_levels; level++) { if (buffers[level].filled != 0) { - err = hash_one_block(inode, params, &buffers[level]); + err = hash_one_block(params, &buffers[level]); if (err) goto out; err = write_merkle_tree_block(inode, @@ -284,9 +282,9 @@ static int enable_verity(struct file *filp, /* Successfully enabled verity */ /* - * Readers can start using ->i_verity_info immediately, so it - * can't be rolled back once set. So don't set it until just - * after the filesystem has successfully enabled verity. + * Readers can start using the inode's verity info immediately, + * so it can't be rolled back once set. So don't set it until + * just after the filesystem has successfully enabled verity. */ fsverity_set_info(inode, vi); } diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 5fe854a5b9ad..dd20b138d452 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -63,10 +63,11 @@ struct merkle_tree_params { * fsverity_info - cached verity metadata for an inode * * When a verity file is first opened, an instance of this struct is allocated - * and stored in ->i_verity_info; it remains until the inode is evicted. It - * caches information about the Merkle tree that's needed to efficiently verify - * data read from the file. It also caches the file digest. The Merkle tree - * pages themselves are not cached here, but the filesystem may cache them. + * and a pointer to it is stored in the file's in-memory inode. It remains + * until the inode is evicted. It caches information about the Merkle tree + * that's needed to efficiently verify data read from the file. It also caches + * the file digest. 
The Merkle tree pages themselves are not cached here, but + * the filesystem may cache them. */ struct fsverity_info { struct merkle_tree_params tree_params; @@ -89,7 +90,7 @@ union fsverity_hash_ctx * fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size); void fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, const void *data, u8 *out); + const void *data, u8 *out); void fsverity_hash_buffer(const struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out); void __init fsverity_check_hash_algs(void); diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 9bb3c6344907..de53e14c8aa7 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -94,7 +94,6 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, /** * fsverity_hash_block() - hash a single data or hash block * @params: the Merkle tree's parameters - * @inode: inode for which the hashing is being done * @data: virtual address of a buffer containing the block to hash * @out: output digest, size 'params->digest_size' bytes * @@ -102,7 +101,7 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, * in the Merkle tree parameters. */ void fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, const void *data, u8 *out) + const void *data, u8 *out) { union fsverity_hash_ctx ctx; diff --git a/fs/verity/open.c b/fs/verity/open.c index c561e130cd0c..77b1c977af02 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -244,17 +244,17 @@ fail: void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) { /* - * Multiple tasks may race to set ->i_verity_info, so use - * cmpxchg_release(). This pairs with the smp_load_acquire() in - * fsverity_get_info(). I.e., here we publish ->i_verity_info with a - * RELEASE barrier so that other tasks can ACQUIRE it. + * Multiple tasks may race to set the inode's verity info pointer, so + * use cmpxchg_release(). This pairs with the smp_load_acquire() in + * fsverity_get_info(). I.e., publish the pointer with a RELEASE + * barrier so that other tasks can ACQUIRE it. */ - if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) { - /* Lost the race, so free the fsverity_info we allocated. */ + if (cmpxchg_release(fsverity_info_addr(inode), NULL, vi) != NULL) { + /* Lost the race, so free the verity info we allocated. */ fsverity_free_info(vi); /* - * Afterwards, the caller may access ->i_verity_info directly, - * so make sure to ACQUIRE the winning fsverity_info. + * Afterwards, the caller may access the inode's verity info + * directly, so make sure to ACQUIRE the winning verity info. 
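A minimal sketch of the publish-once pattern this comment describes, with hypothetical names (struct foo, foo_publish, foo_get, assuming a kmalloc'd object and <linux/slab.h>); cmpxchg_release() and smp_load_acquire() are the real kernel primitives involved:

/* Publish a fully initialized object exactly once per slot. */
static struct foo *foo_publish(struct foo **slot, struct foo *new)
{
	/* RELEASE pairs with the ACQUIRE in foo_get(). */
	if (cmpxchg_release(slot, NULL, new) != NULL) {
		kfree(new);	/* lost the race; use the winner */
		return smp_load_acquire(slot);
	}
	return new;
}

static struct foo *foo_get(struct foo **slot)
{
	/* NULL until published; afterwards fully visible. */
	return smp_load_acquire(slot);
}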
*/ (void)fsverity_get_info(inode); } @@ -350,7 +350,6 @@ int fsverity_get_descriptor(struct inode *inode, return 0; } -/* Ensure the inode has an ->i_verity_info */ static int ensure_verity_info(struct inode *inode) { struct fsverity_info *vi = fsverity_get_info(inode); @@ -395,8 +394,10 @@ EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr); void __fsverity_cleanup_inode(struct inode *inode) { - fsverity_free_info(inode->i_verity_info); - inode->i_verity_info = NULL; + struct fsverity_info **vi_addr = fsverity_info_addr(inode); + + fsverity_free_info(*vi_addr); + *vi_addr = NULL; } EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); diff --git a/fs/verity/verify.c b/fs/verity/verify.c index a1f00c3fd3b2..86067c8b40cf 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -10,6 +10,31 @@ #include <linux/bio.h> #include <linux/export.h> +#define FS_VERITY_MAX_PENDING_BLOCKS 2 + +struct fsverity_pending_block { + const void *data; + u64 pos; + u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE]; +}; + +struct fsverity_verification_context { + struct inode *inode; + struct fsverity_info *vi; + unsigned long max_ra_pages; + + /* + * This is the queue of data blocks that are pending verification. When + * the crypto layer supports interleaved hashing, we allow multiple + * blocks to be queued up in order to utilize it. This can improve + * performance significantly vs. sequential hashing of each block. + */ + int num_pending; + int max_pending; + struct fsverity_pending_block + pending_blocks[FS_VERITY_MAX_PENDING_BLOCKS]; +}; + static struct workqueue_struct *fsverity_read_workqueue; /* @@ -79,7 +104,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, } /* - * Verify a single data block against the file's Merkle tree. + * Verify the hash of a single data block against the file's Merkle tree. * * In principle, we need to verify the entire path to the root node. However, * for efficiency the filesystem may cache the hash blocks. Therefore we need @@ -88,10 +113,11 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, * * Return: %true if the data block is valid, else %false. */ -static bool -verify_data_block(struct inode *inode, struct fsverity_info *vi, - const void *data, u64 data_pos, unsigned long max_ra_pages) +static bool verify_data_block(struct inode *inode, struct fsverity_info *vi, + const struct fsverity_pending_block *dblock, + unsigned long max_ra_pages) { + const u64 data_pos = dblock->pos; const struct merkle_tree_params *params = &vi->tree_params; const unsigned int hsize = params->digest_size; int level; @@ -115,8 +141,12 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, */ u64 hidx = data_pos >> params->log_blocksize; - /* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */ - BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX); + /* + * Up to FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS pages may + * be mapped at once. + */ + static_assert(FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS <= + KM_MAX_IDX); if (unlikely(data_pos >= inode->i_size)) { /* @@ -127,7 +157,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, * any part past EOF should be all zeroes. Therefore, we need * to verify that any data blocks fully past EOF are all zeroes. */ - if (memchr_inv(data, 0, params->block_size)) { + if (memchr_inv(dblock->data, 0, params->block_size)) { fsverity_err(inode, "FILE CORRUPTED! 
Data past EOF is not zeroed"); return false; @@ -202,7 +232,7 @@ descend: unsigned long hblock_idx = hblocks[level - 1].index; unsigned int hoffset = hblocks[level - 1].hoffset; - fsverity_hash_block(params, inode, haddr, real_hash); + fsverity_hash_block(params, haddr, real_hash); if (memcmp(want_hash, real_hash, hsize) != 0) goto corrupted; /* @@ -220,18 +250,18 @@ descend: put_page(hpage); } - /* Finally, verify the data block. */ - fsverity_hash_block(params, inode, data, real_hash); - if (memcmp(want_hash, real_hash, hsize) != 0) + /* Finally, verify the hash of the data block. */ + if (memcmp(want_hash, dblock->real_hash, hsize) != 0) goto corrupted; return true; corrupted: - fsverity_err(inode, - "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", - data_pos, level - 1, - params->hash_alg->name, hsize, want_hash, - params->hash_alg->name, hsize, real_hash); + fsverity_err( + inode, + "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", + data_pos, level - 1, params->hash_alg->name, hsize, want_hash, + params->hash_alg->name, hsize, + level == 0 ? dblock->real_hash : real_hash); error: for (; level > 0; level--) { kunmap_local(hblocks[level - 1].addr); @@ -240,13 +270,73 @@ error: return false; } +static void +fsverity_init_verification_context(struct fsverity_verification_context *ctx, + struct inode *inode, + unsigned long max_ra_pages) +{ + struct fsverity_info *vi = *fsverity_info_addr(inode); + + ctx->inode = inode; + ctx->vi = vi; + ctx->max_ra_pages = max_ra_pages; + ctx->num_pending = 0; + if (vi->tree_params.hash_alg->algo_id == HASH_ALGO_SHA256 && + sha256_finup_2x_is_optimized()) + ctx->max_pending = 2; + else + ctx->max_pending = 1; +} + +static void +fsverity_clear_pending_blocks(struct fsverity_verification_context *ctx) +{ + int i; + + for (i = ctx->num_pending - 1; i >= 0; i--) { + kunmap_local(ctx->pending_blocks[i].data); + ctx->pending_blocks[i].data = NULL; + } + ctx->num_pending = 0; +} + static bool -verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, - unsigned long max_ra_pages) +fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx) { - struct inode *inode = data_folio->mapping->host; - struct fsverity_info *vi = inode->i_verity_info; - const unsigned int block_size = vi->tree_params.block_size; + struct fsverity_info *vi = ctx->vi; + const struct merkle_tree_params *params = &vi->tree_params; + int i; + + if (ctx->num_pending == 2) { + /* num_pending == 2 implies that the algorithm is SHA-256 */ + sha256_finup_2x(params->hashstate ? 
&params->hashstate->sha256 : + NULL, + ctx->pending_blocks[0].data, + ctx->pending_blocks[1].data, params->block_size, + ctx->pending_blocks[0].real_hash, + ctx->pending_blocks[1].real_hash); + } else { + for (i = 0; i < ctx->num_pending; i++) + fsverity_hash_block(params, ctx->pending_blocks[i].data, + ctx->pending_blocks[i].real_hash); + } + + for (i = 0; i < ctx->num_pending; i++) { + if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i], + ctx->max_ra_pages)) + return false; + } + fsverity_clear_pending_blocks(ctx); + return true; +} + +static bool fsverity_add_data_blocks(struct fsverity_verification_context *ctx, + struct folio *data_folio, size_t len, + size_t offset) +{ + struct fsverity_info *vi = ctx->vi; + const struct merkle_tree_params *params = &vi->tree_params; + const unsigned int block_size = params->block_size; u64 pos = (u64)data_folio->index << PAGE_SHIFT; if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size))) @@ -255,14 +345,11 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, folio_test_uptodate(data_folio))) return false; do { - void *data; - bool valid; - - data = kmap_local_folio(data_folio, offset); - valid = verify_data_block(inode, vi, data, pos + offset, - max_ra_pages); - kunmap_local(data); - if (!valid) + ctx->pending_blocks[ctx->num_pending].data = + kmap_local_folio(data_folio, offset); + ctx->pending_blocks[ctx->num_pending].pos = pos + offset; + if (++ctx->num_pending == ctx->max_pending && + !fsverity_verify_pending_blocks(ctx)) return false; offset += block_size; len -= block_size; @@ -284,7 +371,15 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, */ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { - return verify_data_blocks(folio, len, offset, 0); + struct fsverity_verification_context ctx; + + fsverity_init_verification_context(&ctx, folio->mapping->host, 0); + + if (fsverity_add_data_blocks(&ctx, folio, len, offset) && + fsverity_verify_pending_blocks(&ctx)) + return true; + fsverity_clear_pending_blocks(&ctx); + return false; } EXPORT_SYMBOL_GPL(fsverity_verify_blocks); @@ -305,6 +400,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks); */ void fsverity_verify_bio(struct bio *bio) { + struct inode *inode = bio_first_folio_all(bio)->mapping->host; + struct fsverity_verification_context ctx; struct folio_iter fi; unsigned long max_ra_pages = 0; @@ -321,13 +418,21 @@ void fsverity_verify_bio(struct bio *bio) max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2); } + fsverity_init_verification_context(&ctx, inode, max_ra_pages); + bio_for_each_folio_all(fi, bio) { - if (!verify_data_blocks(fi.folio, fi.length, fi.offset, - max_ra_pages)) { - bio->bi_status = BLK_STS_IOERR; - break; - } + if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length, + fi.offset)) + goto ioerr; } + + if (!fsverity_verify_pending_blocks(&ctx)) + goto ioerr; + return; + +ioerr: + fsverity_clear_pending_blocks(&ctx); + bio->bi_status = BLK_STS_IOERR; } EXPORT_SYMBOL_GPL(fsverity_verify_bio); #endif /* CONFIG_BLOCK */ @@ -355,7 +460,7 @@ void __init fsverity_init_workqueue(void) * latency on ARM64. 
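For the interleaved hashing used above, a kernel-style sketch assuming <crypto/sha2.h>; the sha256_finup_2x() signature is inferred from the call in this hunk, and hash_two_blocks() is an illustrative helper, not an fs/verity function:

static void hash_two_blocks(const u8 *block0, const u8 *block1,
			    size_t block_size,
			    u8 d0[SHA256_DIGEST_SIZE],
			    u8 d1[SHA256_DIGEST_SIZE])
{
	if (sha256_finup_2x_is_optimized()) {
		/* One interleaved pass over both equal-length messages;
		 * a NULL state means "standard initial state". */
		sha256_finup_2x(NULL, block0, block1, block_size, d0, d1);
	} else {
		/* Sequential fallback produces the same digests. */
		sha256(block0, block_size, d0);
		sha256(block1, block_size, d1);
	}
}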
*/ fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue", - WQ_HIGHPRI, + WQ_HIGHPRI | WQ_PERCPU, num_online_cpus()); if (!fsverity_read_workqueue) panic("failed to allocate fsverity_read_queue"); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 065953475cf5..8930d5254e1d 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -25,7 +25,7 @@ config XFS_FS config XFS_SUPPORT_V4 bool "Support deprecated V4 (crc=0) format" depends on XFS_FS - default y + default n help The V4 filesystem format lacks certain features that are supported by the V5 format, such as metadata checksumming, strengthened @@ -40,7 +40,7 @@ config XFS_SUPPORT_V4 filesystem is a V4 filesystem. If no such string is found, please upgrade xfsprogs to the latest version and try again. - This option will become default N in September 2025. Support for the + This option became default N in September 2025. Support for the V4 format will be removed entirely in September 2030. Distributors can say N here to withdraw support earlier. @@ -50,7 +50,7 @@ config XFS_SUPPORT_V4 config XFS_SUPPORT_ASCII_CI bool "Support deprecated case-insensitive ascii (ascii-ci=1) format" depends on XFS_FS - default y + default n help The ASCII case insensitivity filesystem feature only works correctly on systems that have been coerced into using ISO 8859-1, and it does @@ -67,7 +67,7 @@ config XFS_SUPPORT_ASCII_CI filesystem is a case-insensitive filesystem. If no such string is found, please upgrade xfsprogs to the latest version and try again. - This option will become default N in September 2025. Support for the + This option became default N in September 2025. Support for the feature will be removed entirely in September 2030. Distributors can say N here to withdraw support earlier. @@ -137,7 +137,7 @@ config XFS_BTREE_IN_MEM config XFS_ONLINE_SCRUB bool "XFS online metadata check support" - default n + default y depends on XFS_FS depends on TMPFS && SHMEM select XFS_LIVE_HOOKS @@ -150,12 +150,8 @@ config XFS_ONLINE_SCRUB advantage here is to look for problems proactively so that they can be dealt with in a controlled manner. - This feature is considered EXPERIMENTAL. Use with caution! - See the xfs_scrub man page in section 8 for additional information. - If unsure, say N. - config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y @@ -171,11 +167,9 @@ config XFS_ONLINE_SCRUB_STATS Usage data are collected in /sys/kernel/debug/xfs/scrub. - If unsure, say N. - config XFS_ONLINE_REPAIR bool "XFS online metadata repair support" - default n + default y depends on XFS_FS && XFS_ONLINE_SCRUB select XFS_BTREE_IN_MEM help @@ -186,12 +180,8 @@ config XFS_ONLINE_REPAIR formatted with secondary metadata, such as reverse mappings and inode parent pointers. - This feature is considered EXPERIMENTAL. Use with caution! - See the xfs_scrub man page in section 8 for additional information. - If unsure, say N. - config XFS_WARN bool "XFS Verbose Warnings" depends on XFS_FS && !XFS_DEBUG diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index fb79215a509d..8ac8230c3d3c 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -92,9 +92,8 @@ xfs_ag_resv_critical( trace_xfs_ag_resv_critical(pag, type, avail); /* Critically low if less than 10% or max btree height remains. 
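The xfs hunks around here all apply one mechanical conversion: XFS_TEST_ERROR() loses its boolean first argument and becomes a pure "inject an error now?" check, with the real predicate written out separately. Schematically, condensed from the xfs_alloc.c AGF read verifier elsewhere in this diff:

	/* Before: condition and injection folded into one macro call. */
	if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);

	/* After: the macro only decides about injection. */
	if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF))
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);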
*/ - return XFS_TEST_ERROR(avail < orig / 10 || - avail < mp->m_agbtree_maxlevels, - mp, XFS_ERRTAG_AG_RESV_CRITICAL); + return avail < orig / 10 || avail < mp->m_agbtree_maxlevels || + XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_CRITICAL); } /* @@ -203,7 +202,7 @@ __xfs_ag_resv_init( return -EINVAL; } - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_FAIL)) error = -ENOSPC; else error = xfs_dec_fdblocks(mp, hidden_space, true); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 000cc7f4a3ce..ad381c73abc4 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -3321,7 +3321,7 @@ xfs_agf_read_verify( xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agf_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } @@ -4019,8 +4019,7 @@ __xfs_free_extent( ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); - if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_FREE_EXTENT)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_free_extent_fix_freelist(tp, pag, &agbp); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index fddb55605e0c..91c1b30ebaab 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -667,12 +667,8 @@ xfs_attr_shortform_bytesfit( /* * For attr2 we can try to move the forkoff if there is space in the - * literal area, but for the old format we are done if there is no - * space in the fixed attribute fork. + * literal area */ - if (!xfs_has_attr2(mp)) - return 0; - dsize = dp->i_df.if_bytes; switch (dp->i_df.if_format) { @@ -723,22 +719,16 @@ xfs_attr_shortform_bytesfit( } /* - * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless: - * - noattr2 mount option is set, - * - on-disk version bit says it is already set, or - * - the attr2 mount option is not set to enable automatic upgrade from attr1. + * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless + * on-disk version bit says it is already set */ STATIC void xfs_sbversion_add_attr2( struct xfs_mount *mp, struct xfs_trans *tp) { - if (xfs_has_noattr2(mp)) - return; if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT) return; - if (!xfs_has_attr2(mp)) - return; spin_lock(&mp->m_sb_lock); xfs_add_attr2(mp); @@ -889,7 +879,7 @@ xfs_attr_sf_removename( /* * Fix up the start offset of the attribute fork */ - if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && + if (totsize == sizeof(struct xfs_attr_sf_hdr) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) && !xfs_has_parent(mp)) { @@ -900,7 +890,6 @@ xfs_attr_sf_removename( ASSERT(dp->i_forkoff); ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || - !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE || xfs_has_parent(mp)); xfs_trans_log_inode(args->trans, dp, @@ -1040,8 +1029,7 @@ xfs_attr_shortform_allfit( bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } - if (xfs_has_attr2(dp->i_mount) && - (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && + if ((dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; return xfs_attr_shortform_bytesfit(dp, bytes); @@ -1161,7 +1149,6 @@ xfs_attr3_leaf_to_shortform( * this case. 
*/ if (!(args->op_flags & XFS_DA_OP_REPLACE)) { - ASSERT(xfs_has_attr2(dp->i_mount)); ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); xfs_attr_fork_remove(dp, args->trans); } @@ -1225,7 +1212,7 @@ xfs_attr3_leaf_to_node( trace_xfs_attr_leaf_to_node(args); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { error = -EIO; goto out; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index d954f9b8071f..53ef4b7e504d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -997,8 +997,7 @@ xfs_bmap_add_attrfork_local( static int xfs_bmap_set_attrforkoff( struct xfs_inode *ip, - int size, - int *version) + int size) { int default_size = xfs_default_attroffset(ip) >> 3; @@ -1012,8 +1011,6 @@ xfs_bmap_set_attrforkoff( ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size); if (!ip->i_forkoff) ip->i_forkoff = default_size; - else if (xfs_has_attr2(ip->i_mount) && version) - *version = 2; break; default: ASSERT(0); @@ -1035,7 +1032,6 @@ xfs_bmap_add_attrfork( int rsvd) /* xact may use reserved blks */ { struct xfs_mount *mp = tp->t_mountp; - int version = 1; /* superblock attr version */ int logflags; /* logging flags */ int error; /* error return value */ @@ -1045,7 +1041,7 @@ xfs_bmap_add_attrfork( ASSERT(!xfs_inode_has_attr_fork(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_bmap_set_attrforkoff(ip, size, &version); + error = xfs_bmap_set_attrforkoff(ip, size); if (error) return error; @@ -1069,16 +1065,12 @@ xfs_bmap_add_attrfork( xfs_trans_log_inode(tp, ip, logflags); if (error) return error; - if (!xfs_has_attr(mp) || - (!xfs_has_attr2(mp) && version == 2)) { + if (!xfs_has_attr(mp)) { bool log_sb = false; spin_lock(&mp->m_sb_lock); if (!xfs_has_attr(mp)) { xfs_add_attr(mp); - log_sb = true; - } - if (!xfs_has_attr2(mp) && version == 2) { xfs_add_attr2(mp); log_sb = true; } @@ -3662,8 +3654,7 @@ xfs_bmap_btalloc( /* Trim the allocation back to the maximum an AG can fit. 
*/ args.maxlen = min(ap->length, mp->m_ag_max_usable); - if (unlikely(XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + if (unlikely(XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) error = xfs_bmap_exact_minlen_extent_alloc(ap, &args); else if ((ap->datatype & XFS_ALLOC_USERDATA) && xfs_inode_is_filestream(ap->ip)) @@ -3849,7 +3840,7 @@ xfs_bmapi_read( } if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4200,7 +4191,7 @@ xfs_bmapi_write( (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4545,7 +4536,7 @@ xfs_bmapi_remap( (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5679,7 +5670,7 @@ xfs_bmap_collapse_extents( int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5795,7 +5786,7 @@ xfs_bmap_insert_extents( int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5900,7 +5891,7 @@ xfs_bmap_split_extent( int i = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -6065,7 +6056,7 @@ xfs_bmap_finish_one( trace_xfs_bmap_deferred(bi); - if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; switch (bi->bi_type) { diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index a61211d253f1..dbe9df8c3300 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -306,7 +306,7 @@ xfs_btree_check_block( fa = __xfs_btree_check_block(cur, block, level, bp); if (XFS_IS_CORRUPT(mp, fa != NULL) || - XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) { + XFS_TEST_ERROR(mp, xfs_btree_block_errtag(cur))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); xfs_btree_mark_sick(cur); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 723a0643b838..90f7fc219fcc 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -565,7 +565,7 @@ xfs_da3_split( trace_xfs_da_split(state->args); - if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) + if (XFS_TEST_ERROR(state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 1775abcfa04d..82a338458a51 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -223,7 +223,7 @@ xfs_dir_ino_validate( bool ino_ok = xfs_verify_dir_ino(mp, ino); if (XFS_IS_CORRUPT(mp, !ino_ok) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { + XFS_TEST_ERROR(mp, 
XFS_ERRTAG_DIR_INO_VALIDATE)) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); return -EFSCORRUPTED; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index a53c5d40e084..de840abc0bcd 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -4,14 +4,22 @@ * Copyright (C) 2017 Oracle. * All Rights Reserved. */ -#ifndef __XFS_ERRORTAG_H_ +#if !defined(__XFS_ERRORTAG_H_) || defined(XFS_ERRTAG) #define __XFS_ERRORTAG_H_ /* - * error injection tags - the labels can be anything you want - * but each tag should have its own unique number + * There are two ways to use this header file. The first way is to #include it + * bare, which will define all the XFS_ERRTAG_* error injection knobs for use + * with the XFS_TEST_ERROR macro. The second way is to enclose the #include + * with a #define for an XFS_ERRTAG macro, in which case the header will define + * an XFS_ERRTAGS macro that expands to invoke that XFS_ERRTAG macro for each + * defined error injection knob. */ +/* + * These are the actual error injection tags. The numbers should be consecutive + * because arrays are sized based on the maximum. + */ #define XFS_ERRTAG_NOERROR 0 #define XFS_ERRTAG_IFLUSH_1 1 #define XFS_ERRTAG_IFLUSH_2 2 @@ -71,49 +79,61 @@ * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. */ #define XFS_RANDOM_DEFAULT 100 -#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) -#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT -#define XFS_RANDOM_FREE_EXTENT 1 -#define XFS_RANDOM_RMAP_FINISH_ONE 1 -#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 -#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 -#define XFS_RANDOM_BMAP_FINISH_ONE 1 -#define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_LOG_BAD_CRC 1 -#define XFS_RANDOM_LOG_ITEM_PIN 1 -#define XFS_RANDOM_BUF_LRU_REF 2 -#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 -#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 -#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT -#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 -#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 -#define XFS_RANDOM_AG_RESV_FAIL 1 -#define XFS_RANDOM_LARP 1 -#define XFS_RANDOM_DA_LEAF_SPLIT 1 -#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 -#define XFS_RANDOM_WB_DELAY_MS 3000 -#define XFS_RANDOM_WRITE_DELAY_MS 3000 -#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 -#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4 + +/* + * Table of error injection knobs. The parameters to the XFS_ERRTAG macro are: + * 1. 
The XFS_ERRTAG_ flag but without the prefix; + * 2. The name of the sysfs knob; and + * 3. The default value for the knob. + */ +#ifdef XFS_ERRTAG +# undef XFS_ERRTAGS +# define XFS_ERRTAGS \ +XFS_ERRTAG(NOERROR, noerror, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_1, iflush1, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_2, iflush2, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_3, iflush3, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_4, iflush4, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_5, iflush5, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_6, iflush6, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(DA_READ_BUF, dareadbuf, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(BTREE_CHECK_LBLOCK, btree_chk_lblk, XFS_RANDOM_DEFAULT/4) \ +XFS_ERRTAG(BTREE_CHECK_SBLOCK, btree_chk_sblk, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(ALLOC_READ_AGF, readagf, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IALLOC_READ_AGI, readagi, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(ITOBP_INOTOBP, itobp, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IUNLINK, iunlink, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IUNLINK_REMOVE, iunlinkrm, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(DIR_INO_VALIDATE, dirinovalid, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(BULKSTAT_READ_CHUNK, bulkstat, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IODONE_IOERR, logiodone, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(STRATREAD_IOERR, stratread, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(STRATCMPL_IOERR, stratcmpl, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(DIOWRITE_IOERR, diowrite, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(BMAPIFORMAT, bmapifmt, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(FREE_EXTENT, free_extent, 1) \ +XFS_ERRTAG(RMAP_FINISH_ONE, rmap_finish_one, 1) \ +XFS_ERRTAG(REFCOUNT_CONTINUE_UPDATE, refcount_continue_update, 1) \ +XFS_ERRTAG(REFCOUNT_FINISH_ONE, refcount_finish_one, 1) \ +XFS_ERRTAG(BMAP_FINISH_ONE, bmap_finish_one, 1) \ +XFS_ERRTAG(AG_RESV_CRITICAL, ag_resv_critical, 4) \ +XFS_ERRTAG(LOG_BAD_CRC, log_bad_crc, 1) \ +XFS_ERRTAG(LOG_ITEM_PIN, log_item_pin, 1) \ +XFS_ERRTAG(BUF_LRU_REF, buf_lru_ref, 2) \ +XFS_ERRTAG(FORCE_SCRUB_REPAIR, force_repair, 1) \ +XFS_ERRTAG(FORCE_SUMMARY_RECALC, bad_summary, 1) \ +XFS_ERRTAG(IUNLINK_FALLBACK, iunlink_fallback, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(BUF_IOERROR, buf_ioerror, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(REDUCE_MAX_IEXTENTS, reduce_max_iextents, 1) \ +XFS_ERRTAG(BMAP_ALLOC_MINLEN_EXTENT, bmap_alloc_minlen_extent, 1) \ +XFS_ERRTAG(AG_RESV_FAIL, ag_resv_fail, 1) \ +XFS_ERRTAG(LARP, larp, 1) \ +XFS_ERRTAG(DA_LEAF_SPLIT, da_leaf_split, 1) \ +XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \ +XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \ +XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \ +XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \ +XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) +#endif /* XFS_ERRTAG */ #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c index 3f1d6a98c118..932ee4619e9e 100644 --- a/fs/xfs/libxfs/xfs_exchmaps.c +++ b/fs/xfs/libxfs/xfs_exchmaps.c @@ -616,7 +616,7 @@ xfs_exchmaps_finish_one( return error; } - if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) + if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) return -EIO; /* If we still have work to do, ask for a new transaction. 
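An aside on the errortag rework above: XFS_TEST_ERROR() loses its boolean first argument, so predicates that used to be folded into the macro are now written out as `cond || XFS_TEST_ERROR(mp, tag)`, and the per-tag XFS_RANDOM_* defaults move into the X-macro table. A minimal, self-contained userspace sketch of that X-macro pattern follows; the three-entry list and the consumer code are illustrative stand-ins, not the kernel's actual sysfs wiring:

#include <stdio.h>

/*
 * Stand-in for the generated list: a real consumer would instead
 * #define XFS_ERRTAG and #include "xfs_errortag.h" to get XFS_ERRTAGS.
 */
#define XFS_ERRTAGS \
XFS_ERRTAG(NOERROR,     noerror,     100) \
XFS_ERRTAG(IFLUSH_1,    iflush1,     100) \
XFS_ERRTAG(BUF_LRU_REF, buf_lru_ref, 2)

/* Expansion 1: consecutive tag numbers, so arrays can be sized by the max. */
enum {
#define XFS_ERRTAG(tag, name, dflt)	XFS_ERRTAG_##tag,
XFS_ERRTAGS
#undef XFS_ERRTAG
	XFS_ERRTAG_MAX,
};

/* Expansion 2: knob names, indexed by tag number. */
static const char *errtag_names[] = {
#define XFS_ERRTAG(tag, name, dflt)	#name,
XFS_ERRTAGS
#undef XFS_ERRTAG
};

/* Expansion 3: default random factors (a 1-in-N chance), indexed by tag. */
static const unsigned int errtag_defaults[] = {
#define XFS_ERRTAG(tag, name, dflt)	dflt,
XFS_ERRTAGS
#undef XFS_ERRTAG
};

int main(void)
{
	for (int i = 0; i < XFS_ERRTAG_MAX; i++)
		printf("%-12s fires 1/%u\n", errtag_names[i],
				errtag_defaults[i]);
	return 0;
}

The kernel header inverts this arrangement: the table lives in the header behind a #define of XFS_ERRTAG, and each consumer supplies its own XFS_ERRTAG() definition before including it.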
*/ @@ -882,7 +882,7 @@ xmi_ensure_delta_nextents( &new_nextents)) return -EFBIG; - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && new_nextents > 10) return -EFBIG; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 750111634d9f..d97295eaebe6 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2140,7 +2140,7 @@ xfs_difree_inobt( * remove the chunk if the block size is large enough for multiple inode * chunks (that might not be free). */ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { xic->deleted = true; xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino); @@ -2286,7 +2286,7 @@ xfs_difree_finobt( * enough for multiple chunks. Leave the finobt record to remain in sync * with the inobt. */ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { error = xfs_btree_delete(cur, &i); if (error) @@ -2706,7 +2706,7 @@ xfs_agi_read_verify( xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agi_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_IALLOC_READ_AGI)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index aa13fc00afd7..b1812b2c3cce 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -61,8 +61,8 @@ xfs_inode_buf_verify( di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && xfs_verify_agino_or_null(bp->b_pag, unlinked_ino); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP))) { + if (unlikely(!di_ok || + XFS_TEST_ERROR(mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { bp->b_flags &= ~XBF_DONE; xfs_buf_ioerror(bp, -EIO); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 4f99b90add55..1772d82f2d68 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -756,8 +756,7 @@ xfs_iext_count_extend( if (nr_exts < ifp->if_nextents) return -EFBIG; - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && - nr_exts > 10) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && nr_exts > 10) return -EFBIG; if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) { diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index 48fe49a5f050..309ce6dd5553 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -299,17 +299,6 @@ xfs_inode_init( } else { inode_init_owner(args->idmap, inode, dir, args->mode); } - - /* - * If the group ID of the new file does not match the effective - * group ID or one of the supplementary group IDs, the S_ISGID - * bit is cleared (and only if the irix_sgid_inherit - * compatibility variable is set). 
- */
-	if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
-	    !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode)))
-		inode->i_mode &= ~S_ISGID;
-
	ip->i_projid = xfs_get_initial_prid(pip);
}
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 0d637c276db0..6c50cb2ece19 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -86,43 +86,6 @@ struct xfs_unmount_log_format {
	uint32_t pad2; /* may as well make it 64 bits */
};

-/* Region types for iovec's i_type */
-#define XLOG_REG_TYPE_BFORMAT 1
-#define XLOG_REG_TYPE_BCHUNK 2
-#define XLOG_REG_TYPE_EFI_FORMAT 3
-#define XLOG_REG_TYPE_EFD_FORMAT 4
-#define XLOG_REG_TYPE_IFORMAT 5
-#define XLOG_REG_TYPE_ICORE 6
-#define XLOG_REG_TYPE_IEXT 7
-#define XLOG_REG_TYPE_IBROOT 8
-#define XLOG_REG_TYPE_ILOCAL 9
-#define XLOG_REG_TYPE_IATTR_EXT 10
-#define XLOG_REG_TYPE_IATTR_BROOT 11
-#define XLOG_REG_TYPE_IATTR_LOCAL 12
-#define XLOG_REG_TYPE_QFORMAT 13
-#define XLOG_REG_TYPE_DQUOT 14
-#define XLOG_REG_TYPE_QUOTAOFF 15
-#define XLOG_REG_TYPE_LRHEADER 16
-#define XLOG_REG_TYPE_UNMOUNT 17
-#define XLOG_REG_TYPE_COMMIT 18
-#define XLOG_REG_TYPE_TRANSHDR 19
-#define XLOG_REG_TYPE_ICREATE 20
-#define XLOG_REG_TYPE_RUI_FORMAT 21
-#define XLOG_REG_TYPE_RUD_FORMAT 22
-#define XLOG_REG_TYPE_CUI_FORMAT 23
-#define XLOG_REG_TYPE_CUD_FORMAT 24
-#define XLOG_REG_TYPE_BUI_FORMAT 25
-#define XLOG_REG_TYPE_BUD_FORMAT 26
-#define XLOG_REG_TYPE_ATTRI_FORMAT 27
-#define XLOG_REG_TYPE_ATTRD_FORMAT 28
-#define XLOG_REG_TYPE_ATTR_NAME 29
-#define XLOG_REG_TYPE_ATTR_VALUE 30
-#define XLOG_REG_TYPE_XMI_FORMAT 31
-#define XLOG_REG_TYPE_XMD_FORMAT 32
-#define XLOG_REG_TYPE_ATTR_NEWNAME 33
-#define XLOG_REG_TYPE_ATTR_NEWVALUE 34
-#define XLOG_REG_TYPE_MAX 34
-
/*
 * Flags to log operation header
 *
@@ -141,14 +104,13 @@ struct xfs_unmount_log_format {
#define XLOG_END_TRANS 0x10 /* End a continued transaction */
#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */

-
-typedef struct xlog_op_header {
+struct xlog_op_header {
	__be32 oh_tid; /* transaction id of operation : 4 b */
	__be32 oh_len; /* bytes in data region : 4 b */
	__u8 oh_clientid; /* who sent me this : 1 b */
	__u8 oh_flags; /* : 1 b */
	__u16 oh_res2; /* 32 bit align : 2 b */
-} xlog_op_header_t;
+};

/* valid values for h_fmt */
#define XLOG_FMT_UNKNOWN 0
@@ -174,12 +136,40 @@ typedef struct xlog_rec_header {
	__be32 h_prev_block; /* block number to previous LR : 4 */
	__be32 h_num_logops; /* number of log operations in this LR : 4 */
	__be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
-	/* new fields */
+
+	/* fields added by the Linux port: */
	__be32 h_fmt; /* format of log record : 4 */
	uuid_t h_fs_uuid; /* uuid of FS : 16 */
+
+	/* fields added for log v2: */
	__be32 h_size; /* iclog size : 4 */
+
+	/*
+	 * When h_size was added for log v2 support, it caused the structure
+	 * to have a different size on i386 vs all other architectures, because
+	 * the sum of the sizes of the members is not aligned to that of the
+	 * largest __be64-sized member, and i386 has really odd struct
+	 * alignment rules.
+	 *
+	 * Due to the way the log headers are laid out on disk, that alone is
+	 * not a problem, because the xlog_rec_header always sits alone in a
+	 * BBSIZE-sized area, and the rest of that area is padded with zeroes.
+	 * But xlog_cksum used to calculate the checksum based on the structure
+	 * size, and thus gives different checksums for i386 vs the rest.
+	 * We now do two checksum validation passes for both sizes to allow
+	 * moving v5 file systems with unclean logs between i386 and other
+	 * (little-endian) architectures.
+	 */
+	__u32 h_pad0;
} xlog_rec_header_t;

+#ifdef __i386__
+#define XLOG_REC_SIZE	offsetofend(struct xlog_rec_header, h_size)
+#define XLOG_REC_SIZE_OTHER	sizeof(struct xlog_rec_header)
+#else
+#define XLOG_REC_SIZE	sizeof(struct xlog_rec_header)
+#define XLOG_REC_SIZE_OTHER	offsetofend(struct xlog_rec_header, h_size)
+#endif /* __i386__ */
+
typedef struct xlog_rec_ext_header {
	__be32 xh_cycle; /* write cycle of log : 4 */
	__be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
@@ -195,12 +185,11 @@ typedef union xlog_in_core2 {
} xlog_in_core_2_t;

/* not an on-disk structure, but needed by log recovery in userspace */
-typedef struct xfs_log_iovec {
+struct xfs_log_iovec {
	void *i_addr; /* beginning address of region */
	int i_len; /* length in bytes of region */
	uint i_type; /* type of region */
-} xfs_log_iovec_t;
-
+};

/*
 * Transaction Header definitions.
@@ -213,12 +202,12 @@ typedef struct xfs_log_iovec {
 * Do not change the below structure without redoing the code in
 * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
 */
-typedef struct xfs_trans_header {
+struct xfs_trans_header {
	uint th_magic; /* magic number */
	uint th_type; /* transaction type */
	int32_t th_tid; /* transaction id (unused) */
	uint th_num_items; /* num items logged by trans */
-} xfs_trans_header_t;
+};

#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */

@@ -542,7 +531,7 @@ struct xfs_log_dinode {
#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1)

-typedef struct xfs_buf_log_format {
+struct xfs_buf_log_format {
	unsigned short blf_type; /* buf log item type indicator */
	unsigned short blf_size; /* size of this item */
	unsigned short blf_flags; /* misc state */
@@ -550,7 +539,7 @@ typedef struct xfs_buf_log_format {
	int64_t blf_blkno; /* starting blkno of this buf */
	unsigned int blf_map_size; /* used size of data bitmap in words */
	unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
-} xfs_buf_log_format_t;
+};

/*
 * All buffers now need to tell recovery where the magic number
@@ -606,40 +595,41 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
/*
 * EFI/EFD log format definitions
 */
-typedef struct xfs_extent {
+struct xfs_extent {
	xfs_fsblock_t ext_start;
	xfs_extlen_t ext_len;
-} xfs_extent_t;
+};

/*
- * Since an xfs_extent_t has types (start:64, len: 32)
- * there are different alignments on 32 bit and 64 bit kernels.
- * So we provide the different variants for use by a
- * conversion routine.
+ * Since the members of struct xfs_extent add up to 96 bits, the structure
+ * has different alignments on i386 vs all other architectures, because i386
+ * does not pad structures to their natural alignment.
+ *
+ * Provide the different variants for use by a conversion routine.
 */
-typedef struct xfs_extent_32 {
+struct xfs_extent_32 {
	uint64_t ext_start;
	uint32_t ext_len;
-} __attribute__((packed)) xfs_extent_32_t;
+} __attribute__((packed));

-typedef struct xfs_extent_64 {
+struct xfs_extent_64 {
	uint64_t ext_start;
	uint32_t ext_len;
	uint32_t ext_pad;
-} xfs_extent_64_t;
+};

/*
 * This is the structure used to lay out an efi log item in the
 * log. The efi_extents field is a variable size array whose
 * size is given by efi_nextents.
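The practical consequence of the XLOG_REC_SIZE / XLOG_REC_SIZE_OTHER pair is that checksum verification has to try the native structure size first and then fall back to the size the other architectures would have written. A hedged sketch of that shape, in which xlog_cksum_over() is a hypothetical helper that checksums the record over a given header length (this is not the patch's actual validation code):

static bool
xlog_rec_cksum_matches(
	struct xlog		*log,
	struct xlog_rec_header	*rhead,
	char			*dp,
	int			size)
{
	/* native structure size first... */
	if (xlog_cksum_over(log, rhead, dp, size, XLOG_REC_SIZE) ==
	    rhead->h_crc)
		return true;

	/* ...then the size the other architecture would have used */
	return xlog_cksum_over(log, rhead, dp, size, XLOG_REC_SIZE_OTHER) ==
	       rhead->h_crc;
}

Only an unclean log moved between i386 and another little-endian machine should ever take the second pass.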
*/ -typedef struct xfs_efi_log_format { +struct xfs_efi_log_format { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[]; /* array of extents to free */ -} xfs_efi_log_format_t; + struct xfs_extent efi_extents[]; /* array of extents to free */ +}; static inline size_t xfs_efi_log_format_sizeof( @@ -649,13 +639,13 @@ xfs_efi_log_format_sizeof( nr * sizeof(struct xfs_extent); } -typedef struct xfs_efi_log_format_32 { +struct xfs_efi_log_format_32 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[]; /* array of extents to free */ -} __attribute__((packed)) xfs_efi_log_format_32_t; + struct xfs_extent_32 efi_extents[]; /* array of extents to free */ +} __attribute__((packed)); static inline size_t xfs_efi_log_format32_sizeof( @@ -665,13 +655,13 @@ xfs_efi_log_format32_sizeof( nr * sizeof(struct xfs_extent_32); } -typedef struct xfs_efi_log_format_64 { +struct xfs_efi_log_format_64 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[]; /* array of extents to free */ -} xfs_efi_log_format_64_t; + struct xfs_extent_64 efi_extents[]; /* array of extents to free */ +}; static inline size_t xfs_efi_log_format64_sizeof( @@ -686,13 +676,13 @@ xfs_efi_log_format64_sizeof( * log. The efd_extents array is a variable size array whose * size is given by efd_nextents; */ -typedef struct xfs_efd_log_format { +struct xfs_efd_log_format { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[]; /* array of extents freed */ -} xfs_efd_log_format_t; + struct xfs_extent efd_extents[]; /* array of extents freed */ +}; static inline size_t xfs_efd_log_format_sizeof( @@ -702,13 +692,13 @@ xfs_efd_log_format_sizeof( nr * sizeof(struct xfs_extent); } -typedef struct xfs_efd_log_format_32 { +struct xfs_efd_log_format_32 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[]; /* array of extents freed */ -} __attribute__((packed)) xfs_efd_log_format_32_t; + struct xfs_extent_32 efd_extents[]; /* array of extents freed */ +} __attribute__((packed)); static inline size_t xfs_efd_log_format32_sizeof( @@ -718,13 +708,13 @@ xfs_efd_log_format32_sizeof( nr * sizeof(struct xfs_extent_32); } -typedef struct xfs_efd_log_format_64 { +struct xfs_efd_log_format_64 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[]; /* array of extents freed */ -} xfs_efd_log_format_64_t; + struct xfs_extent_64 efd_extents[]; /* array of extents freed */ +}; static inline size_t xfs_efd_log_format64_sizeof( @@ -957,14 +947,14 @@ struct xfs_xmd_log_format { * The first two fields must be the type and size fitting into * 32 bits : log_recovery code assumes that. 
*/ -typedef struct xfs_dq_logformat { +struct xfs_dq_logformat { uint16_t qlf_type; /* dquot log item type */ uint16_t qlf_size; /* size of this item */ xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ int64_t qlf_blkno; /* blkno of dquot buffer */ int32_t qlf_len; /* len of dquot buffer */ uint32_t qlf_boffset; /* off of dquot in buffer */ -} xfs_dq_logformat_t; +}; /* * log format struct for QUOTAOFF records. @@ -974,12 +964,12 @@ typedef struct xfs_dq_logformat { * to the first and ensures that the first logitem is taken out of the AIL * only when the last one is securely committed. */ -typedef struct xfs_qoff_logformat { +struct xfs_qoff_logformat { unsigned short qf_type; /* quotaoff log item type */ unsigned short qf_size; /* size of this item */ unsigned int qf_flags; /* USR and/or GRP */ char qf_pad[12]; /* padding for future */ -} xfs_qoff_logformat_t; +}; /* * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 95de23095030..9e712e62369c 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -111,7 +111,7 @@ struct xlog_recover_item { struct xlog_recover { struct hlist_node r_list; xlog_tid_t r_log_tid; /* log's transaction id */ - xfs_trans_header_t r_theader; /* trans header for partial */ + struct xfs_trans_header r_theader; /* trans header for partial */ int r_state; /* not needed */ xfs_lsn_t r_lsn; /* xact lsn */ struct list_head r_itemq; /* q for items */ diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index 225923e463c4..b02e3d6c0868 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -121,7 +121,7 @@ xfs_metafile_resv_critical( div_u64(mp->m_metafile_resv_target, 10))) return true; - return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); + return XFS_TEST_ERROR(mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); } /* Allocate a block from the metadata file's reservation. */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 5ed44fdf7491..7bfa3242e2c5 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -174,6 +174,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 328); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 260); XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 897784037483..2484dc9f6d7e 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1113,8 +1113,7 @@ xfs_refcount_still_have_space( * refcount continue update "error" has been injected. 
*/ if (cur->bc_refc.nr_ops > 2 && - XFS_TEST_ERROR(false, cur->bc_mp, - XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) + XFS_TEST_ERROR(cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; if (cur->bc_refc.nr_ops == 0) @@ -1398,7 +1397,7 @@ xfs_refcount_finish_one( trace_xfs_refcount_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* @@ -1511,7 +1510,7 @@ xfs_rtrefcount_finish_one( trace_xfs_refcount_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 3cdf50563fec..83e0488ff773 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2690,7 +2690,7 @@ xfs_rmap_finish_one( trace_xfs_rmap_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_RMAP_FINISH_ONE)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5057536e586c..618061d898d4 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1067,7 +1067,7 @@ xfs_rtfree_extent( ASSERT(rbmip->i_itemp != NULL); xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_rtcheck_alloc_range(&args, start, len); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 711e180f9ebb..cdd16dd805d7 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -142,8 +142,6 @@ xfs_sb_version_to_features( if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) { if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT) features |= XFS_FEAT_LAZYSBCOUNT; - if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT) - features |= XFS_FEAT_ATTR2; if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT) features |= XFS_FEAT_PROJID32; if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE) @@ -155,7 +153,7 @@ xfs_sb_version_to_features( /* Always on V5 features */ features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG | - XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 | + XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_PROJID32 | XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO; /* Optional V5 features */ @@ -1524,7 +1522,8 @@ xfs_fs_geometry( geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | XFS_FSOP_GEOM_FLAGS_DIRV2 | - XFS_FSOP_GEOM_FLAGS_EXTFLG; + XFS_FSOP_GEOM_FLAGS_EXTFLG | + XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_attr(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; if (xfs_has_quota(mp)) @@ -1537,8 +1536,6 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; if (xfs_has_lazysbcount(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; - if (xfs_has_attr2(mp)) - geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_projid32(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; if (xfs_has_crc(mp)) diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h index c4f1367b2cca..5fefd132e002 100644 --- a/fs/xfs/libxfs/xfs_zones.h +++ b/fs/xfs/libxfs/xfs_zones.h @@ -29,6 +29,13 @@ struct xfs_rtgroup; #define XFS_OPEN_GC_ZONES 1U #define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U) +/* + * For zoned devices that do not have a limit on the number of open zones, and + * for regular devices using the zoned allocator, use the most common SMR disks + * limit (128) as the default limit on the number of open 
zones.
+ */
+#define XFS_DEFAULT_MAX_OPEN_ZONES	128
+
bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
		xfs_rgblock_t *write_pointer);
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
index 38a246b8bf11..b2a83801412e 100644
--- a/fs/xfs/scrub/cow_repair.c
+++ b/fs/xfs/scrub/cow_repair.c
@@ -300,7 +300,7 @@ xrep_cow_find_bad(
	 * on the debugging knob, replace everything in the CoW fork.
	 */
	if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
-	    XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+	    XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
		error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
				xc->irec.br_blockcount);
		if (error)
@@ -385,7 +385,7 @@ xrep_cow_find_bad_rt(
	 * CoW fork and then scan for staging extents in the refcountbt.
	 */
	if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
-	    XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+	    XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
		error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
				xc->irec.br_blockcount);
		if (error)
diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
index 14939d7de349..378ec7c8d38e 100644
--- a/fs/xfs/scrub/metapath.c
+++ b/fs/xfs/scrub/metapath.c
@@ -79,7 +79,7 @@ xchk_metapath_cleanup(
	if (mpath->dp_ilock_flags)
		xfs_iunlock(mpath->dp, mpath->dp_ilock_flags);

-	kfree(mpath->path);
+	kfree_const(mpath->path);
}

/* Set up a metadir path scan. @path must be dynamically allocated. */
@@ -98,13 +98,13 @@ xchk_setup_metapath_scan(
	error = xchk_install_live_inode(sc, ip);
	if (error) {
-		kfree(path);
+		kfree_const(path);
		return error;
	}

	mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS);
	if (!mpath) {
-		kfree(path);
+		kfree_const(path);
		return -ENOMEM;
	}
@@ -132,7 +132,7 @@ xchk_setup_metapath_rtdir(
		return -ENOENT;

	return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
-			kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip);
+			kstrdup_const("rtgroups", GFP_KERNEL), sc->mp->m_rtdirip);
}

/* Scan a rtgroup inode under the /rtgroups directory. */
@@ -179,7 +179,7 @@ xchk_setup_metapath_quotadir(
		return -ENOENT;

	return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
-			kstrdup("quota", GFP_KERNEL), qi->qi_dirip);
+			kstrdup_const("quota", GFP_KERNEL), qi->qi_dirip);
}

/* Scan a quota inode under the /quota directory. */
@@ -212,7 +212,7 @@ xchk_setup_metapath_dqinode(
		return -ENOENT;

	return xchk_setup_metapath_scan(sc, qi->qi_dirip,
-			kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip);
+			kstrdup_const(xfs_dqinode_path(type), GFP_KERNEL), ip);
}
#else
# define xchk_setup_metapath_quotadir(...) (-ENOENT)
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index 1588ce971cb8..951ae8b71566 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -28,6 +28,15 @@
#include "scrub/newbt.h"

/*
+ * This is the maximum number of deferred extent free intents (EFIs) that
+ * we'll attach to a transaction without rolling the transaction, to avoid
+ * overrunning a tr_itruncate reservation. The newbt code should reserve
+ * exactly the correct number of blocks to rebuild the btree, so there should
+ * not be any excess blocks to free when committing a new btree.
+ */
+#define XREP_MAX_ITRUNCATE_EFIS	(128)
+
+/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 8703897c0a9c..07f5bb8a6421 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -36,6 +36,12 @@
#include "xfs_metafile.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
+#include "xfs_extfree_item.h"
+#include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_buf_item.h"
+#include "xfs_bmap_item.h"
+#include "xfs_bmap_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -91,21 +97,33 @@ struct xreap_state {
	struct xfs_scrub *sc;

-	/* Reverse mapping owner and metadata reservation type. */
-	const struct xfs_owner_info *oinfo;
-	enum xfs_ag_resv_type resv;
+	union {
+		struct {
+			/*
+			 * For AG blocks, this is the reverse mapping owner
+			 * and metadata reservation type.
+			 */
+			const struct xfs_owner_info *oinfo;
+			enum xfs_ag_resv_type resv;
+		};
+		struct {
+			/* For file blocks, this is the inode and fork. */
+			struct xfs_inode *ip;
+			int whichfork;
+		};
+	};

-	/* If true, roll the transaction before reaping the next extent. */
-	bool force_roll;
+	/* Number of invalidated buffers logged to the current transaction. */
+	unsigned int nr_binval;

-	/* Number of deferred reaps attached to the current transaction. */
-	unsigned int deferred;
+	/* Maximum number of buffers we can invalidate in a single tx. */
+	unsigned int max_binval;

-	/* Number of invalidated buffers logged to the current transaction. */
-	unsigned int invalidated;
+	/* Number of deferred reaps attached to the current transaction. */
+	unsigned int nr_deferred;

-	/* Number of deferred reaps queued during the whole reap sequence. */
-	unsigned long long total_deferred;
+	/* Maximum number of intents we can reap in a single transaction. */
+	unsigned int max_deferred;
};

/* Put a block back on the AGFL. */
@@ -148,71 +166,79 @@ xreap_put_freelist(
}

/* Are there any uncommitted reap operations? */
-static inline bool xreap_dirty(const struct xreap_state *rs)
+static inline bool xreap_is_dirty(const struct xreap_state *rs)
{
-	if (rs->force_roll)
-		return true;
-	if (rs->deferred)
-		return true;
-	if (rs->invalidated)
-		return true;
-	if (rs->total_deferred)
-		return true;
-	return false;
+	return rs->nr_binval > 0 || rs->nr_deferred > 0;
}

-#define XREAP_MAX_BINVAL	(2048)
-
/*
- * Decide if we want to roll the transaction after reaping an extent. We don't
- * want to overrun the transaction reservation, so we prohibit more than
- * 128 EFIs per transaction. For the same reason, we limit the number
- * of buffer invalidations to 2048.
+ * Decide if we need to roll the transaction to clear out the log
+ * reservation that we allocated to buffer invalidations.
 */
-static inline bool xreap_want_roll(const struct xreap_state *rs)
+static inline bool xreap_want_binval_roll(const struct xreap_state *rs)
{
-	if (rs->force_roll)
-		return true;
-	if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
-		return true;
-	if (rs->invalidated > XREAP_MAX_BINVAL)
-		return true;
-	return false;
+	return rs->nr_binval >= rs->max_binval;
}

-static inline void xreap_reset(struct xreap_state *rs)
+/* Reset the buffer invalidation count after rolling.
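With the old force_roll/deferred/invalidated/total_deferred state collapsed into two counter pairs, every reap walker below follows the same two-trigger loop. The following is a condensed paraphrase of the control flow of xreap_agmeta_extent() and friends as modified by this patch, shown for orientation rather than as new code:

	/* reap one extent, then decide how to make room in the transaction */
	error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
	if (error)
		return error;

	if (xreap_want_defer_finish(rs)) {
		/* intent chain is full: finish all deferred items */
		error = xrep_defer_finish(sc);
		if (error)
			return error;
		xreap_defer_finish_reset(rs);
	} else if (xreap_want_binval_roll(rs)) {
		/* reservation consumed by buffer invalidations: roll */
		error = xrep_roll_ag_trans(sc);
		if (error)
			return error;
		xreap_binval_reset(rs);
	}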
*/ +static inline void xreap_binval_reset(struct xreap_state *rs) { - rs->total_deferred += rs->deferred; - rs->deferred = 0; - rs->invalidated = 0; - rs->force_roll = false; + rs->nr_binval = 0; } -#define XREAP_MAX_DEFER_CHAIN (2048) +/* + * Bump the number of invalidated buffers, and return true if we can continue, + * or false if we need to roll the transaction. + */ +static inline bool xreap_inc_binval(struct xreap_state *rs) +{ + rs->nr_binval++; + return rs->nr_binval < rs->max_binval; +} /* * Decide if we want to finish the deferred ops that are attached to the scrub * transaction. We don't want to queue huge chains of deferred ops because * that can consume a lot of log space and kernel memory. Hence we trigger a - * xfs_defer_finish if there are more than 2048 deferred reap operations or the - * caller did some real work. + * xfs_defer_finish if there are too many deferred reap operations or we've run + * out of space for invalidations. */ -static inline bool -xreap_want_defer_finish(const struct xreap_state *rs) +static inline bool xreap_want_defer_finish(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN) - return true; - return false; + return rs->nr_deferred >= rs->max_deferred; } +/* + * Reset the defer chain length and buffer invalidation count after finishing + * items. + */ static inline void xreap_defer_finish_reset(struct xreap_state *rs) { - rs->total_deferred = 0; - rs->deferred = 0; - rs->invalidated = 0; - rs->force_roll = false; + rs->nr_deferred = 0; + rs->nr_binval = 0; +} + +/* + * Bump the number of deferred extent reaps. + */ +static inline void xreap_inc_defer(struct xreap_state *rs) +{ + rs->nr_deferred++; +} + +/* Force the caller to finish a deferred item chain. */ +static inline void xreap_force_defer_finish(struct xreap_state *rs) +{ + rs->nr_deferred = rs->max_deferred; +} + +/* Maximum number of fsblocks that we might find in a buffer to invalidate. */ +static inline unsigned int +xrep_binval_max_fsblocks( + struct xfs_mount *mp) +{ + /* Remote xattr values are the largest buffers that we support. */ + return xfs_attr3_max_rmt_blocks(mp); } /* @@ -224,12 +250,8 @@ xrep_bufscan_max_sectors( struct xfs_mount *mp, xfs_extlen_t fsblocks) { - int max_fsbs; - - /* Remote xattr values are the largest buffers that we support. */ - max_fsbs = xfs_attr3_max_rmt_blocks(mp); - - return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs)); + return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, + xrep_binval_max_fsblocks(mp))); } /* @@ -297,14 +319,13 @@ xreap_agextent_binval( while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { xfs_trans_bjoin(sc->tp, bp); xfs_trans_binval(sc->tp, bp); - rs->invalidated++; /* * Stop invalidating if we've hit the limit; we should * still have enough reservation left to free however * far we've gotten. */ - if (rs->invalidated > XREAP_MAX_BINVAL) { + if (!xreap_inc_binval(rs)) { *aglenp -= agbno_next - bno; goto out; } @@ -416,21 +437,23 @@ xreap_agextent_iter( trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno, *aglenp); - rs->force_roll = true; - if (rs->oinfo == &XFS_RMAP_OINFO_COW) { /* - * If we're unmapping CoW staging extents, remove the + * t0: Unmapping CoW staging extents, remove the * records from the refcountbt, which will remove the * rmap record as well. 
*/ xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); + xreap_inc_defer(rs); return 0; } - return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, - *aglenp, rs->oinfo); + /* t1: unmap crosslinked metadata blocks */ + xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp, + rs->oinfo->oi_owner); + xreap_inc_defer(rs); + return 0; } trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp); @@ -443,12 +466,12 @@ xreap_agextent_iter( */ xreap_agextent_binval(rs, agbno, aglenp); if (*aglenp == 0) { - ASSERT(xreap_want_roll(rs)); + ASSERT(xreap_want_binval_roll(rs)); return 0; } /* - * If we're getting rid of CoW staging extents, use deferred work items + * t2: To get rid of CoW staging extents, use deferred work items * to remove the refcountbt records (which removes the rmap records) * and free the extent. We're not worried about the system going down * here because log recovery walks the refcount btree to clean out the @@ -463,23 +486,23 @@ xreap_agextent_iter( if (error) return error; - rs->force_roll = true; + xreap_inc_defer(rs); return 0; } - /* Put blocks back on the AGFL one at a time. */ + /* t3: Put blocks back on the AGFL one at a time. */ if (rs->resv == XFS_AG_RESV_AGFL) { ASSERT(*aglenp == 1); error = xreap_put_freelist(sc, agbno); if (error) return error; - rs->force_roll = true; + xreap_force_defer_finish(rs); return 0; } /* - * Use deferred frees to get rid of the old btree blocks to try to + * t4: Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. * Add a defer ops barrier every other extent to avoid stressing the * system with large EFIs. @@ -489,12 +512,194 @@ xreap_agextent_iter( if (error) return error; - rs->deferred++; - if (rs->deferred % 2 == 0) + xreap_inc_defer(rs); + if (rs->nr_deferred % 2 == 0) xfs_defer_add_barrier(sc->tp); return 0; } +/* Configure the deferral and invalidation limits */ +static inline void +xreap_configure_limits( + struct xreap_state *rs, + unsigned int fixed_overhead, + unsigned int variable_overhead, + unsigned int per_intent, + unsigned int per_binval) +{ + struct xfs_scrub *sc = rs->sc; + unsigned int res = sc->tp->t_log_res - fixed_overhead; + + /* Don't underflow the reservation */ + if (sc->tp->t_log_res < (fixed_overhead + variable_overhead)) { + ASSERT(sc->tp->t_log_res >= + (fixed_overhead + variable_overhead)); + xfs_force_shutdown(sc->mp, SHUTDOWN_CORRUPT_INCORE); + return; + } + + rs->max_deferred = per_intent ? res / variable_overhead : 0; + res -= rs->max_deferred * per_intent; + rs->max_binval = per_binval ? res / per_binval : 0; +} + +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single per-AG space extent. This is not for freeing CoW + * staging extents. + */ +STATIC void +xreap_configure_agextent_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + + /* + * Various things can happen when reaping non-CoW metadata blocks: + * + * t1: Unmapping crosslinked metadata blocks: deferred removal of rmap + * record. 
 *
+ * t3: Freeing to AGFL: roll and finish deferred items for every block.
+ * Limits here do not matter.
+ *
+ * t4: Freeing metadata blocks: deferred freeing of the space, which
+ * also removes the rmap record.
+ *
+ * For simplicity, we'll use the worst-case intents size to determine
+ * the maximum number of deferred extents before we have to finish the
+ * whole chain. If we're trying to reap a btree larger than this size,
+ * a crash midway through reaping can result in leaked blocks.
+ */
+	const unsigned int t1 = rui;
+	const unsigned int t4 = rui + efi;
+	const unsigned int per_intent = max(t1, t4);
+
+	/*
+	 * For each transaction in a reap chain, we must be able to take one
+	 * step in the defer item chain, which should only consist of EFI or
+	 * RUI items.
+	 */
+	const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+	const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+	const unsigned int step_size = max(f1, f2);
+
+	/* Largest buffer size (in fsblocks) that can be invalidated. */
+	const unsigned int max_binval = xrep_binval_max_fsblocks(mp);
+
+	/* Maximum overhead of invalidating one buffer. */
+	const unsigned int per_binval =
+		xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval));
+
+	/*
+	 * For each transaction in a reap chain, we can delete some number of
+	 * extents and invalidate some number of blocks. We assume that btree
+	 * blocks aren't usually contiguous; and that scrub likely pulled all
+	 * the buffers into memory. From these assumptions, set the maximum
+	 * number of deferrals we can queue before flushing the defer chain,
+	 * and the number of invalidations we can queue before rolling to a
+	 * clean transaction (and possibly relogging some of the deferrals) to
+	 * the same quantity.
+	 */
+	const unsigned int variable_overhead = per_intent + per_binval;
+
+	xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
+			per_binval);
+
+	trace_xreap_agextent_limits(sc->tp, per_binval, rs->max_binval,
+			step_size, per_intent, rs->max_deferred);
+}
+
+/*
+ * Compute the maximum number of intent items that reaping can attach to the
+ * scrub transaction given the worst case log overhead of the intent items
+ * needed to reap a single CoW staging extent. This is not for freeing
+ * metadata blocks.
+ */
+STATIC void
+xreap_configure_agcow_limits(
+	struct xreap_state	*rs)
+{
+	struct xfs_scrub	*sc = rs->sc;
+	struct xfs_mount	*mp = sc->mp;
+
+	/*
+	 * In the worst case, relogging an intent item causes both an intent
+	 * item and a done item to be attached to a transaction for each extent
+	 * that we'd like to process.
+	 */
+	const unsigned int efi = xfs_efi_log_space(1) +
+				 xfs_efd_log_space(1);
+	const unsigned int rui = xfs_rui_log_space(1) +
+				 xfs_rud_log_space();
+	const unsigned int cui = xfs_cui_log_space(1) +
+				 xfs_cud_log_space();
+
+	/*
+	 * Various things can happen when reaping CoW staging extents:
+	 *
+	 * t0: Unmapping crosslinked CoW blocks: deferred removal of refcount
+	 * record, which defers removal of rmap record
+	 *
+	 * t2: Freeing CoW blocks: deferred removal of refcount record, which
+	 * defers removal of rmap record; and deferred removal of the space
+	 *
+	 * For simplicity, we'll use the worst-case intents size to determine
+	 * the maximum number of deferred extents before we have to finish the
+	 * whole chain. If we're trying to reap a btree larger than this size,
+	 * a crash midway through reaping can result in leaked blocks.
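To make the xreap_configure_limits() budgeting concrete, plug in round numbers (illustrative only, not real reservation values): t_log_res = 100,000, step_size (the fixed overhead) = 20,000, per_intent = 300, per_binval = 100. The function then computes:

	res          = 100000 - 20000      = 80000
	max_deferred = 80000 / (300 + 100) = 200 intents
	res         -= 200 * 300           = 20000 left
	max_binval   = 20000 / 100         = 200 invalidations

So the intent chain is finished after 200 queued deferrals and the transaction is rolled after 200 logged buffer invalidations, which is exactly the "same quantity" split that the comment above aims for.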
+ */ + const unsigned int t0 = cui + rui; + const unsigned int t2 = cui + rui + efi; + const unsigned int per_intent = max(t0, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* Largest buffer size (in fsblocks) that can be invalidated. */ + const unsigned int max_binval = xrep_binval_max_fsblocks(mp); + + /* Overhead of invalidating one buffer */ + const unsigned int per_binval = + xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval)); + + /* + * For each transaction in a reap chain, we can delete some number of + * extents and invalidate some number of blocks. We assume that CoW + * staging extents are usually more than 1 fsblock, and that there + * shouldn't be any buffers for those blocks. From the assumptions, + * set the number of deferrals to use as much of the reservation as + * it can, but leave space to invalidate 1/8th that number of buffers. + */ + const unsigned int variable_overhead = per_intent + + (per_binval / 8); + + xreap_configure_limits(rs, step_size, variable_overhead, per_intent, + per_binval); + + trace_xreap_agcow_limits(sc->tp, per_binval, rs->max_binval, step_size, + per_intent, rs->max_deferred); +} + /* * Break an AG metadata extent into sub-extents by fate (crosslinked, not * crosslinked), and dispose of each sub-extent separately. @@ -531,11 +736,11 @@ xreap_agmeta_extent( if (error) return error; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { error = xrep_roll_ag_trans(sc); if (error) return error; - xreap_reset(rs); + xreap_binval_reset(rs); } agbno += aglen; @@ -562,11 +767,12 @@ xrep_reap_agblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip == NULL); + xreap_configure_agextent_limits(&rs); error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -628,7 +834,7 @@ xreap_fsmeta_extent( if (error) goto out_agf; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { /* * Hold the AGF buffer across the transaction roll so * that we don't have to reattach it to the scrub @@ -639,7 +845,7 @@ xreap_fsmeta_extent( xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); if (error) goto out_agf; - xreap_reset(rs); + xreap_binval_reset(rs); } agbno += aglen; @@ -674,11 +880,15 @@ xrep_reap_fsblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); + if (oinfo == &XFS_RMAP_OINFO_COW) + xreap_configure_agcow_limits(&rs); + else + xreap_configure_agextent_limits(&rs); error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -770,7 +980,7 @@ xreap_rgextent_iter( rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno); /* - * If there are other rmappings, this block is cross linked and must + * t1: There are other rmappings; this block is cross linked and must * not be freed. Remove the forward and reverse mapping and move on. 
 */
	if (crosslinked) {
		trace_xreap_dispose_unmap_extent(rtg_group(sc->sr.rtg), rgbno,
				*rglenp);

		xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
-		rs->deferred++;
+		xreap_inc_defer(rs);
		return 0;
	}

	trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp);

	/*
-	 * The CoW staging extent is not crosslinked. Use deferred work items
+	 * t2: The CoW staging extent is not crosslinked. Use deferred work
	 * to remove the refcountbt records (which removes the rmap records)
	 * and free the extent. We're not worried about the system going down
	 * here because log recovery walks the refcount btree to clean out the
@@ -799,10 +1009,73 @@
	if (error)
		return error;

-	rs->deferred++;
+	xreap_inc_defer(rs);
	return 0;
}

+/*
+ * Compute the maximum number of intent items that reaping can attach to the
+ * scrub transaction given the worst case log overhead of the intent items
+ * needed to reap a single CoW staging extent. This is not for freeing
+ * metadata blocks.
+ */
+STATIC void
+xreap_configure_rgcow_limits(
+	struct xreap_state	*rs)
+{
+	struct xfs_scrub	*sc = rs->sc;
+	struct xfs_mount	*mp = sc->mp;
+
+	/*
+	 * In the worst case, relogging an intent item causes both an intent
+	 * item and a done item to be attached to a transaction for each extent
+	 * that we'd like to process.
+	 */
+	const unsigned int efi = xfs_efi_log_space(1) +
+				 xfs_efd_log_space(1);
+	const unsigned int rui = xfs_rui_log_space(1) +
+				 xfs_rud_log_space();
+	const unsigned int cui = xfs_cui_log_space(1) +
+				 xfs_cud_log_space();
+
+	/*
+	 * Various things can happen when reaping CoW staging extents:
+	 *
+	 * t1: Unmapping crosslinked CoW blocks: deferred removal of refcount
+	 * record, which defers removal of rmap record
+	 *
+	 * t2: Freeing CoW blocks: deferred removal of refcount record, which
+	 * defers removal of rmap record; and deferred removal of the space
+	 *
+	 * For simplicity, we'll use the worst-case intents size to determine
+	 * the maximum number of deferred extents before we have to finish the
+	 * whole chain. If we're trying to reap a btree larger than this size,
+	 * a crash midway through reaping can result in leaked blocks.
+	 */
+	const unsigned int t1 = cui + rui;
+	const unsigned int t2 = cui + rui + efi;
+	const unsigned int per_intent = max(t1, t2);
+
+	/*
+	 * For each transaction in a reap chain, we must be able to take one
+	 * step in the defer item chain, which should only consist of CUI, EFI,
+	 * or RUI items.
+	 */
+	const unsigned int f1 = xfs_calc_finish_rt_efi_reservation(mp, 1);
+	const unsigned int f2 = xfs_calc_finish_rt_rui_reservation(mp, 1);
+	const unsigned int f3 = xfs_calc_finish_rt_cui_reservation(mp, 1);
+	const unsigned int step_size = max3(f1, f2, f3);
+
+	/*
+	 * The only buffer for the rt device is the rtgroup super, so we don't
+	 * need to save space for buffer invalidations.
+ */ + xreap_configure_limits(rs, step_size, per_intent, per_intent, 0); + + trace_xreap_rgcow_limits(sc->tp, 0, 0, step_size, per_intent, + rs->max_deferred); +} + #define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ XFS_RTGLOCK_RMAP | \ XFS_RTGLOCK_REFCOUNT) @@ -855,11 +1128,11 @@ xreap_rtmeta_extent( if (error) goto out_unlock; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { error = xfs_trans_roll_inode(&sc->tp, sc->ip); if (error) goto out_unlock; - xreap_reset(rs); + xreap_binval_reset(rs); } rgbno += rglen; @@ -891,12 +1164,14 @@ xrep_reap_rtblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); + ASSERT(oinfo == &XFS_RMAP_OINFO_COW); + xreap_configure_rgcow_limits(&rs); error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -929,13 +1204,13 @@ xrep_reap_metadir_fsblocks( ASSERT(sc->ip != NULL); ASSERT(xfs_is_metadir_inode(sc->ip)); + xreap_configure_agextent_limits(&rs); xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); - error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) { + if (xreap_is_dirty(&rs)) { error = xrep_defer_finish(sc); if (error) return error; @@ -955,13 +1230,12 @@ xrep_reap_metadir_fsblocks( */ STATIC int xreap_bmapi_select( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap, bool *crosslinked) { struct xfs_owner_info oinfo; + struct xfs_scrub *sc = rs->sc; struct xfs_btree_cur *cur; xfs_filblks_t len = 1; xfs_agblock_t bno; @@ -975,7 +1249,8 @@ xreap_bmapi_select( cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, sc->sa.pag); - xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff); + xfs_rmap_ino_owner(&oinfo, rs->ip->i_ino, rs->whichfork, + imap->br_startoff); error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked); if (error) goto out_cur; @@ -1038,21 +1313,19 @@ xreap_buf_loggable( */ STATIC int xreap_bmapi_binval( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap) { + struct xfs_scrub *sc = rs->sc; struct xfs_mount *mp = sc->mp; struct xfs_perag *pag = sc->sa.pag; - int bmap_flags = xfs_bmapi_aflag(whichfork); + int bmap_flags = xfs_bmapi_aflag(rs->whichfork); xfs_fileoff_t off; xfs_fileoff_t max_off; xfs_extlen_t scan_blocks; xfs_agblock_t bno; xfs_agblock_t agbno; xfs_agblock_t agbno_next; - unsigned int invalidated = 0; int error; /* @@ -1079,7 +1352,7 @@ xreap_bmapi_binval( struct xfs_bmbt_irec hmap; int nhmaps = 1; - error = xfs_bmapi_read(ip, off, max_off - off, &hmap, + error = xfs_bmapi_read(rs->ip, off, max_off - off, &hmap, &nhmaps, bmap_flags); if (error) return error; @@ -1120,14 +1393,13 @@ xreap_bmapi_binval( xfs_buf_stale(bp); xfs_buf_relse(bp); } - invalidated++; /* * Stop invalidating if we've hit the limit; we should * still have enough reservation left to free however - * much of the mapping we've seen so far. + * far we've gotten. 
*/ - if (invalidated > XREAP_MAX_BINVAL) { + if (!xreap_inc_binval(rs)) { imap->br_blockcount = agbno_next - bno; goto out; } @@ -1149,12 +1421,11 @@ out: */ STATIC int xrep_reap_bmapi_iter( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap, bool crosslinked) { + struct xfs_scrub *sc = rs->sc; int error; if (crosslinked) { @@ -1171,14 +1442,14 @@ xrep_reap_bmapi_iter( imap->br_blockcount); /* - * Schedule removal of the mapping from the fork. We use + * t0: Schedule removal of the mapping from the fork. We use * deferred log intents in this function to control the exact * sequence of metadata updates. */ - xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); - xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT, -(int64_t)imap->br_blockcount); - xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_rmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); return 0; } @@ -1199,41 +1470,139 @@ xrep_reap_bmapi_iter( * transaction is full of logged buffer invalidations, so we need to * return early so that we can roll and retry. */ - error = xreap_bmapi_binval(sc, ip, whichfork, imap); + error = xreap_bmapi_binval(rs, imap); if (error || imap->br_blockcount == 0) return error; /* - * Schedule removal of the mapping from the fork. We use deferred log - * intents in this function to control the exact sequence of metadata + * t1: Schedule removal of the mapping from the fork. We use deferred + * work in this function to control the exact sequence of metadata * updates. */ - xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); - xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT, -(int64_t)imap->br_blockcount); return xfs_free_extent_later(sc->tp, imap->br_startblock, imap->br_blockcount, NULL, XFS_AG_RESV_NONE, XFS_FREE_EXTENT_SKIP_DISCARD); } +/* Compute the maximum mapcount of a file buffer. */ +static unsigned int +xreap_bmapi_binval_mapcount( + struct xfs_scrub *sc) +{ + /* directory blocks can span multiple fsblocks and be discontiguous */ + if (sc->sm->sm_type == XFS_SCRUB_TYPE_DIR) + return sc->mp->m_dir_geo->fsbcount; + + /* all other file xattr/symlink blocks must be contiguous */ + return 1; +} + +/* Compute the maximum block size of a file buffer. */ +static unsigned int +xreap_bmapi_binval_blocksize( + struct xfs_scrub *sc) +{ + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_DIR: + return sc->mp->m_dir_geo->blksize; + case XFS_SCRUB_TYPE_XATTR: + case XFS_SCRUB_TYPE_PARENT: + /* + * The xattr structure itself consists of single fsblocks, but + * there could be remote xattr blocks to invalidate. + */ + return XFS_XATTR_SIZE_MAX; + } + + /* everything else is a single block */ + return sc->mp->m_sb.sb_blocksize; +} + +/* + * Compute the maximum number of buffer invalidations that we can do while + * reaping a single extent from a file fork. 
+ */
+STATIC void
+xreap_configure_bmapi_limits(
	struct xreap_state	*rs)
+{
+	struct xfs_scrub	*sc = rs->sc;
+	struct xfs_mount	*mp = sc->mp;
+
+	/* overhead of invalidating a buffer */
+	const unsigned int per_binval =
+		xfs_buf_inval_log_space(xreap_bmapi_binval_mapcount(sc),
+					xreap_bmapi_binval_blocksize(sc));
+
+	/*
+	 * In the worst case, relogging an intent item causes both an intent
+	 * item and a done item to be attached to a transaction for each extent
+	 * that we'd like to process.
+	 */
+	const unsigned int efi = xfs_efi_log_space(1) +
+				 xfs_efd_log_space(1);
+	const unsigned int rui = xfs_rui_log_space(1) +
+				 xfs_rud_log_space();
+	const unsigned int bui = xfs_bui_log_space(1) +
+				 xfs_bud_log_space();
+
+	/*
+	 * t1: Unmapping crosslinked file data blocks: one bmap deletion,
+	 * possibly an EFI for underfilled bmbt blocks, and an rmap deletion.
+	 *
+	 * t2: Freeing file data blocks: one bmap deletion, possibly an
+	 * EFI for underfilled bmbt blocks, and another EFI for the space
+	 * itself.
+	 */
+	const unsigned int t1 = (bui + efi) + rui;
+	const unsigned int t2 = (bui + efi) + efi;
+	const unsigned int per_intent = max(t1, t2);
+
+	/*
+	 * For each transaction in a reap chain, we must be able to take one
+	 * step in the defer item chain, which should only consist of BUI, EFI,
+	 * or RUI items.
+	 */
+	const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+	const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+	const unsigned int f3 = xfs_calc_finish_bui_reservation(mp, 1);
+	const unsigned int step_size = max3(f1, f2, f3);
+
+	/*
+	 * Each call to xreap_ifork_extent starts with a clean transaction and
+	 * operates on a single mapping by creating a chain of log intent items
+	 * for that mapping. We need to leave enough reservation in the
+	 * transaction to log btree buffer and inode updates for each step in
+	 * the chain, and to relog the log intents.
+	 */
+	const unsigned int per_extent_res = per_intent + step_size;
+
+	xreap_configure_limits(rs, per_extent_res, per_binval, 0, per_binval);
+
+	trace_xreap_bmapi_limits(sc->tp, per_binval, rs->max_binval,
+			step_size, per_intent, 1);
+}
+
/*
 * Dispose of as much of this file extent as we can. Upon successful return,
 * the imap will reflect the mapping that was removed from the fork.
 */
STATIC int
xreap_ifork_extent(
-	struct xfs_scrub	*sc,
-	struct xfs_inode	*ip,
-	int			whichfork,
+	struct xreap_state	*rs,
	struct xfs_bmbt_irec	*imap)
{
+	struct xfs_scrub	*sc = rs->sc;
	xfs_agnumber_t		agno;
	bool			crosslinked;
	int			error;

	ASSERT(sc->sa.pag == NULL);

-	trace_xreap_ifork_extent(sc, ip, whichfork, imap);
+	trace_xreap_ifork_extent(sc, rs->ip, rs->whichfork, imap);

	agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
@@ -1248,11 +1617,11 @@
	 * Decide the fate of the blocks at the beginning of the mapping, then
	 * update the mapping to use it with the unmap calls.
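Note how the bmapi case calls xreap_configure_limits() differently from the AG cases above: per_extent_res (one intent chain plus one relogging step) is passed as the fixed overhead and per_intent as 0, so max_deferred stays 0 and the entire remaining budget funds buffer invalidations. With illustrative numbers only, t_log_res = 100,000, per_extent_res = 30,000 and per_binval = 500 give max_binval = 70,000 / 500 = 140 invalidations per transaction. That also explains the trailing 1 in the trace call: file-fork reaping processes one mapping per intent chain and finishes the chain after every extent (see xrep_reap_ifork below), so intents never accumulate across mappings.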
*/ - error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked); + error = xreap_bmapi_select(rs, imap, &crosslinked); if (error) goto out_agf; - error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked); + error = xrep_reap_bmapi_iter(rs, imap, crosslinked); if (error) goto out_agf; @@ -1276,6 +1645,11 @@ xrep_reap_ifork( struct xfs_inode *ip, int whichfork) { + struct xreap_state rs = { + .sc = sc, + .ip = ip, + .whichfork = whichfork, + }; xfs_fileoff_t off = 0; int bmap_flags = xfs_bmapi_aflag(whichfork); int error; @@ -1284,6 +1658,7 @@ xrep_reap_ifork( ASSERT(ip == sc->ip || ip == sc->tempip); ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip)); + xreap_configure_bmapi_limits(&rs); while (off < XFS_MAX_FILEOFF) { struct xfs_bmbt_irec imap; int nimaps = 1; @@ -1303,13 +1678,14 @@ xrep_reap_ifork( * can in a single transaction. */ if (xfs_bmap_is_real_extent(&imap)) { - error = xreap_ifork_extent(sc, ip, whichfork, &imap); + error = xreap_ifork_extent(&rs, &imap); if (error) return error; error = xfs_defer_finish(&sc->tp); if (error) return error; + xreap_defer_finish_reset(&rs); } off = imap.br_startoff + imap.br_blockcount; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index d00c18954a26..efd5a7ccdf62 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -1110,7 +1110,7 @@ xrep_will_attempt( return true; /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + if (XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) return true; /* Metadata is corrupt or failed cross-referencing. */ diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 9c04295742c8..2bb125c4f9bf 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -18,14 +18,6 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) #ifdef CONFIG_XFS_ONLINE_REPAIR -/* - * This is the maximum number of deferred extent freeing item extents (EFIs) - * that we'll attach to a transaction without rolling the transaction to avoid - * overrunning a tr_itruncate reservation. 
- */ -#define XREP_MAX_ITRUNCATE_EFIS (128) - - /* Repair helpers */ int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index 953ce7be78dc..5902398185a8 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -185,7 +185,7 @@ xrep_symlink_salvage_inline( return 0; nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); - strncpy(target_buf, ifp->if_data, nr); + memcpy(target_buf, ifp->if_data, nr); return nr; } diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 2450e214103f..987313a52e64 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -22,6 +22,7 @@ #include "xfs_parent.h" #include "xfs_metafile.h" #include "xfs_rtgroup.h" +#include "xfs_trans.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index a8187281eb96..39ea651cbb75 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2000,6 +2000,51 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); +DECLARE_EVENT_CLASS(xrep_reap_limits_class, + TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, + unsigned int max_binval, unsigned int step_size, + unsigned int per_intent, + unsigned int max_deferred), + TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, log_res) + __field(unsigned int, per_binval) + __field(unsigned int, max_binval) + __field(unsigned int, step_size) + __field(unsigned int, per_intent) + __field(unsigned int, max_deferred) + ), + TP_fast_assign( + __entry->dev = tp->t_mountp->m_super->s_dev; + __entry->log_res = tp->t_log_res; + __entry->per_binval = per_binval; + __entry->max_binval = max_binval; + __entry->step_size = step_size; + __entry->per_intent = per_intent; + __entry->max_deferred = max_deferred; + ), + TP_printk("dev %d:%d logres %u per_binval %u max_binval %u step_size %u per_intent %u max_deferred %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->log_res, + __entry->per_binval, + __entry->max_binval, + __entry->step_size, + __entry->per_intent, + __entry->max_deferred) +); +#define DEFINE_REPAIR_REAP_LIMITS_EVENT(name) \ +DEFINE_EVENT(xrep_reap_limits_class, name, \ + TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, \ + unsigned int max_binval, unsigned int step_size, \ + unsigned int per_intent, \ + unsigned int max_deferred), \ + TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred)) +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_rgcow_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_bmapi_limits); + DECLARE_EVENT_CLASS(xrep_reap_find_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, xfs_extlen_t len, bool crosslinked), diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 5eef3bc30bda..c3a593319bee 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -491,7 +491,7 @@ xfs_attr_finish_item( /* Reset trans after EAGAIN cycle since the transaction is new */ args->trans = tp; - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + if (XFS_TEST_ERROR(args->dp->i_mount, XFS_ERRTAG_LARP)) { error = -EIO; goto out; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 
f9ef3b2a332a..773d959965dc 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -387,8 +387,6 @@ xfs_buf_map_verify( struct xfs_buftarg *btp, struct xfs_buf_map *map) { - xfs_daddr_t eofs; - /* Check for IOs smaller than the sector size / not sector aligned */ ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); @@ -397,11 +395,10 @@ xfs_buf_map_verify( * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. */ - eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (map->bm_bn < 0 || map->bm_bn >= eofs) { + if (map->bm_bn < 0 || map->bm_bn >= btp->bt_nr_sectors) { xfs_alert(btp->bt_mount, "%s: daddr 0x%llx out of range, EOFS 0x%llx", - __func__, map->bm_bn, eofs); + __func__, map->bm_bn, btp->bt_nr_sectors); WARN_ON(1); return -EFSCORRUPTED; } @@ -1299,7 +1296,7 @@ xfs_buf_bio_end_io( if (bio->bi_status) xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && - XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) + XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) xfs_buf_ioerror(bp, -EIO); if (bp->b_flags & XBF_ASYNC) { @@ -1720,26 +1717,30 @@ xfs_configure_buftarg_atomic_writes( int xfs_configure_buftarg( struct xfs_buftarg *btp, - unsigned int sectorsize) + unsigned int sectorsize, + xfs_rfsblock_t nr_blocks) { - int error; + struct xfs_mount *mp = btp->bt_mount; - ASSERT(btp->bt_bdev != NULL); + if (btp->bt_bdev) { + int error; - /* Set up metadata sector size info */ - btp->bt_meta_sectorsize = sectorsize; - btp->bt_meta_sectormask = sectorsize - 1; + error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); + if (error) { + xfs_warn(mp, + "Cannot use blocksize %u on device %pg, err %d", + sectorsize, btp->bt_bdev, error); + return -EINVAL; + } - error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); - if (error) { - xfs_warn(btp->bt_mount, - "Cannot use blocksize %u on device %pg, err %d", - sectorsize, btp->bt_bdev, error); - return -EINVAL; + if (bdev_can_atomic_write(btp->bt_bdev)) + xfs_configure_buftarg_atomic_writes(btp); } - if (bdev_can_atomic_write(btp->bt_bdev)) - xfs_configure_buftarg_atomic_writes(btp); + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; + /* m_blkbb_log is not set up yet */ + btp->bt_nr_sectors = nr_blocks << (mp->m_sb.sb_blocklog - BBSHIFT); return 0; } @@ -1749,6 +1750,9 @@ xfs_init_buftarg( size_t logical_sectorsize, const char *descr) { + /* The maximum size of the buftarg is only known once the sb is read. */ + btp->bt_nr_sectors = (xfs_daddr_t)-1; + /* Set up device logical sector size mask */ btp->bt_logical_sectorsize = logical_sectorsize; btp->bt_logical_sectormask = logical_sectorsize - 1; @@ -2084,7 +2088,7 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) * This allows userspace to disrupt buffer caching for debug/testing * purposes. 
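/*
 * Editor's aside, a minimal sketch (hypothetical helper): with
 * bt_nr_sectors recording the filesystem size in 512-byte sectors,
 * any daddr-based range check reduces to a simple comparison, as in
 * the xfs_buf_map_verify() hunk above:
 */
static inline bool
xfs_sketch_daddr_valid(struct xfs_buftarg *btp, xfs_daddr_t daddr,
                unsigned int bblen)
{
        return daddr >= 0 && daddr + bblen <= btp->bt_nr_sectors;
}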
*/ - if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) + if (XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) lru_ref = 0; atomic_set(&bp->b_lru_ref, lru_ref); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b269e115d9ac..8fa7bdf59c91 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -103,6 +103,7 @@ struct xfs_buftarg { size_t bt_meta_sectormask; size_t bt_logical_sectorsize; size_t bt_logical_sectormask; + xfs_daddr_t bt_nr_sectors; /* LRU control structures */ struct shrinker *bt_shrinker; @@ -372,7 +373,8 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); -int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize); +int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize, + xfs_rfsblock_t nr_blocks); #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 5d58e2ae4972..e4c8af873632 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -736,6 +736,16 @@ xlog_recover_do_primary_sb_buffer( */ xfs_sb_from_disk(&mp->m_sb, dsb); + /* + * Grow can change the device size. Mirror that into the buftarg. + */ + mp->m_ddev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) { + mp->m_rtdev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + } + if (mp->m_sb.sb_agcount < orig_agcount) { xfs_alert(mp, "Shrinking AG count in log recovery not supported"); return -EFSCORRUPTED; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index dbd87e137694..39830b252ac8 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -10,61 +10,17 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_sysfs.h" #include "xfs_inode.h" #ifdef DEBUG -static unsigned int xfs_errortag_random_default[] = { - XFS_RANDOM_DEFAULT, - XFS_RANDOM_IFLUSH_1, - XFS_RANDOM_IFLUSH_2, - XFS_RANDOM_IFLUSH_3, - XFS_RANDOM_IFLUSH_4, - XFS_RANDOM_IFLUSH_5, - XFS_RANDOM_IFLUSH_6, - XFS_RANDOM_DA_READ_BUF, - XFS_RANDOM_BTREE_CHECK_LBLOCK, - XFS_RANDOM_BTREE_CHECK_SBLOCK, - XFS_RANDOM_ALLOC_READ_AGF, - XFS_RANDOM_IALLOC_READ_AGI, - XFS_RANDOM_ITOBP_INOTOBP, - XFS_RANDOM_IUNLINK, - XFS_RANDOM_IUNLINK_REMOVE, - XFS_RANDOM_DIR_INO_VALIDATE, - XFS_RANDOM_BULKSTAT_READ_CHUNK, - XFS_RANDOM_IODONE_IOERR, - XFS_RANDOM_STRATREAD_IOERR, - XFS_RANDOM_STRATCMPL_IOERR, - XFS_RANDOM_DIOWRITE_IOERR, - XFS_RANDOM_BMAPIFORMAT, - XFS_RANDOM_FREE_EXTENT, - XFS_RANDOM_RMAP_FINISH_ONE, - XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE, - XFS_RANDOM_REFCOUNT_FINISH_ONE, - XFS_RANDOM_BMAP_FINISH_ONE, - XFS_RANDOM_AG_RESV_CRITICAL, - 0, /* XFS_RANDOM_DROP_WRITES has been removed */ - XFS_RANDOM_LOG_BAD_CRC, - XFS_RANDOM_LOG_ITEM_PIN, - XFS_RANDOM_BUF_LRU_REF, - XFS_RANDOM_FORCE_SCRUB_REPAIR, - XFS_RANDOM_FORCE_SUMMARY_RECALC, - XFS_RANDOM_IUNLINK_FALLBACK, - XFS_RANDOM_BUF_IOERROR, - XFS_RANDOM_REDUCE_MAX_IEXTENTS, - XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, - XFS_RANDOM_AG_RESV_FAIL, - XFS_RANDOM_LARP, - XFS_RANDOM_DA_LEAF_SPLIT, - XFS_RANDOM_ATTR_LEAF_TO_NODE, - XFS_RANDOM_WB_DELAY_MS, - XFS_RANDOM_WRITE_DELAY_MS, - XFS_RANDOM_EXCHMAPS_FINISH_ONE, - XFS_RANDOM_METAFILE_RESV_CRITICAL, -}; +#define XFS_ERRTAG(_tag, _name, _default) \ +
[XFS_ERRTAG_##_tag] = (_default), +#include "xfs_errortag.h" +static const unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS }; +#undef XFS_ERRTAG struct xfs_errortag_attr { struct attribute attr; @@ -93,21 +49,18 @@ xfs_errortag_attr_store( size_t count) { struct xfs_mount *mp = to_mp(kobject); - struct xfs_errortag_attr *xfs_attr = to_attr(attr); + unsigned int error_tag = to_attr(attr)->tag; int ret; - unsigned int val; if (strcmp(buf, "default") == 0) { - val = xfs_errortag_random_default[xfs_attr->tag]; + mp->m_errortag[error_tag] = + xfs_errortag_random_default[error_tag]; } else { - ret = kstrtouint(buf, 0, &val); + ret = kstrtouint(buf, 0, &mp->m_errortag[error_tag]); if (ret) return ret; } - ret = xfs_errortag_set(mp, xfs_attr->tag, val); - if (ret) - return ret; return count; } @@ -118,10 +71,9 @@ xfs_errortag_attr_show( char *buf) { struct xfs_mount *mp = to_mp(kobject); - struct xfs_errortag_attr *xfs_attr = to_attr(attr); + unsigned int error_tag = to_attr(attr)->tag; - return snprintf(buf, PAGE_SIZE, "%u\n", - xfs_errortag_get(mp, xfs_attr->tag)); + return snprintf(buf, PAGE_SIZE, "%u\n", mp->m_errortag[error_tag]); } static const struct sysfs_ops xfs_errortag_sysfs_ops = { @@ -129,110 +81,28 @@ static const struct sysfs_ops xfs_errortag_sysfs_ops = { .store = xfs_errortag_attr_store, }; -#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \ +#define XFS_ERRTAG(_tag, _name, _default) \ static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \ - .tag = (_tag), \ -} - -#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr - -XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR); -XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1); -XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2); -XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3); -XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4); -XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5); -XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6); -XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF); -XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK); -XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK); -XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF); -XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI); -XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP); -XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK); -XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE); -XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE); -XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK); -XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR); -XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR); -XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR); -XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR); -XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT); -XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT); -XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE); -XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); -XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); -XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); -XFS_ERRORTAG_ATTR_RW(buf_lru_ref, 
XFS_ERRTAG_BUF_LRU_REF); -XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); -XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); -XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); -XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); -XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); -XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); -XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); -XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); -XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); -XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); -XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); -XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); -XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL); + .tag = XFS_ERRTAG_##_tag, \ +}; +#include "xfs_errortag.h" +XFS_ERRTAGS +#undef XFS_ERRTAG +#define XFS_ERRTAG(_tag, _name, _default) \ + &xfs_errortag_attr_##_name.attr, +#include "xfs_errortag.h" static struct attribute *xfs_errortag_attrs[] = { - XFS_ERRORTAG_ATTR_LIST(noerror), - XFS_ERRORTAG_ATTR_LIST(iflush1), - XFS_ERRORTAG_ATTR_LIST(iflush2), - XFS_ERRORTAG_ATTR_LIST(iflush3), - XFS_ERRORTAG_ATTR_LIST(iflush4), - XFS_ERRORTAG_ATTR_LIST(iflush5), - XFS_ERRORTAG_ATTR_LIST(iflush6), - XFS_ERRORTAG_ATTR_LIST(dareadbuf), - XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk), - XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk), - XFS_ERRORTAG_ATTR_LIST(readagf), - XFS_ERRORTAG_ATTR_LIST(readagi), - XFS_ERRORTAG_ATTR_LIST(itobp), - XFS_ERRORTAG_ATTR_LIST(iunlink), - XFS_ERRORTAG_ATTR_LIST(iunlinkrm), - XFS_ERRORTAG_ATTR_LIST(dirinovalid), - XFS_ERRORTAG_ATTR_LIST(bulkstat), - XFS_ERRORTAG_ATTR_LIST(logiodone), - XFS_ERRORTAG_ATTR_LIST(stratread), - XFS_ERRORTAG_ATTR_LIST(stratcmpl), - XFS_ERRORTAG_ATTR_LIST(diowrite), - XFS_ERRORTAG_ATTR_LIST(bmapifmt), - XFS_ERRORTAG_ATTR_LIST(free_extent), - XFS_ERRORTAG_ATTR_LIST(rmap_finish_one), - XFS_ERRORTAG_ATTR_LIST(refcount_continue_update), - XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), - XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), - XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), - XFS_ERRORTAG_ATTR_LIST(log_bad_crc), - XFS_ERRORTAG_ATTR_LIST(log_item_pin), - XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), - XFS_ERRORTAG_ATTR_LIST(force_repair), - XFS_ERRORTAG_ATTR_LIST(bad_summary), - XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), - XFS_ERRORTAG_ATTR_LIST(buf_ioerror), - XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), - XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), - XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), - XFS_ERRORTAG_ATTR_LIST(larp), - XFS_ERRORTAG_ATTR_LIST(da_leaf_split), - XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), - XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), - XFS_ERRORTAG_ATTR_LIST(write_delay_ms), - XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one), - XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit), - NULL, + XFS_ERRTAGS + NULL }; ATTRIBUTE_GROUPS(xfs_errortag); +#undef XFS_ERRTAG + +/* -1 because XFS_ERRTAG_DROP_WRITES got removed, + 1 for NULL termination */ +static_assert(ARRAY_SIZE(xfs_errortag_attrs) == XFS_ERRTAG_MAX); static const struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, @@ -295,7 +165,6 @@ xfs_errortag_enabled( bool xfs_errortag_test( struct xfs_mount *mp, - const char *expression, const char *file, int line, unsigned int error_tag) @@ -321,36 +190,12 @@ xfs_errortag_test( return false; 
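/*
 * Editor's aside: the attribute table above is generated by expanding
 * an X-macro from xfs_errortag.h three times. A self-contained
 * miniature of the same pattern (demo names only, not XFS code):
 */
#define DEMO_TAGS \
        DEMO_TAG(ALPHA, alpha, 100) \
        DEMO_TAG(BETA, beta, 200)

#define DEMO_TAG(_tag, _name, _default) DEMO_ERRTAG_##_tag,
enum { DEMO_TAGS DEMO_ERRTAG_MAX };
#undef DEMO_TAG

#define DEMO_TAG(_tag, _name, _default) [DEMO_ERRTAG_##_tag] = (_default),
static const unsigned int demo_defaults[] = { DEMO_TAGS };
#undef DEMO_TAG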
xfs_warn_ratelimited(mp, -"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", - expression, file, line, mp->m_super->s_id); +"Injecting error at file %s, line %d, on filesystem \"%s\"", + file, line, mp->m_super->s_id); return true; } int -xfs_errortag_get( - struct xfs_mount *mp, - unsigned int error_tag) -{ - if (!xfs_errortag_valid(error_tag)) - return -EINVAL; - - return mp->m_errortag[error_tag]; -} - -int -xfs_errortag_set( - struct xfs_mount *mp, - unsigned int error_tag, - unsigned int tag_value) -{ - if (!xfs_errortag_valid(error_tag)) - return -EINVAL; - - mp->m_errortag[error_tag] = tag_value; - return 0; -} - -int xfs_errortag_add( struct xfs_mount *mp, unsigned int error_tag) @@ -359,9 +204,8 @@ xfs_errortag_add( if (!xfs_errortag_valid(error_tag)) return -EINVAL; - - return xfs_errortag_set(mp, error_tag, - xfs_errortag_random_default[error_tag]); + mp->m_errortag[error_tag] = xfs_errortag_random_default[error_tag]; + return 0; } int diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 0b9c5ba8a598..fe6a71bbe9cd 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -8,22 +8,17 @@ struct xfs_mount; -extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, - const char *filename, int linenum, - xfs_failaddr_t failaddr); -extern void xfs_corruption_error(const char *tag, int level, - struct xfs_mount *mp, const void *buf, size_t bufsize, - const char *filename, int linenum, - xfs_failaddr_t failaddr); +void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, xfs_failaddr_t failaddr); +void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, + const void *buf, size_t bufsize, const char *filename, + int linenum, xfs_failaddr_t failaddr); void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa); -extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, - const char *name, const void *buf, size_t bufsz, - xfs_failaddr_t failaddr); -extern void xfs_verifier_error(struct xfs_buf *bp, int error, - xfs_failaddr_t failaddr); -extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, - const char *name, const void *buf, size_t bufsz, - xfs_failaddr_t failaddr); +void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr); +void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); +void xfs_inode_verifier_error(struct xfs_inode *ip, int error, const char *name, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) @@ -39,12 +34,12 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, #define XFS_CORRUPTION_DUMP_LEN (128) #ifdef DEBUG -extern int xfs_errortag_init(struct xfs_mount *mp); -extern void xfs_errortag_del(struct xfs_mount *mp); -extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression, - const char *file, int line, unsigned int error_tag); -#define XFS_TEST_ERROR(expr, mp, tag) \ - ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) +int xfs_errortag_init(struct xfs_mount *mp); +void xfs_errortag_del(struct xfs_mount *mp); +bool xfs_errortag_test(struct xfs_mount *mp, const char *file, int line, + unsigned int error_tag); +#define XFS_TEST_ERROR(mp, tag) \ + xfs_errortag_test((mp), __FILE__, __LINE__, (tag)) bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int 
tag); #define XFS_ERRORTAG_DELAY(mp, tag) \ do { \ @@ -58,17 +53,13 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); mdelay((mp)->m_errortag[(tag)]); \ } while (0) -extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); -extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, - unsigned int tag_value); -extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); -extern int xfs_errortag_clearall(struct xfs_mount *mp); +int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); +int xfs_errortag_clearall(struct xfs_mount *mp); #else #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) -#define XFS_TEST_ERROR(expr, mp, tag) (expr) +#define XFS_TEST_ERROR(mp, tag) (false) #define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) -#define xfs_errortag_set(mp, tag, val) (ENOSYS) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) #endif /* DEBUG */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 47ee598a9827..418ddab590e0 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -202,7 +202,7 @@ xfs_efi_copy_format( sizeof(struct xfs_extent)); return 0; } else if (buf->iov_len == len32) { - xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->iov_base; + struct xfs_efi_log_format_32 *src_efi_fmt_32 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; @@ -216,7 +216,7 @@ xfs_efi_copy_format( } return 0; } else if (buf->iov_len == len64) { - xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->iov_base; + struct xfs_efi_log_format_64 *src_efi_fmt_64 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index c8402040410b..af1b0331f7af 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -49,7 +49,7 @@ struct xfs_efi_log_item { struct xfs_log_item efi_item; atomic_t efi_refcount; atomic_t efi_next_extent; - xfs_efi_log_format_t efi_format; + struct xfs_efi_log_format efi_format; }; static inline size_t @@ -69,7 +69,7 @@ struct xfs_efd_log_item { struct xfs_log_item efd_item; struct xfs_efi_log_item *efd_efip; uint efd_next_extent; - xfs_efd_log_format_t efd_format; + struct xfs_efd_log_format efd_format; }; static inline size_t diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f96fbf5c54c9..2702fef2c90c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -75,52 +75,47 @@ xfs_dir_fsync( return xfs_log_force_inode(ip); } -static xfs_csn_t -xfs_fsync_seq( - struct xfs_inode *ip, - bool datasync) -{ - if (!xfs_ipincount(ip)) - return 0; - if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - return 0; - return ip->i_itemp->ili_commit_seq; -} - /* - * All metadata updates are logged, which means that we just have to flush the - * log up to the latest LSN that touched the inode. + * All metadata updates are logged, which means that we just have to push the + * journal to the required sequence number that holds the updates. We track + * datasync commits separately to full sync commits, and hence only need to + * select the correct sequence number for the log force here. * - * If we have concurrent fsync/fdatasync() calls, we need them to all block on - * the log force before we clear the ili_fsync_fields field.
This ensures that - we don't get a racing sync operation that does not wait for the metadata to - hit the journal before returning. If we race with clearing ili_fsync_fields, - then all that will happen is the log force will do nothing as the lsn will - already be on disk. We can't race with setting ili_fsync_fields because that - is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock - shared until after the ili_fsync_fields is cleared. + * We don't have to serialise against concurrent modifications, as we do not + * have to wait for modifications that have not yet completed. We define a + * transaction commit as completing when the commit sequence number is updated, + * hence if the sequence number has not updated, the sync operation has been + * run before the commit completed and we don't have to wait for it. + * + * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain + * set on the log item until - at least - the journal flush completes. In + * reality, they are only cleared when the inode is fully unpinned (i.e. + * persistent in the journal and not dirty in the CIL), and so we rely on + * xfs_log_force_seq() either skipping sequences that have been persisted or + * waiting on sequences that are still in flight to correctly order concurrent + * sync operations. */ -static int +static int xfs_fsync_flush_log( struct xfs_inode *ip, bool datasync, int *log_flushed) { - int error = 0; - xfs_csn_t seq; + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - seq = xfs_fsync_seq(ip, datasync); - if (seq) { - error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, - log_flushed); + spin_lock(&iip->ili_lock); + if (datasync) + seq = iip->ili_datasync_seq; + else + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); - spin_lock(&ip->i_itemp->ili_lock); - ip->i_itemp->ili_fsync_fields = 0; - spin_unlock(&ip->i_itemp->ili_lock); - } - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; + if (!seq) + return 0; + + return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, + log_flushed); } STATIC int @@ -158,12 +153,10 @@ xfs_file_fsync( error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); /* - * Any inode that has dirty modifications in the log is pinned. The - * racy check here for a pinned inode will not catch modifications - * that happen concurrently to the fsync call, but fsync semantics - * only require to sync previously completed I/O. + * If the inode has an inode log item attached, it may need the journal + * flushed to persist any changes the log item might be tracking.
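/*
 * Editor's aside, a standalone restatement of the selection logic
 * above: timestamp- and iversion-only changes never set
 * ili_datasync_seq, so fdatasync() sees a zero sequence and returns
 * without forcing the log. The helper name is hypothetical.
 */
static inline xfs_csn_t
xfs_sketch_fsync_seq(struct xfs_inode_log_item *iip, bool datasync)
{
        return datasync ? iip->ili_datasync_seq : iip->ili_commit_seq;
}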
*/ - if (xfs_ipincount(ip)) { + if (ip->i_itemp) { err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed); if (err2 && !error) error = err2; diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f6f628c01feb..566fd663c95b 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -14,8 +14,6 @@ */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ - .sgid_inherit = { 0, 0, 1 }, - .symlink_mode = { 0, 0, 1 }, .panic_mask = { 0, 0, XFS_PTAG_MASK}, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 4cf7abe50143..e44040206851 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -646,8 +646,7 @@ xfs_iget_cache_miss( goto out_destroy; /* - * For version 5 superblocks, if we are initialising a new inode and we - * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can + * For version 5 superblocks, if we are initialising a new inode, we * simply build the new inode core with a random generation number. * * For version 4 (and older) superblocks, log recovery is dependent on @@ -655,8 +654,7 @@ xfs_iget_cache_miss( * value and hence we must also read the inode off disk even when * initializing new inodes. */ - if (xfs_has_v3inodes(mp) && - (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { + if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE)) { VFS_I(ip)->i_generation = get_random_u32(); } else { struct xfs_buf *bp; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9c39251961a3..36b39539e561 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -877,6 +877,35 @@ xfs_create_tmpfile( return error; } +static inline int +xfs_projid_differ( + struct xfs_inode *tdp, + struct xfs_inode *sip) +{ + /* + * If we are using project inheritance, we only allow hard link/rename + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. + */ + if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && + tdp->i_projid != sip->i_projid)) { + /* + * Project quota setup skips special files which can + * leave inodes in a PROJINHERIT directory without a + * project ID set. We need to allow links to be made + * to these "project-less" inodes because userspace + * expects them to succeed after project ID setup, + * but everything else should be rejected. + */ + if (!special_file(VFS_I(sip)->i_mode) || + sip->i_projid != 0) { + return -EXDEV; + } + } + + return 0; +} + int xfs_link( struct xfs_inode *tdp, @@ -930,27 +959,9 @@ xfs_link( goto error_return; } - /* - * If we are using project inheritance, we only allow hard link - * creation in our tree when the project IDs are the same; else - * the tree quota mechanism could be circumvented. - */ - if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && - tdp->i_projid != sip->i_projid)) { - /* - * Project quota setup skips special files which can - * leave inodes in a PROJINHERIT directory without a - * project ID set. We need to allow links to be made - * to these "project-less" inodes because userspace - * expects them to succeed after project ID setup, - * but everything else should be rejected.
- */ - if (!special_file(VFS_I(sip)->i_mode) || - sip->i_projid != 0) { - error = -EXDEV; - goto error_return; - } - } + error = xfs_projid_differ(tdp, sip); + if (error) + goto error_return; error = xfs_dir_add_child(tp, resblks, &du); if (error) @@ -1035,7 +1046,7 @@ xfs_itruncate_extents_flags( int error = 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - if (atomic_read(&VFS_I(ip)->i_count)) + if (icount_read(VFS_I(ip))) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -1656,7 +1667,6 @@ retry: spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; spin_unlock(&iip->ili_lock); ASSERT(iip->ili_last_fields); @@ -1821,12 +1831,20 @@ static void xfs_iunpin( struct xfs_inode *ip) { - xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); + if (!seq) + return; /* Give the log a push to start the unpinning I/O */ - xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); + xfs_log_force_seq(ip->i_mount, seq, 0, NULL); } @@ -2227,16 +2245,9 @@ retry: if (du_wip.ip) xfs_trans_ijoin(tp, du_wip.ip, 0); - /* - * If we are using project inheritance, we only allow renames - * into our tree when the project IDs are the same; else the - * tree quota mechanism would be circumvented. - */ - if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) && - target_dp->i_projid != src_ip->i_projid)) { - error = -EXDEV; + error = xfs_projid_differ(target_dp, src_ip); + if (error) goto out_trans_cancel; - } /* RENAME_EXCHANGE is unique from here on. */ if (flags & RENAME_EXCHANGE) { @@ -2377,8 +2388,8 @@ xfs_iflush( * error handling as the caller will shutdown and fail the buffer. 
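/*
 * Editor's aside on the XFS_TEST_ERROR() conversion in the hunks that
 * follow: the macro lost its expression argument, so every call site
 * is rewritten mechanically (cond and XFS_ERRTAG_FOO are placeholders):
 *
 *      before: if (XFS_TEST_ERROR(cond, mp, XFS_ERRTAG_FOO))
 *      after:  if (cond || XFS_TEST_ERROR(mp, XFS_ERRTAG_FOO))
 *
 * On non-DEBUG builds the macro now evaluates to false, so the real
 * corruption test must be spelled out explicitly at each call site.
 */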
*/ error = -EFSCORRUPTED; - if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), - mp, XFS_ERRTAG_IFLUSH_1)) { + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); @@ -2394,29 +2405,27 @@ xfs_iflush( goto flush_out; } } else if (S_ISREG(VFS_I(ip)->i_mode)) { - if (XFS_TEST_ERROR( - ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && - ip->i_df.if_format != XFS_DINODE_FMT_BTREE, - mp, XFS_ERRTAG_IFLUSH_3)) { + if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } } else if (S_ISDIR(VFS_I(ip)->i_mode)) { - if (XFS_TEST_ERROR( - ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && - ip->i_df.if_format != XFS_DINODE_FMT_BTREE && - ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, - mp, XFS_ERRTAG_IFLUSH_4)) { + if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE && + ip->i_df.if_format != XFS_DINODE_FMT_LOCAL) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > - ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { + if (ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > + ip->i_nblocks || XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %llu, " "total extents = %llu nblocks = %lld, ptr "PTR_FMT, @@ -2425,8 +2434,8 @@ xfs_iflush( ip->i_nblocks, ip); goto flush_out; } - if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, - mp, XFS_ERRTAG_IFLUSH_6)) { + if (ip->i_forkoff > mp->m_sb.sb_inodesize || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_forkoff, ip); @@ -2502,7 +2511,6 @@ flush_out: spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags); spin_unlock(&iip->ili_lock); @@ -2661,12 +2669,15 @@ int xfs_log_force_inode( struct xfs_inode *ip) { + struct xfs_inode_log_item *iip = ip->i_itemp; xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - seq = ip->i_itemp->ili_commit_seq; - xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (!iip) + return 0; + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); if (!seq) return 0; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 829675700fcd..1bd411a1114c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -131,46 +131,28 @@ xfs_inode_item_precommit( } /* - * Inode verifiers do not check that the extent size hint is an integer - * multiple of the rt extent size on a directory with both rtinherit - * and extszinherit flags set. If we're logging a directory that is - * misconfigured in this way, clear the hint. + * Inode verifiers do not check that the extent size hints are an + * integer multiple of the rt extent size on a directory with + * rtinherit flags set. If we're logging a directory that is + * misconfigured in this way, clear the bad hints. 
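/*
 * Editor's aside, a minimal sketch of the invariant enforced above
 * (hypothetical helper): inherited extent size and CoW extent size
 * hints on an rtinherit directory must be whole realtime extents.
 */
static inline bool
xfs_sketch_rt_hint_valid(struct xfs_mount *mp, xfs_extlen_t hint)
{
        /* a zero hint is always fine; otherwise no rtext remainder */
        return hint == 0 || xfs_extlen_to_rtxmod(mp, hint) == 0;
}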
*/ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { - ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | - XFS_DIFLAG_EXTSZINHERIT); - ip->i_extsize = 0; - flags |= XFS_ILOG_CORE; + if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) { + if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + flags |= XFS_ILOG_CORE; + } } - /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it - * to XFS_ILOG_CORE so that the actual on-disk dirty tracking - * (ili_fields) correctly tracks that the version has changed. - */ spin_lock(&iip->ili_lock); - iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); - if (flags & XFS_ILOG_IVERSION) - flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); - - /* - * Inode verifiers do not check that the CoW extent size hint is an - * integer multiple of the rt extent size on a directory with both - * rtinherit and cowextsize flags set. If we're logging a directory - * that is misconfigured in this way, clear the hint. - */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && - xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { - ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; - ip->i_cowextsize = 0; - flags |= XFS_ILOG_CORE; - } - if (!iip->ili_item.li_buf) { struct xfs_buf *bp; int error; @@ -205,6 +187,20 @@ xfs_inode_item_precommit( } /* + * Store the dirty flags back into the inode item as this state is used + * later on in xfs_inode_item_committing() to determine whether the + * transaction is relevant to fsync state or not. + */ + iip->ili_dirty_flags = flags; + + /* + * Convert the XFS_ILOG_IVERSION flag into XFS_ILOG_CORE so that + * ili_fields correctly tracks the on-disk fields that have been + * modified in the transaction. + */ + if (flags & XFS_ILOG_IVERSION) + flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + + /* * Always OR in the bits from the ili_last_fields field. This is to * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines * in the eventual clearing of the ili_fields bits. See the big comment @@ -214,12 +210,6 @@ xfs_inode_item_precommit( spin_unlock(&iip->ili_lock); xfs_inode_item_precommit_check(ip); - - /* - * We are done with the log item transaction dirty state, so clear it so - * that it doesn't pollute future transactions. - */ - iip->ili_dirty_flags = 0; return 0; } @@ -729,13 +719,24 @@ xfs_inode_item_unpin( struct xfs_log_item *lip, int remove) { - struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE)); ASSERT(atomic_read(&ip->i_pincount) > 0); + + /* + * If this is the last unpin, then the inode no longer needs a journal + * flush to persist it. Hence we can clear the commit sequence numbers + * as an fsync/fdatasync operation on the inode at this point is a no-op.
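/*
 * Editor's aside: atomic_dec_and_lock(cnt, lock) decrements cnt and
 * returns true with the spinlock held only when the count reaches
 * zero; otherwise it returns false without taking the lock. A
 * semantically equivalent (but slower) open coding for reference:
 */
static inline int xfs_sketch_dec_and_lock(atomic_t *cnt, spinlock_t *lock)
{
        spin_lock(lock);
        if (atomic_dec_and_test(cnt))
                return 1;       /* caller must drop the lock */
        spin_unlock(lock);
        return 0;
}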
+ */ + if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) { + iip->ili_commit_seq = 0; + iip->ili_datasync_seq = 0; + spin_unlock(&iip->ili_lock); wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); + } } STATIC uint @@ -858,12 +859,45 @@ xfs_inode_item_committed( return lsn; } +/* + * The modification is now complete, so before we unlock the inode we need to + * update the commit sequence numbers for data integrity journal flushes. We + * always record the commit sequence number (ili_commit_seq) so that anything + * that needs a full journal sync will capture all of this modification. + * + * We then check if the changes will impact a datasync (O_DSYNC) journal + * flush. If the changes will require a datasync flush, then we also record + * the sequence in ili_datasync_seq. + * + * These commit sequence numbers will get cleared atomically with the inode + * being unpinned (i.e. pin count goes to zero), and so they will only be set + * when the inode is dirty in the journal. This removes the need for checking + * if the inode is pinned to determine if a journal flush is necessary, and + * hence removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to + * serialise pin counts against commit sequence number updates. + */ STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, xfs_csn_t seq) { - INODE_ITEM(lip)->ili_commit_seq = seq; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + + spin_lock(&iip->ili_lock); + iip->ili_commit_seq = seq; + if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP)) + iip->ili_datasync_seq = seq; + spin_unlock(&iip->ili_lock); + + /* + * Clear the per-transaction dirty flags now that we have finished + * recording the transaction's inode modifications in the CIL and are + * about to release and (maybe) unlock the inode. + */ + iip->ili_dirty_flags = 0; + return xfs_inode_item_release(lip); } @@ -1055,7 +1089,6 @@ xfs_iflush_abort_clean( { iip->ili_last_fields = 0; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; iip->ili_flush_lsn = 0; iip->ili_item.li_buf = NULL; list_del_init(&iip->ili_item.li_bio_list); diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index ba92ce11a011..2ddcca41714f 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -32,9 +32,17 @@ struct xfs_inode_log_item { spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - unsigned int ili_fsync_fields; /* logged since last fsync */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + + /* + * We record the sequence number for every inode modification, as + * well as those that only require fdatasync operations for data + * integrity. This allows optimisation of the O_DSYNC/fdatasync path + * without needing to track what modifications the journal is currently + * carrying for the inode. These are protected by the above ili_lock.
+ */ xfs_csn_t ili_commit_seq; /* last transaction commit */ + xfs_csn_t ili_datasync_seq; /* for datasync optimisation */ }; static inline int xfs_inode_clean(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index e1051a530a50..a6bb7ee7a27a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -512,9 +512,6 @@ xfs_fileattr_get( { struct xfs_inode *ip = XFS_I(d_inode(dentry)); - if (d_is_special(dentry)) - return -ENOTTY; - xfs_ilock(ip, XFS_ILOCK_SHARED); xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa); xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -736,9 +733,6 @@ xfs_fileattr_set( trace_xfs_ioctl_setattr(ip); - if (d_is_special(dentry)) - return -ENOTTY; - if (!fa->fsx_valid) { if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL | FS_NODUMP_FL | @@ -1209,21 +1203,21 @@ xfs_file_ioctl( current->comm); return -ENOTTY; case XFS_IOC_DIOINFO: { - struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct kstat st; struct dioattr da; - da.d_mem = target->bt_logical_sectorsize; + error = vfs_getattr(&filp->f_path, &st, STATX_DIOALIGN, 0); + if (error) + return error; /* - * See xfs_report_dioalign() for an explanation about why this - * reports a value larger than the sector size for COW inodes. + * Some userspace directly feeds the return value to + * posix_memalign, which fails for values that are smaller than + * the pointer size. Round up the value to not break userspace. */ - if (xfs_is_cow_inode(ip)) - da.d_miniosz = xfs_inode_alloc_unitsize(ip); - else - da.d_miniosz = target->bt_logical_sectorsize; + da.d_mem = roundup(st.dio_mem_align, sizeof(void *)); + da.d_miniosz = st.dio_offset_align; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); - if (copy_to_user(arg, &da, sizeof(da))) return -EFAULT; return 0; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2a74f2957341..d3f6e3e42a11 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -149,9 +149,18 @@ xfs_bmbt_to_iomap( iomap->bdev = target->bt_bdev; iomap->flags = iomap_flags; - if (xfs_ipincount(ip) && - (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - iomap->flags |= IOMAP_F_DIRTY; + /* + * If the inode is dirty for datasync purposes, let iomap know so it + * doesn't elide the IO completion journal flushes on O_DSYNC IO. 
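/*
 * Editor's aside (generic iomap consumer, not part of this hunk):
 * IOMAP_F_DIRTY tells the direct I/O core that metadata still needs
 * flushing at completion, so an O_DSYNC write cannot take the pure
 * REQ_FUA fast path. A sketch of that decision:
 */
static inline bool
xfs_sketch_dsync_can_use_fua(const struct iomap *iomap)
{
        return !(iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY)) &&
               bdev_fua(iomap->bdev);
}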
+ */ + if (ip->i_itemp) { + struct xfs_inode_log_item *iip = ip->i_itemp; + + spin_lock(&iip->ili_lock); + if (iip->ili_datasync_seq) + iomap->flags |= IOMAP_F_DIRTY; + spin_unlock(&iip->ili_lock); + } iomap->validity_cookie = sequence_cookie; return 0; @@ -1554,7 +1563,7 @@ xfs_zoned_buffered_write_iomap_begin( return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; @@ -1728,7 +1737,7 @@ xfs_buffered_write_iomap_begin( return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 603effabe1ee..caff0125faea 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -431,14 +431,12 @@ xfs_vn_symlink( struct dentry *dentry, const char *symname) { - struct inode *inode; - struct xfs_inode *cip = NULL; - struct xfs_name name; - int error; - umode_t mode; + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; + int error; + umode_t mode = S_IFLNK | S_IRWXUGO; - mode = S_IFLNK | - (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); error = xfs_dentry_mode_to_name(&name, dentry, mode); if (unlikely(error)) goto out; @@ -1335,6 +1333,8 @@ static const struct inode_operations xfs_symlink_inode_operations = { .setattr = xfs_vn_setattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .fileattr_get = xfs_fileattr_get, + .fileattr_set = xfs_fileattr_set, }; /* Figure out if this file actually supports DAX. */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 9a2221b4aa21..4dd747bdbcca 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -89,8 +89,6 @@ typedef __u32 xfs_nlink_t; #undef XFS_NATIVE_HOST #endif -#define irix_sgid_inherit xfs_params.sgid_inherit.val -#define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val #define xfs_error_level xfs_params.error_level.val #define xfs_syncd_centisecs xfs_params.syncd_timer.val diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c8a57e21a1d3..603e85c1ab4c 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -969,8 +969,8 @@ xfs_log_unmount_write( * counters will be recalculated. Refer to xlog_check_unmount_rec for * more details. */ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + if (xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { xfs_alert(mp, "%s: will fix summary counters at next mount", __func__); return; @@ -1240,7 +1240,7 @@ xlog_ioend_work( /* * Race to shutdown the filesystem if we see an error. 
*/ - if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { + if (error || XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } @@ -1489,8 +1489,7 @@ xlog_alloc_log( log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | - WQ_HIGHPRI), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU), 0, mp->m_super->s_id); if (!log->l_ioend_workqueue) goto out_free_iclog; @@ -1568,13 +1567,13 @@ xlog_cksum( struct xlog *log, struct xlog_rec_header *rhead, char *dp, - int size) + unsigned int hdrsize, + unsigned int size) { uint32_t crc; /* first generate the crc for the record header ... */ - crc = xfs_start_cksum_update((char *)rhead, - sizeof(struct xlog_rec_header), + crc = xfs_start_cksum_update((char *)rhead, hdrsize, offsetof(struct xlog_rec_header, h_crc)); /* ... then for additional cycle data for v2 logs ... */ @@ -1818,7 +1817,7 @@ xlog_sync( /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, - iclog->ic_datap, size); + iclog->ic_datap, XLOG_REC_SIZE, size); /* * Intentionally corrupt the log record CRC based on the error injection * frequency, if defined. This facilitates testing log recovery in the @@ -1827,7 +1826,7 @@ xlog_sync( * detects the bad CRC and attempts to recover. */ #ifdef DEBUG - if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { + if (XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_fail_crc = true; xfs_warn(log->l_mp, @@ -2656,10 +2655,11 @@ restart: * until you know exactly how many bytes get copied. Therefore, wait * until later to update ic_offset. * - * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's + * xlog_write() algorithm assumes that at least 2 xlog_op_header's * can fit into remaining data section. 
*/ - if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + if (iclog->ic_size - iclog->ic_offset < + 2 * sizeof(struct xlog_op_header)) { int error = 0; xlog_state_switch_iclogs(log, iclog, iclog->ic_size); @@ -3153,11 +3153,11 @@ xlog_calc_unit_res( */ /* for trans header */ - unit_bytes += sizeof(xlog_op_header_t); - unit_bytes += sizeof(xfs_trans_header_t); + unit_bytes += sizeof(struct xlog_op_header); + unit_bytes += sizeof(struct xfs_trans_header); /* for start-rec */ - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); /* * for LR headers - the space for data in an iclog is the size minus @@ -3180,12 +3180,12 @@ xlog_calc_unit_res( num_headers = howmany(unit_bytes, iclog_space); /* for split-recs - ophdrs added when data split over LRs */ - unit_bytes += sizeof(xlog_op_header_t) * num_headers; + unit_bytes += sizeof(struct xlog_op_header) * num_headers; /* add extra header reservations if we overrun */ while (!num_headers || howmany(unit_bytes, iclog_space) > num_headers) { - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); num_headers++; } unit_bytes += log->l_iclog_hsize * num_headers; @@ -3322,7 +3322,7 @@ xlog_verify_iclog( struct xlog_in_core *iclog, int count) { - xlog_op_header_t *ophead; + struct xlog_op_header *ophead; xlog_in_core_t *icptr; xlog_in_core_2_t *xhdr; void *base_ptr, *ptr, *p; @@ -3400,7 +3400,7 @@ xlog_verify_iclog( op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); } } - ptr += sizeof(xlog_op_header_t) + op_len; + ptr += sizeof(struct xlog_op_header) + op_len; } } #endif diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index af6daf4f6792..dcc1f44ed68f 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -20,6 +20,43 @@ struct xfs_log_vec { int lv_alloc_size; /* size of allocated lv */ }; +/* Region types for iovec's i_type */ +#define XLOG_REG_TYPE_BFORMAT 1 +#define XLOG_REG_TYPE_BCHUNK 2 +#define XLOG_REG_TYPE_EFI_FORMAT 3 +#define XLOG_REG_TYPE_EFD_FORMAT 4 +#define XLOG_REG_TYPE_IFORMAT 5 +#define XLOG_REG_TYPE_ICORE 6 +#define XLOG_REG_TYPE_IEXT 7 +#define XLOG_REG_TYPE_IBROOT 8 +#define XLOG_REG_TYPE_ILOCAL 9 +#define XLOG_REG_TYPE_IATTR_EXT 10 +#define XLOG_REG_TYPE_IATTR_BROOT 11 +#define XLOG_REG_TYPE_IATTR_LOCAL 12 +#define XLOG_REG_TYPE_QFORMAT 13 +#define XLOG_REG_TYPE_DQUOT 14 +#define XLOG_REG_TYPE_QUOTAOFF 15 +#define XLOG_REG_TYPE_LRHEADER 16 +#define XLOG_REG_TYPE_UNMOUNT 17 +#define XLOG_REG_TYPE_COMMIT 18 +#define XLOG_REG_TYPE_TRANSHDR 19 +#define XLOG_REG_TYPE_ICREATE 20 +#define XLOG_REG_TYPE_RUI_FORMAT 21 +#define XLOG_REG_TYPE_RUD_FORMAT 22 +#define XLOG_REG_TYPE_CUI_FORMAT 23 +#define XLOG_REG_TYPE_CUD_FORMAT 24 +#define XLOG_REG_TYPE_BUI_FORMAT 25 +#define XLOG_REG_TYPE_BUD_FORMAT 26 +#define XLOG_REG_TYPE_ATTRI_FORMAT 27 +#define XLOG_REG_TYPE_ATTRD_FORMAT 28 +#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_VALUE 30 +#define XLOG_REG_TYPE_XMI_FORMAT 31 +#define XLOG_REG_TYPE_XMD_FORMAT 32 +#define XLOG_REG_TYPE_ATTR_NEWNAME 33 +#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 +#define XLOG_REG_TYPE_MAX 34 + #define XFS_LOG_VEC_ORDERED (-1) /* diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index a9a7a271c15b..0cfc654d8e87 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -499,8 +499,8 @@ xlog_recover_finish( extern void xlog_recover_cancel(struct xlog *); -extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, - char *dp, int size); +__le32 xlog_cksum(struct xlog *log, struct 
xlog_rec_header *rhead, + char *dp, unsigned int hdrsize, unsigned int size); extern struct kmem_cache *xfs_log_ticket_cache; struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e6ed9e09c027..549d60959aee 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2894,20 +2894,34 @@ xlog_recover_process( int pass, struct list_head *buffer_list) { - __le32 old_crc = rhead->h_crc; - __le32 crc; + __le32 expected_crc = rhead->h_crc, crc, other_crc; - crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE, + be32_to_cpu(rhead->h_len)); + + /* + * Look at the end of the struct xlog_rec_header definition in + * xfs_log_format.h for the gory details. + */ + if (expected_crc && crc != expected_crc) { + other_crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE_OTHER, + be32_to_cpu(rhead->h_len)); + if (other_crc == expected_crc) { + xfs_notice_once(log->l_mp, + "Fixing up incorrect CRC due to padding."); + crc = other_crc; + } + } /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always - * sets old_crc to 0 so we must consider this valid even on v5 supers. - * Otherwise, return EFSBADCRC on failure so the callers up the stack - * know precisely what failed. + * sets expected_crc to 0 so we must consider this valid even on v5 + * supers. Otherwise, return EFSBADCRC on failure so the callers up the + * stack know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { - if (old_crc && crc != old_crc) + if (expected_crc && crc != expected_crc) return -EFSBADCRC; return 0; } @@ -2918,11 +2932,11 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. */ - if (crc != old_crc) { - if (old_crc || xfs_has_crc(log->l_mp)) { + if (crc != expected_crc) { + if (expected_crc || xfs_has_crc(log->l_mp)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", - le32_to_cpu(old_crc), + le32_to_cpu(expected_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index dc32c5e34d81..0953f6ae94ab 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1057,19 +1057,6 @@ xfs_mountfs( xfs_inodegc_start(mp); xfs_blockgc_start(mp); - /* - * Now that we've recovered any pending superblock feature bit - * additions, we can finish setting up the attr2 behaviour for the - * mount. The noattr2 option overrides the superblock flag, so only - * check the superblock feature flag if the mount option is not set. - */ - if (xfs_has_noattr2(mp)) { - mp->m_features &= ~XFS_FEAT_ATTR2; - } else if (!xfs_has_attr2(mp) && - (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) { - mp->m_features |= XFS_FEAT_ATTR2; - } - if (xfs_has_metadir(mp)) { error = xfs_mount_setup_metadir(mp); if (error) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 97de44c32272..f046d1215b04 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -363,7 +363,6 @@ typedef struct xfs_mount { #define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */ #define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens.
*/ #define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */ -#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */ #define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */ #define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */ #define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */ @@ -386,7 +385,6 @@ typedef struct xfs_mount { /* Mount features */ #define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */ -#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ #define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred @@ -396,7 +394,6 @@ typedef struct xfs_mount { #define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */ #define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */ #define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */ -#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/ #define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */ #define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */ #define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */ @@ -504,12 +501,17 @@ __XFS_HAS_V4_FEAT(align, ALIGN) __XFS_HAS_V4_FEAT(logv2, LOGV2) __XFS_HAS_V4_FEAT(extflg, EXTFLG) __XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT) -__XFS_ADD_V4_FEAT(attr2, ATTR2) __XFS_ADD_V4_FEAT(projid32, PROJID32) __XFS_HAS_V4_FEAT(v3inodes, V3INODES) __XFS_HAS_V4_FEAT(crc, CRC) __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) +static inline void xfs_add_attr2(struct xfs_mount *mp) +{ + if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) + xfs_sb_version_addattr2(&mp->m_sb); +} + /* * Mount features * @@ -517,7 +519,6 @@ __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) * bit inodes and read-only state, are kept as operational state rather than * features. */ -__XFS_HAS_FEAT(noattr2, NOATTR2) __XFS_HAS_FEAT(noalign, NOALIGN) __XFS_HAS_FEAT(allocsize, ALLOCSIZE) __XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE) @@ -526,7 +527,6 @@ __XFS_HAS_FEAT(dirsync, DIRSYNC) __XFS_HAS_FEAT(discard, DISCARD) __XFS_HAS_FEAT(grpid, GRPID) __XFS_HAS_FEAT(small_inums, SMALL_INUMS) -__XFS_HAS_FEAT(ikeep, IKEEP) __XFS_HAS_FEAT(swalloc, SWALLOC) __XFS_HAS_FEAT(filestreams, FILESTREAMS) __XFS_HAS_FEAT(dax_always, DAX_ALWAYS) diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 866c71d9fbae..73b7e72944e4 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -293,7 +293,8 @@ int xfs_mru_cache_init(void) { xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", - XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1); + XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU), + 1); if (!xfs_mru_reap_wq) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index fbeddcac4792..b17672889942 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -165,7 +165,7 @@ xfs_dax_translate_range( uint64_t *bblen) { u64 dev_start = btp->bt_dax_part_off; - u64 dev_len = bdev_nr_bytes(btp->bt_bdev); + u64 dev_len = BBTOB(btp->bt_nr_sectors); u64 dev_end = dev_start + dev_len - 1; /* Notify failure on the whole device. 
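/*
 * Editor's aside: bt_nr_sectors counts 512-byte basic blocks, so
 * BBTOB() converts it to the byte length used in the hunk above.
 * Clamping to the filesystem size keeps failure notification from
 * ranging over parts of a larger block device that XFS does not own.
 */
static inline u64 xfs_sketch_buftarg_bytes(const struct xfs_buftarg *btp)
{
        return BBTOB(btp->bt_nr_sectors);       /* basic blocks to bytes */
}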
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 866c71d9fbae..73b7e72944e4 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -293,7 +293,8 @@ int
 xfs_mru_cache_init(void)
 {
 	xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
-			XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1);
+			XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU),
+			1);
 	if (!xfs_mru_reap_wq)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index fbeddcac4792..b17672889942 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -165,7 +165,7 @@ xfs_dax_translate_range(
 	uint64_t		*bblen)
 {
 	u64			dev_start = btp->bt_dax_part_off;
-	u64			dev_len = bdev_nr_bytes(btp->bt_bdev);
+	u64			dev_len = BBTOB(btp->bt_nr_sectors);
 	u64			dev_end = dev_start + dev_len - 1;
 
 	/* Notify failure on the whole device. */
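The xfs_notify_failure.c hunk derives the device length from the buftarg's tracked sector count via BBTOB() instead of querying the block device. XFS basic blocks are fixed 512-byte units; a small stand-alone illustration of the conversion (device size hypothetical):

#include <stdint.h>
#include <stdio.h>

/* XFS basic blocks are always 512 bytes; BBTOB()/BTOBB() shift by 9. */
#define BBSHIFT		9
#define BBTOB(bbs)	((uint64_t)(bbs) << BBSHIFT)
#define BTOBB(bytes)	(((uint64_t)(bytes) + (1u << BBSHIFT) - 1) >> BBSHIFT)

int main(void)
{
	uint64_t nr_sectors = 2097152;	/* hypothetical 1 GiB device */

	printf("%llu sectors = %llu bytes\n",
	       (unsigned long long)nr_sectors,
	       (unsigned long long)BBTOB(nr_sectors));
	printf("4096 bytes = %llu sectors\n",
	       (unsigned long long)BTOBB(4096));
	return 0;
}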
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bb0a82635a77..e85a156dc17d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -105,8 +105,8 @@ enum {
 	Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_wsync,
 	Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
 	Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
-	Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep,
-	Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2,
+	Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32,
+	Opt_largeio, Opt_nolargeio,
 	Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
 	Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
 	Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
@@ -133,12 +133,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
 	fsparam_flag("norecovery",	Opt_norecovery),
 	fsparam_flag("inode64",		Opt_inode64),
 	fsparam_flag("inode32",		Opt_inode32),
-	fsparam_flag("ikeep",		Opt_ikeep),
-	fsparam_flag("noikeep",		Opt_noikeep),
 	fsparam_flag("largeio",		Opt_largeio),
 	fsparam_flag("nolargeio",	Opt_nolargeio),
-	fsparam_flag("attr2",		Opt_attr2),
-	fsparam_flag("noattr2",		Opt_noattr2),
 	fsparam_flag("filestreams",	Opt_filestreams),
 	fsparam_flag("quota",		Opt_quota),
 	fsparam_flag("noquota",		Opt_noquota),
@@ -175,13 +171,11 @@ xfs_fs_show_options(
 {
 	static struct proc_xfs_info xfs_info_set[] = {
 		/* the few simple ones we can get from the mount struct */
-		{ XFS_FEAT_IKEEP,		",ikeep" },
 		{ XFS_FEAT_WSYNC,		",wsync" },
 		{ XFS_FEAT_NOALIGN,		",noalign" },
 		{ XFS_FEAT_SWALLOC,		",swalloc" },
 		{ XFS_FEAT_NOUUID,		",nouuid" },
 		{ XFS_FEAT_NORECOVERY,		",norecovery" },
-		{ XFS_FEAT_ATTR2,		",attr2" },
 		{ XFS_FEAT_FILESTREAMS,		",filestreams" },
 		{ XFS_FEAT_GRPID,		",grpid" },
 		{ XFS_FEAT_DISCARD,		",discard" },
@@ -541,7 +535,8 @@ xfs_setup_devices(
 {
 	int			error;
 
-	error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
+	error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize,
+			mp->m_sb.sb_dblocks);
 	if (error)
 		return error;
 
@@ -551,7 +546,7 @@ xfs_setup_devices(
 		if (xfs_has_sector(mp))
 			log_sector_size = mp->m_sb.sb_logsectsize;
 		error = xfs_configure_buftarg(mp->m_logdev_targp,
-				log_sector_size);
+				log_sector_size, mp->m_sb.sb_logblocks);
 		if (error)
 			return error;
 	}
@@ -565,7 +560,7 @@ xfs_setup_devices(
 		mp->m_rtdev_targp = mp->m_ddev_targp;
 	} else if (mp->m_rtname) {
 		error = xfs_configure_buftarg(mp->m_rtdev_targp,
-				mp->m_sb.sb_sectsize);
+				mp->m_sb.sb_sectsize, mp->m_sb.sb_rblocks);
 		if (error)
 			return error;
 	}
@@ -578,19 +573,19 @@ xfs_init_mount_workqueues(
 	struct xfs_mount	*mp)
 {
 	mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
-			XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+			XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
 			1, mp->m_super->s_id);
 	if (!mp->m_buf_workqueue)
 		goto out;
 
 	mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
-			XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+			XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
 			0, mp->m_super->s_id);
 	if (!mp->m_unwritten_workqueue)
 		goto out_destroy_buf;
 
 	mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
-			XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+			XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
 			0, mp->m_super->s_id);
 	if (!mp->m_reclaim_workqueue)
 		goto out_destroy_unwritten;
@@ -602,13 +597,14 @@ xfs_init_mount_workqueues(
 		goto out_destroy_reclaim;
 
 	mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s",
-			XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+			XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
 			1, mp->m_super->s_id);
 	if (!mp->m_inodegc_wq)
 		goto out_destroy_blockgc;
 
 	mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s",
-			XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id);
+			XFS_WQFLAGS(WQ_FREEZABLE | WQ_PERCPU), 0,
+			mp->m_super->s_id);
 	if (!mp->m_sync_workqueue)
 		goto out_destroy_inodegc;
 
@@ -778,7 +774,7 @@ xfs_fs_drop_inode(
 		return 0;
 	}
 
-	return generic_drop_inode(inode);
+	return inode_generic_drop(inode);
 }
 
 STATIC void
@@ -1088,15 +1084,6 @@ xfs_finish_flags(
 	}
 
 	/*
-	 * V5 filesystems always use attr2 format for attributes.
-	 */
-	if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) {
-		xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. "
-			     "attr2 is always enabled for V5 filesystems.");
-		return -EINVAL;
-	}
-
-	/*
 	 * prohibit r/w mounts of read-only filesystems
 	 */
 	if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !xfs_is_readonly(mp)) {
@@ -1542,22 +1529,6 @@ xfs_fs_parse_param(
 		return 0;
#endif
 	/* Following mount options will be removed in September 2025 */
-	case Opt_ikeep:
-		xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true);
-		parsing_mp->m_features |= XFS_FEAT_IKEEP;
-		return 0;
-	case Opt_noikeep:
-		xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false);
-		parsing_mp->m_features &= ~XFS_FEAT_IKEEP;
-		return 0;
-	case Opt_attr2:
-		xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true);
-		parsing_mp->m_features |= XFS_FEAT_ATTR2;
-		return 0;
-	case Opt_noattr2:
-		xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
-		parsing_mp->m_features |= XFS_FEAT_NOATTR2;
-		return 0;
 	case Opt_max_open_zones:
 		parsing_mp->m_max_open_zones = result.uint_32;
 		return 0;
@@ -1593,16 +1564,6 @@ xfs_fs_validate_params(
 		return -EINVAL;
 	}
 
-	/*
-	 * We have not read the superblock at this point, so only the attr2
-	 * mount option can set the attr2 feature by this stage.
-	 */
-	if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) {
-		xfs_warn(mp, "attr2 and noattr2 cannot both be specified.");
-		return -EINVAL;
-	}
-
-
 	if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) {
 		xfs_warn(mp,
 	"sunit and swidth options incompatible with the noalign option");
@@ -2177,21 +2138,6 @@ xfs_fs_reconfigure(
 	if (error)
 		return error;
 
-	/* attr2 -> noattr2 */
-	if (xfs_has_noattr2(new_mp)) {
-		if (xfs_has_crc(mp)) {
-			xfs_warn(mp,
-	"attr2 is always enabled for a V5 filesystem - can't be changed.");
-			return -EINVAL;
-		}
-		mp->m_features &= ~XFS_FEAT_ATTR2;
-		mp->m_features |= XFS_FEAT_NOATTR2;
-	} else if (xfs_has_attr2(new_mp)) {
-		/* noattr2 -> attr2 */
-		mp->m_features &= ~XFS_FEAT_NOATTR2;
-		mp->m_features |= XFS_FEAT_ATTR2;
-	}
-
 	/* Validate new max_atomic_write option before making other changes */
 	if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
 		error = xfs_set_max_atomic_write_opt(mp,
@@ -2596,8 +2542,8 @@ xfs_init_workqueues(void)
 	 * AGs in all the filesystems mounted. Hence use the default large
 	 * max_active value for this workqueue.
 	 */
-	xfs_alloc_wq = alloc_workqueue("xfsalloc",
-			XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0);
+	xfs_alloc_wq = alloc_workqueue("xfsalloc", XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU),
+			0);
 	if (!xfs_alloc_wq)
 		return -ENOMEM;
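The xfs_info_set hunk above trims a table that drives xfs_fs_show_options(): each feature bit maps to a ",option" string emitted when the bit is set. A stand-alone sketch of that table-driven pattern, with made-up feature bits:

#include <stdio.h>
#include <stdint.h>

struct proc_info {
	uint64_t	flag;
	const char	*str;
};

#define FEAT_WSYNC	(1ULL << 0)	/* hypothetical feature bits */
#define FEAT_DISCARD	(1ULL << 1)

static const struct proc_info info_set[] = {
	{ FEAT_WSYNC,	",wsync" },
	{ FEAT_DISCARD,	",discard" },
	{ 0, NULL }	/* table terminator */
};

/* Append the option string for every feature bit that is set. */
static void show_options(uint64_t features)
{
	const struct proc_info *p;

	for (p = info_set; p->flag; p++)
		if (features & p->flag)
			fputs(p->str, stdout);
	putchar('\n');
}

int main(void)
{
	show_options(FEAT_WSYNC | FEAT_DISCARD);	/* prints ",wsync,discard" */
	return 0;
}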
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 751dc74a3067..9918f14b4874 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -50,7 +50,7 @@ xfs_panic_mask_proc_handler(
 }
#endif /* CONFIG_PROC_FS */
 
-STATIC int
+static inline int
 xfs_deprecated_dointvec_minmax(
 	const struct ctl_table	*ctl,
 	int			write,
@@ -68,24 +68,6 @@ xfs_deprecated_dointvec_minmax(
 
 static const struct ctl_table xfs_table[] = {
 	{
-		.procname	= "irix_sgid_inherit",
-		.data		= &xfs_params.sgid_inherit.val,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= xfs_deprecated_dointvec_minmax,
-		.extra1		= &xfs_params.sgid_inherit.min,
-		.extra2		= &xfs_params.sgid_inherit.max
-	},
-	{
-		.procname	= "irix_symlink_mode",
-		.data		= &xfs_params.symlink_mode.val,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= xfs_deprecated_dointvec_minmax,
-		.extra1		= &xfs_params.symlink_mode.min,
-		.extra2		= &xfs_params.symlink_mode.max
-	},
-	{
 		.procname	= "panic_mask",
 		.data		= &xfs_params.panic_mask.val,
 		.maxlen		= sizeof(int),
@@ -185,15 +167,6 @@ static const struct ctl_table xfs_table[] = {
 		.extra1	= &xfs_params.blockgc_timer.min,
 		.extra2	= &xfs_params.blockgc_timer.max,
 	},
-	{
-		.procname	= "speculative_cow_prealloc_lifetime",
-		.data		= &xfs_params.blockgc_timer.val,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= xfs_deprecated_dointvec_minmax,
-		.extra1		= &xfs_params.blockgc_timer.min,
-		.extra2		= &xfs_params.blockgc_timer.max,
-	},
 	/* please keep this the last entry */
#ifdef CONFIG_PROC_FS
 	{
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 51646f066c4f..ed9d896079c1 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -19,9 +19,6 @@ typedef struct xfs_sysctl_val {
 } xfs_sysctl_val_t;
 
 typedef struct xfs_param {
-	xfs_sysctl_val_t sgid_inherit;	/* Inherit S_ISGID if process' GID is
-					 * not a member of parent dir GID. */
-	xfs_sysctl_val_t symlink_mode;	/* Link creat mode affected by umask */
 	xfs_sysctl_val_t panic_mask;	/* bitmask to cause panic on errors. */
 	xfs_sysctl_val_t error_level;	/* Degree of reporting for problems */
 	xfs_sysctl_val_t syncd_timer;	/* Interval between xfssyncd wakeups */
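xfs_deprecated_dointvec_minmax() wraps the normal integer handler for sysctls on their way out. A kernel-style sketch (not compilable on its own) of that wrapper pattern; the warn-once behaviour here is an assumption for illustration, not a copy of the XFS implementation:

/* Warn on writes, then defer to the regular handler so old tooling
 * keeps working during the deprecation window. */
static int deprecated_dointvec_minmax(const struct ctl_table *ctl, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	if (write)
		pr_warn_once("sysctl %s is deprecated and will be removed\n",
			     ctl->procname);
	return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
}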
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ac344e42846c..79b8641880ab 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1152,7 +1152,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
 	TP_fast_assign(
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
 		__entry->ino = ip->i_ino;
-		__entry->count = atomic_read(&VFS_I(ip)->i_count);
+		__entry->count = icount_read(VFS_I(ip));
 		__entry->pincount = atomic_read(&ip->i_pincount);
 		__entry->iflags = ip->i_flags;
 		__entry->caller_ip = caller_ip;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 575e7028f423..474f5a04ec63 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -452,19 +452,17 @@ xfs_trans_mod_sb(
  */
 STATIC void
 xfs_trans_apply_sb_deltas(
-	xfs_trans_t		*tp)
+	struct xfs_trans	*tp)
 {
-	struct xfs_dsb	*sbp;
-	struct xfs_buf	*bp;
-	int		whole = 0;
-
-	bp = xfs_trans_getsb(tp);
-	sbp = bp->b_addr;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*bp = xfs_trans_getsb(tp);
+	struct xfs_dsb		*sbp = bp->b_addr;
+	int			whole = 0;
 
 	/*
 	 * Only update the superblock counters if we are logging them
 	 */
-	if (!xfs_has_lazysbcount((tp->t_mountp))) {
+	if (!xfs_has_lazysbcount(mp)) {
 		if (tp->t_icount_delta)
 			be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta);
 		if (tp->t_ifree_delta)
@@ -491,8 +489,7 @@ xfs_trans_apply_sb_deltas(
 	 * write the correct value ondisk.
 	 */
 	if ((tp->t_frextents_delta || tp->t_res_frextents_delta) &&
-	    !xfs_has_rtgroups(tp->t_mountp)) {
-		struct xfs_mount	*mp = tp->t_mountp;
+	    !xfs_has_rtgroups(mp)) {
 		int64_t	rtxdelta;
 
 		rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
@@ -505,6 +502,8 @@ xfs_trans_apply_sb_deltas(
 
 	if (tp->t_dblocks_delta) {
 		be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
+		mp->m_ddev_targp->bt_nr_sectors +=
+			XFS_FSB_TO_BB(mp, tp->t_dblocks_delta);
 		whole = 1;
 	}
 	if (tp->t_agcount_delta) {
@@ -524,7 +523,7 @@ xfs_trans_apply_sb_deltas(
 	 * recompute the ondisk rtgroup block log. The incore values
 	 * will be recomputed in xfs_trans_unreserve_and_mod_sb.
 	 */
-	if (xfs_has_rtgroups(tp->t_mountp)) {
+	if (xfs_has_rtgroups(mp)) {
 		sbp->sb_rgblklog = xfs_compute_rgblklog(
 			be32_to_cpu(sbp->sb_rgextents),
 			be32_to_cpu(sbp->sb_rextsize));
@@ -537,6 +536,8 @@ xfs_trans_apply_sb_deltas(
 	}
 	if (tp->t_rblocks_delta) {
 		be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta);
+		mp->m_rtdev_targp->bt_nr_sectors +=
+			XFS_FSB_TO_BB(mp, tp->t_rblocks_delta);
 		whole = 1;
 	}
 	if (tp->t_rextents_delta) {
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 67c328d23e4a..38983c6777df 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -374,7 +374,7 @@ xfsaild_push_item(
 	 * If log item pinning is enabled, skip the push and track the item as
 	 * pinned. This can help induce head-behind-tail conditions.
 	 */
-	if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
+	if (XFS_TEST_ERROR(ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
 		return XFS_ITEM_PINNED;
 
 	/*
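The xfs_trans_apply_sb_deltas() hunk above applies deltas to big-endian on-disk counters with be64_add_cpu(), which round-trips through CPU byte order. A stand-alone sketch of those semantics (helper names invented):

#include <stdint.h>
#include <stdio.h>

/* Interpret the in-memory bytes as big-endian, regardless of host order. */
static uint64_t be_to_cpu64(uint64_t v)
{
	const unsigned char *p = (const unsigned char *)&v;
	uint64_t r = 0;

	for (int i = 0; i < 8; i++)
		r = (r << 8) | p[i];
	return r;
}

/* Store a host-order value as big-endian bytes. */
static uint64_t cpu_to_be64v(uint64_t v)
{
	uint64_t r;
	unsigned char *p = (unsigned char *)&r;

	for (int i = 7; i >= 0; i--) {
		p[i] = v & 0xff;
		v >>= 8;
	}
	return r;
}

/* Sketch of be64_add_cpu(): read-modify-write a big-endian field. */
static void be64_add(uint64_t *field, int64_t delta)
{
	*field = cpu_to_be64v(be_to_cpu64(*field) + delta);
}

int main(void)
{
	uint64_t sb_dblocks = cpu_to_be64v(1000);	/* hypothetical counter */

	be64_add(&sb_dblocks, 24);
	printf("%llu\n", (unsigned long long)be_to_cpu64(sb_dblocks)); /* 1024 */
	return 0;
}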
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index f28214c28ab5..1147bacb2da8 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -493,64 +493,58 @@ xfs_try_open_zone(
 	return oz;
 }
 
+enum xfs_zone_alloc_score {
+	/* Any open zone will do, we're desperate */
+	XFS_ZONE_ALLOC_ANY	= 0,
+
+	/* It better fit somehow */
+	XFS_ZONE_ALLOC_OK	= 1,
+
+	/* Only reuse a zone if it fits really well. */
+	XFS_ZONE_ALLOC_GOOD	= 2,
+};
+
 /*
- * For data with short or medium lifetime, try to colocated it into an
- * already open zone with a matching temperature.
+ * Lifetime hint co-location matrix. Fields not set default to 0,
+ * aka XFS_ZONE_ALLOC_ANY.
  */
-static bool
-xfs_colocate_eagerly(
-	enum rw_hint		file_hint)
-{
-	switch (file_hint) {
-	case WRITE_LIFE_MEDIUM:
-	case WRITE_LIFE_SHORT:
-	case WRITE_LIFE_NONE:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static bool
-xfs_good_hint_match(
-	struct xfs_open_zone	*oz,
-	enum rw_hint		file_hint)
-{
-	switch (oz->oz_write_hint) {
-	case WRITE_LIFE_LONG:
-	case WRITE_LIFE_EXTREME:
-		/* colocate long and extreme */
-		if (file_hint == WRITE_LIFE_LONG ||
-		    file_hint == WRITE_LIFE_EXTREME)
-			return true;
-		break;
-	case WRITE_LIFE_MEDIUM:
-		/* colocate medium with medium */
-		if (file_hint == WRITE_LIFE_MEDIUM)
-			return true;
-		break;
-	case WRITE_LIFE_SHORT:
-	case WRITE_LIFE_NONE:
-	case WRITE_LIFE_NOT_SET:
-		/* colocate short and none */
-		if (file_hint <= WRITE_LIFE_SHORT)
-			return true;
-		break;
-	}
-	return false;
-}
+static const unsigned int
+xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = {
+	[WRITE_LIFE_NOT_SET] = {
+		[WRITE_LIFE_NOT_SET]	= XFS_ZONE_ALLOC_OK,
+	},
+	[WRITE_LIFE_NONE] = {
+		[WRITE_LIFE_NONE]	= XFS_ZONE_ALLOC_OK,
+	},
+	[WRITE_LIFE_SHORT] = {
+		[WRITE_LIFE_SHORT]	= XFS_ZONE_ALLOC_GOOD,
+	},
+	[WRITE_LIFE_MEDIUM] = {
+		[WRITE_LIFE_MEDIUM]	= XFS_ZONE_ALLOC_GOOD,
+	},
+	[WRITE_LIFE_LONG] = {
+		[WRITE_LIFE_LONG]	= XFS_ZONE_ALLOC_OK,
+		[WRITE_LIFE_EXTREME]	= XFS_ZONE_ALLOC_OK,
+	},
+	[WRITE_LIFE_EXTREME] = {
+		[WRITE_LIFE_LONG]	= XFS_ZONE_ALLOC_OK,
+		[WRITE_LIFE_EXTREME]	= XFS_ZONE_ALLOC_OK,
+	},
+};
 
 static bool
 xfs_try_use_zone(
 	struct xfs_zone_info	*zi,
 	enum rw_hint		file_hint,
 	struct xfs_open_zone	*oz,
-	bool			lowspace)
+	unsigned int		goodness)
 {
 	if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
 		return false;
-	if (!lowspace && !xfs_good_hint_match(oz, file_hint))
+
+	if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness)
 		return false;
+
 	if (!atomic_inc_not_zero(&oz->oz_ref))
 		return false;
@@ -581,14 +575,14 @@ static struct xfs_open_zone *
 xfs_select_open_zone_lru(
 	struct xfs_zone_info	*zi,
 	enum rw_hint		file_hint,
-	bool			lowspace)
+	unsigned int		goodness)
 {
 	struct xfs_open_zone	*oz;
 
 	lockdep_assert_held(&zi->zi_open_zones_lock);
 
 	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
-		if (xfs_try_use_zone(zi, file_hint, oz, lowspace))
+		if (xfs_try_use_zone(zi, file_hint, oz, goodness))
 			return oz;
 
 	cond_resched_lock(&zi->zi_open_zones_lock);
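The switch statements are replaced by a two-dimensional score matrix indexed by zone hint and file hint, with omitted entries defaulting to the weakest score. A stand-alone sketch of the lookup, using made-up hint names:

#include <stdio.h>

/* Hypothetical lifetime hints and co-location scores. */
enum hint { H_NOT_SET, H_SHORT, H_LONG, H_NR };
enum score { SCORE_ANY = 0, SCORE_OK = 1, SCORE_GOOD = 2 };

/* Entries left out default to 0 (SCORE_ANY), just like the designated
 * initializers in xfs_zoned_hint_score. */
static const unsigned int hint_score[H_NR][H_NR] = {
	[H_NOT_SET]	= { [H_NOT_SET]	= SCORE_OK },
	[H_SHORT]	= { [H_SHORT]	= SCORE_GOOD },
	[H_LONG]	= { [H_LONG]	= SCORE_OK },
};

/* A zone may be reused only if the pairing scores at least 'goodness'. */
static int may_use(enum hint zone, enum hint file, unsigned int goodness)
{
	return hint_score[zone][file] >= goodness;
}

int main(void)
{
	printf("%d\n", may_use(H_SHORT, H_SHORT, SCORE_GOOD));	/* 1 */
	printf("%d\n", may_use(H_SHORT, H_LONG, SCORE_OK));	/* 0 */
	printf("%d\n", may_use(H_SHORT, H_LONG, SCORE_ANY));	/* 1 */
	return 0;
}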
@@ -651,9 +645,11 @@ xfs_select_zone_nowait(
 	 * data.
 	 */
 	spin_lock(&zi->zi_open_zones_lock);
-	if (xfs_colocate_eagerly(write_hint))
-		oz = xfs_select_open_zone_lru(zi, write_hint, false);
-	else if (pack_tight)
+	oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD);
+	if (oz)
+		goto out_unlock;
+
+	if (pack_tight)
 		oz = xfs_select_open_zone_mru(zi, write_hint);
 	if (oz)
 		goto out_unlock;
@@ -667,16 +663,16 @@ xfs_select_zone_nowait(
 		goto out_unlock;
 
 	/*
-	 * Try to colocate cold data with other cold data if we failed to open a
-	 * new zone for it.
+	 * Try to find a zone that is an ok match to colocate data with.
+	 */
+	oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK);
+	if (oz)
+		goto out_unlock;
+
+	/*
+	 * Pick the least recently used zone, regardless of hint match.
 	 */
-	if (write_hint != WRITE_LIFE_NOT_SET &&
-	    !xfs_colocate_eagerly(write_hint))
-		oz = xfs_select_open_zone_lru(zi, write_hint, false);
-	if (!oz)
-		oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false);
-	if (!oz)
-		oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true);
+	oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY);
 out_unlock:
 	spin_unlock(&zi->zi_open_zones_lock);
 	return oz;
@@ -1135,7 +1131,7 @@ xfs_calc_open_zones(
 		if (bdev_open_zones)
 			mp->m_max_open_zones = bdev_open_zones;
 		else
-			mp->m_max_open_zones = xfs_max_open_zones(mp);
+			mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES;
 	}
 
 	if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
@@ -1248,7 +1244,7 @@ xfs_mount_zones(
 	if (!mp->m_zone_info)
 		return -ENOMEM;
 
-	xfs_info(mp, "%u zones of %u blocks size (%u max open)",
+	xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
 		 mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
 		 mp->m_max_open_zones);
 	trace_xfs_zones_mount(mp);
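Together these hunks reduce zone selection to a single fallback ladder: a GOOD co-location match, then a freshly opened zone, then an OK match, then any open zone at all. A sketch of that tiered-retry shape, with hypothetical selection primitives:

#include <stdio.h>
#include <stddef.h>

enum score { SCORE_ANY, SCORE_OK, SCORE_GOOD };

struct zone { int id; };

/* Hypothetical stand-ins for the real selection primitives; here they
 * simulate "no GOOD match, no free zone, but an OK match exists". */
static struct zone ok_zone = { 42 };

static struct zone *select_lru(int hint, unsigned int goodness)
{
	(void)hint;
	return goodness <= SCORE_OK ? &ok_zone : NULL;
}

static struct zone *open_new_zone(void)
{
	return NULL;	/* device is at its open-zone limit */
}

static struct zone *select_zone(int hint)
{
	struct zone *z;

	if ((z = select_lru(hint, SCORE_GOOD)))	/* 1. strong match */
		return z;
	if ((z = open_new_zone()))		/* 2. fresh zone */
		return z;
	if ((z = select_lru(hint, SCORE_OK)))	/* 3. acceptable match */
		return z;
	return select_lru(hint, SCORE_ANY);	/* 4. any open zone */
}

int main(void)
{
	struct zone *z = select_zone(0);

	printf("picked zone %d\n", z ? z->id : -1);	/* picked zone 42 */
	return 0;
}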
