aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/binfmt_elf.c34
-rw-r--r--fs/binfmt_em86.c3
-rw-r--r--fs/btrfs/ctree.h15
-rw-r--r--fs/btrfs/delayed-inode.c68
-rw-r--r--fs/btrfs/extent-tree.c731
-rw-r--r--fs/btrfs/file.c16
-rw-r--r--fs/btrfs/inode.c7
-rw-r--r--fs/btrfs/relocation.c45
-rw-r--r--fs/ceph/addr.c77
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c873
-rw-r--r--fs/ceph/dir.c73
-rw-r--r--fs/ceph/file.c77
-rw-r--r--fs/ceph/inode.c40
-rw-r--r--fs/ceph/ioctl.c30
-rw-r--r--fs/ceph/mds_client.c358
-rw-r--r--fs/ceph/mds_client.h19
-rw-r--r--fs/ceph/snap.c10
-rw-r--r--fs/ceph/super.c43
-rw-r--r--fs/ceph/super.h48
-rw-r--r--fs/ceph/xattr.c101
-rw-r--r--fs/exec.c9
-rw-r--r--fs/inode.c2
-rw-r--r--fs/nfs/Makefile2
-rw-r--r--fs/nfs/blocklayout/dev.c110
-rw-r--r--fs/nfs/blocklayout/extent_tree.c27
-rw-r--r--fs/nfs/callback_proc.c64
-rw-r--r--fs/nfs/callback_xdr.c6
-rw-r--r--fs/nfs/client.c22
-rw-r--r--fs/nfs/dir.c52
-rw-r--r--fs/nfs/direct.c93
-rw-r--r--fs/nfs/file.c100
-rw-r--r--fs/nfs/filelayout/filelayout.c18
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c23
-rw-r--r--fs/nfs/inode.c138
-rw-r--r--fs/nfs/internal.h62
-rw-r--r--fs/nfs/io.c147
-rw-r--r--fs/nfs/nfs3client.c14
-rw-r--r--fs/nfs/nfs42proc.c24
-rw-r--r--fs/nfs/nfs42xdr.c12
-rw-r--r--fs/nfs/nfs4_fs.h1
-rw-r--r--fs/nfs/nfs4client.c26
-rw-r--r--fs/nfs/nfs4file.c16
-rw-r--r--fs/nfs/nfs4proc.c153
-rw-r--r--fs/nfs/nfs4xdr.c11
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/pnfs.c191
-rw-r--r--fs/nfs/pnfs.h34
-rw-r--r--fs/nfs/pnfs_nfs.c13
-rw-r--r--fs/nfs/super.c14
-rw-r--r--fs/nfs/write.c44
-rw-r--r--fs/nilfs2/alloc.c45
-rw-r--r--fs/nilfs2/bmap.c4
-rw-r--r--fs/nilfs2/bmap.h2
-rw-r--r--fs/nilfs2/btnode.c4
-rw-r--r--fs/nilfs2/btree.c61
-rw-r--r--fs/nilfs2/btree.h2
-rw-r--r--fs/nilfs2/cpfile.c23
-rw-r--r--fs/nilfs2/cpfile.h3
-rw-r--r--fs/nilfs2/dat.c19
-rw-r--r--fs/nilfs2/dat.h1
-rw-r--r--fs/nilfs2/dir.c60
-rw-r--r--fs/nilfs2/direct.c10
-rw-r--r--fs/nilfs2/direct.h10
-rw-r--r--fs/nilfs2/gcinode.c9
-rw-r--r--fs/nilfs2/ifile.c7
-rw-r--r--fs/nilfs2/ifile.h1
-rw-r--r--fs/nilfs2/inode.c36
-rw-r--r--fs/nilfs2/ioctl.c48
-rw-r--r--fs/nilfs2/mdt.c6
-rw-r--r--fs/nilfs2/namei.c6
-rw-r--r--fs/nilfs2/nilfs.h48
-rw-r--r--fs/nilfs2/page.c45
-rw-r--r--fs/nilfs2/recovery.c72
-rw-r--r--fs/nilfs2/segbuf.c6
-rw-r--r--fs/nilfs2/segment.c61
-rw-r--r--fs/nilfs2/segment.h1
-rw-r--r--fs/nilfs2/sufile.c44
-rw-r--r--fs/nilfs2/sufile.h1
-rw-r--r--fs/nilfs2/super.c206
-rw-r--r--fs/nilfs2/sysfs.c74
-rw-r--r--fs/nilfs2/the_nilfs.c134
-rw-r--r--fs/nilfs2/the_nilfs.h11
-rw-r--r--fs/ocfs2/alloc.c37
-rw-r--r--fs/ocfs2/alloc.h2
-rw-r--r--fs/ocfs2/aops.c37
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c53
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c29
-rw-r--r--fs/ocfs2/dlm/dlmthread.c57
-rw-r--r--fs/ocfs2/stack_user.c11
-rw-r--r--fs/ocfs2/suballoc.c20
-rw-r--r--fs/orangefs/dcache.c4
-rw-r--r--fs/orangefs/inode.c6
-rw-r--r--fs/orangefs/namei.c12
-rw-r--r--fs/orangefs/orangefs-kernel.h6
-rw-r--r--fs/orangefs/orangefs-mod.c2
-rw-r--r--fs/orangefs/orangefs-sysfs.c43
-rw-r--r--fs/orangefs/orangefs-utils.c38
-rw-r--r--fs/orangefs/protocol.h8
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/base.c7
-rw-r--r--fs/proc/stat.c10
-rw-r--r--fs/reiserfs/ibalance.c3
104 files changed, 3258 insertions, 2289 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a7a28110dc80..7f6aff3f72eb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -605,28 +605,30 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
* Do the same thing for the memory mapping - between
* elf_bss and last_bss is the bss section.
*/
- k = load_addr + eppnt->p_memsz + eppnt->p_vaddr;
+ k = load_addr + eppnt->p_vaddr + eppnt->p_memsz;
if (k > last_bss)
last_bss = k;
}
}
+ /*
+ * Now fill out the bss section: first pad the last page from
+ * the file up to the page boundary, and zero it from elf_bss
+ * up to the end of the page.
+ */
+ if (padzero(elf_bss)) {
+ error = -EFAULT;
+ goto out;
+ }
+ /*
+ * Next, align both the file and mem bss up to the page size,
+ * since this is where elf_bss was just zeroed up to, and where
+ * last_bss will end after the vm_brk() below.
+ */
+ elf_bss = ELF_PAGEALIGN(elf_bss);
+ last_bss = ELF_PAGEALIGN(last_bss);
+ /* Finally, if there is still more bss to allocate, do it. */
if (last_bss > elf_bss) {
- /*
- * Now fill out the bss section. First pad the last page up
- * to the page boundary, and then perform a mmap to make sure
- * that there are zero-mapped pages up to and including the
- * last bss page.
- */
- if (padzero(elf_bss)) {
- error = -EFAULT;
- goto out;
- }
-
- /* What we have mapped so far */
- elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
-
- /* Map the last of the bss segment */
error = vm_brk(elf_bss, last_bss - elf_bss);
if (error)
goto out;
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 490538536cb4..dd2d3f0cd55d 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -24,7 +24,8 @@
static int load_em86(struct linux_binprm *bprm)
{
- char *interp, *i_name, *i_arg;
+ const char *i_name, *i_arg;
+ char *interp;
struct file * file;
int retval;
struct elfhdr elf_ex;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b2620d1f883f..443fcc402114 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -439,6 +439,8 @@ struct btrfs_space_info {
struct list_head list;
/* Protected by the spinlock 'lock'. */
struct list_head ro_bgs;
+ struct list_head priority_tickets;
+ struct list_head tickets;
struct rw_semaphore groups_sem;
/* for block groups in our same type */
@@ -2624,6 +2626,15 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_ALL,
};
+enum btrfs_flush_state {
+ FLUSH_DELAYED_ITEMS_NR = 1,
+ FLUSH_DELAYED_ITEMS = 2,
+ FLUSH_DELALLOC = 3,
+ FLUSH_DELALLOC_WAIT = 4,
+ ALLOC_CHUNK = 5,
+ COMMIT_TRANS = 6,
+};
+
int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
@@ -2661,8 +2672,8 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
- struct btrfs_block_rsv *dst_rsv,
- u64 num_bytes);
+ struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
+ int update_size);
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *dest, u64 num_bytes,
int min_factor);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index d3aaabbfada0..dd3c040139a2 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -553,7 +553,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
dst_rsv = &root->fs_info->delayed_block_rsv;
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
if (!ret) {
trace_btrfs_space_reservation(root->fs_info, "delayed_item",
item->key.objectid,
@@ -598,6 +598,29 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
/*
+ * If our block_rsv is the delalloc block reserve then check and see if
+ * we have our extra reservation for updating the inode. If not fall
+ * through and try to reserve space quickly.
+ *
+ * We used to try and steal from the delalloc block rsv or the global
+ * reserve, but we'd steal a full reservation, which isn't kind. We are
+ * here through delalloc which means we've likely just cowed down close
+ * to the leaf that contains the inode, so we would steal less just
+ * doing the fallback inode update, so if we do end up having to steal
+ * from the global block rsv we hopefully only steal one or two blocks
+ * worth which is less likely to hurt us.
+ */
+ if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
+ release = true;
+ else
+ src_rsv = NULL;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+
+ /*
* btrfs_dirty_inode will update the inode under btrfs_join_transaction
* which doesn't reserve space for speed. This is a problem since we
* still need to reserve space for this update, so try to reserve the
@@ -626,51 +649,10 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes, 1);
}
return ret;
- } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
- spin_lock(&BTRFS_I(inode)->lock);
- if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &BTRFS_I(inode)->runtime_flags)) {
- spin_unlock(&BTRFS_I(inode)->lock);
- release = true;
- goto migrate;
- }
- spin_unlock(&BTRFS_I(inode)->lock);
-
- /* Ok we didn't have space pre-reserved. This shouldn't happen
- * too often but it can happen if we do delalloc to an existing
- * inode which gets dirtied because of the time update, and then
- * isn't touched again until after the transaction commits and
- * then we try to write out the data. First try to be nice and
- * reserve something strictly for us. If not be a pain and try
- * to steal from the delalloc block rsv.
- */
- ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
- BTRFS_RESERVE_NO_FLUSH);
- if (!ret)
- goto out;
-
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
- if (!ret)
- goto out;
-
- if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
- btrfs_debug(root->fs_info,
- "block rsv migrate returned %d", ret);
- WARN_ON(1);
- }
- /*
- * Ok this is a problem, let's just steal from the global rsv
- * since this really shouldn't happen that often.
- */
- ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
- dst_rsv, num_bytes);
- goto out;
}
-migrate:
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
-out:
/*
* Migrate only takes a reservation, it doesn't touch the size of the
* block_rsv. This is to simplify people who don't normally have things
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b480fd555774..e9376b1657e2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -111,6 +111,16 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int reserved);
+static int __reserve_metadata_bytes(struct btrfs_root *root,
+ struct btrfs_space_info *space_info,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush);
+static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes);
+static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -3913,6 +3923,7 @@ static const char *alloc_name(u64 flags)
static int update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
+ u64 bytes_readonly,
struct btrfs_space_info **space_info)
{
struct btrfs_space_info *found;
@@ -3933,8 +3944,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->disk_total += total_bytes * factor;
found->bytes_used += bytes_used;
found->disk_used += bytes_used * factor;
+ found->bytes_readonly += bytes_readonly;
if (total_bytes > 0)
found->full = 0;
+ space_info_add_new_bytes(info, found, total_bytes -
+ bytes_used - bytes_readonly);
spin_unlock(&found->lock);
*space_info = found;
return 0;
@@ -3960,7 +3974,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->disk_used = bytes_used * factor;
found->bytes_pinned = 0;
found->bytes_reserved = 0;
- found->bytes_readonly = 0;
+ found->bytes_readonly = bytes_readonly;
found->bytes_may_use = 0;
found->full = 0;
found->max_extent_size = 0;
@@ -3969,6 +3983,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->flush = 0;
init_waitqueue_head(&found->wait);
INIT_LIST_HEAD(&found->ro_bgs);
+ INIT_LIST_HEAD(&found->tickets);
+ INIT_LIST_HEAD(&found->priority_tickets);
ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
info->space_info_kobj, "%s",
@@ -4470,7 +4486,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
space_info = __find_space_info(extent_root->fs_info, flags);
if (!space_info) {
ret = update_space_info(extent_root->fs_info, flags,
- 0, 0, &space_info);
+ 0, 0, 0, &space_info);
BUG_ON(ret); /* -ENOMEM */
}
BUG_ON(!space_info); /* Logic error */
@@ -4582,12 +4598,19 @@ static int can_overcommit(struct btrfs_root *root,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
- u64 profile = btrfs_get_alloc_profile(root, 0);
+ struct btrfs_block_rsv *global_rsv;
+ u64 profile;
u64 space_size;
u64 avail;
u64 used;
+ /* Don't overcommit when in mixed mode. */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
+ return 0;
+
+ BUG_ON(root->fs_info == NULL);
+ global_rsv = &root->fs_info->global_block_rsv;
+ profile = btrfs_get_alloc_profile(root, 0);
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly;
@@ -4739,6 +4762,11 @@ skip_async:
spin_unlock(&space_info->lock);
break;
}
+ if (list_empty(&space_info->tickets) &&
+ list_empty(&space_info->priority_tickets)) {
+ spin_unlock(&space_info->lock);
+ break;
+ }
spin_unlock(&space_info->lock);
loops++;
@@ -4807,13 +4835,11 @@ commit:
return btrfs_commit_transaction(trans, root);
}
-enum flush_state {
- FLUSH_DELAYED_ITEMS_NR = 1,
- FLUSH_DELAYED_ITEMS = 2,
- FLUSH_DELALLOC = 3,
- FLUSH_DELALLOC_WAIT = 4,
- ALLOC_CHUNK = 5,
- COMMIT_TRANS = 6,
+struct reserve_ticket {
+ u64 bytes;
+ int error;
+ struct list_head list;
+ wait_queue_head_t wait;
};
static int flush_space(struct btrfs_root *root,
@@ -4866,6 +4892,8 @@ static int flush_space(struct btrfs_root *root,
break;
}
+ trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes,
+ orig_bytes, state, ret);
return ret;
}
@@ -4873,17 +4901,22 @@ static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
struct btrfs_space_info *space_info)
{
+ struct reserve_ticket *ticket;
u64 used;
u64 expected;
- u64 to_reclaim;
+ u64 to_reclaim = 0;
to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
- spin_lock(&space_info->lock);
if (can_overcommit(root, space_info, to_reclaim,
- BTRFS_RESERVE_FLUSH_ALL)) {
- to_reclaim = 0;
- goto out;
- }
+ BTRFS_RESERVE_FLUSH_ALL))
+ return 0;
+
+ list_for_each_entry(ticket, &space_info->tickets, list)
+ to_reclaim += ticket->bytes;
+ list_for_each_entry(ticket, &space_info->priority_tickets, list)
+ to_reclaim += ticket->bytes;
+ if (to_reclaim)
+ return to_reclaim;
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
@@ -4899,14 +4932,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
to_reclaim = 0;
to_reclaim = min(to_reclaim, space_info->bytes_may_use +
space_info->bytes_reserved);
-out:
- spin_unlock(&space_info->lock);
-
return to_reclaim;
}
static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
- struct btrfs_fs_info *fs_info, u64 used)
+ struct btrfs_root *root, u64 used)
{
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
@@ -4914,73 +4944,177 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
return 0;
- return (used >= thresh && !btrfs_fs_closing(fs_info) &&
- !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
+ if (!btrfs_calc_reclaim_metadata_size(root, space_info))
+ return 0;
+
+ return (used >= thresh && !btrfs_fs_closing(root->fs_info) &&
+ !test_bit(BTRFS_FS_STATE_REMOUNTING,
+ &root->fs_info->fs_state));
}
-static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
- struct btrfs_fs_info *fs_info,
- int flush_state)
+static void wake_all_tickets(struct list_head *head)
{
- u64 used;
-
- spin_lock(&space_info->lock);
- /*
- * We run out of space and have not got any free space via flush_space,
- * so don't bother doing async reclaim.
- */
- if (flush_state > COMMIT_TRANS && space_info->full) {
- spin_unlock(&space_info->lock);
- return 0;
- }
+ struct reserve_ticket *ticket;
- used = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use;
- if (need_do_async_reclaim(space_info, fs_info, used)) {
- spin_unlock(&space_info->lock);
- return 1;
+ while (!list_empty(head)) {
+ ticket = list_first_entry(head, struct reserve_ticket, list);
+ list_del_init(&ticket->list);
+ ticket->error = -ENOSPC;
+ wake_up(&ticket->wait);
}
- spin_unlock(&space_info->lock);
-
- return 0;
}
+/*
+ * This is for normal flushers, we can wait all goddamned day if we want to. We
+ * will loop and continuously try to flush as long as we are making progress.
+ * We count progress as clearing off tickets each time we have to loop.
+ */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
+ struct reserve_ticket *last_ticket = NULL;
struct btrfs_fs_info *fs_info;
struct btrfs_space_info *space_info;
u64 to_reclaim;
int flush_state;
+ int commit_cycles = 0;
fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
space_info);
- if (!to_reclaim)
+ if (!to_reclaim) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
return;
+ }
+ last_ticket = list_first_entry(&space_info->tickets,
+ struct reserve_ticket, list);
+ spin_unlock(&space_info->lock);
flush_state = FLUSH_DELAYED_ITEMS_NR;
do {
+ struct reserve_ticket *ticket;
+ int ret;
+
+ ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
+ to_reclaim, flush_state);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+ space_info);
+ ticket = list_first_entry(&space_info->tickets,
+ struct reserve_ticket, list);
+ if (last_ticket == ticket) {
+ flush_state++;
+ } else {
+ last_ticket = ticket;
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ if (commit_cycles)
+ commit_cycles--;
+ }
+
+ if (flush_state > COMMIT_TRANS) {
+ commit_cycles++;
+ if (commit_cycles > 2) {
+ wake_all_tickets(&space_info->tickets);
+ space_info->flush = 0;
+ } else {
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ }
+ }
+ spin_unlock(&space_info->lock);
+ } while (flush_state <= COMMIT_TRANS);
+}
+
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+ INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
+static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ u64 to_reclaim;
+ int flush_state = FLUSH_DELAYED_ITEMS_NR;
+
+ spin_lock(&space_info->lock);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+ space_info);
+ if (!to_reclaim) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ spin_unlock(&space_info->lock);
+
+ do {
flush_space(fs_info->fs_root, space_info, to_reclaim,
to_reclaim, flush_state);
flush_state++;
- if (!btrfs_need_do_async_reclaim(space_info, fs_info,
- flush_state))
+ spin_lock(&space_info->lock);
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
return;
+ }
+ spin_unlock(&space_info->lock);
+
+ /*
+ * Priority flushers can't wait on delalloc without
+ * deadlocking.
+ */
+ if (flush_state == FLUSH_DELALLOC ||
+ flush_state == FLUSH_DELALLOC_WAIT)
+ flush_state = ALLOC_CHUNK;
} while (flush_state < COMMIT_TRANS);
}
-void btrfs_init_async_reclaim_work(struct work_struct *work)
+static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket, u64 orig_bytes)
+
{
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+ DEFINE_WAIT(wait);
+ int ret = 0;
+
+ spin_lock(&space_info->lock);
+ while (ticket->bytes > 0 && ticket->error == 0) {
+ ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
+ if (ret) {
+ ret = -EINTR;
+ break;
+ }
+ spin_unlock(&space_info->lock);
+
+ schedule();
+
+ finish_wait(&ticket->wait, &wait);
+ spin_lock(&space_info->lock);
+ }
+ if (!ret)
+ ret = ticket->error;
+ if (!list_empty(&ticket->list))
+ list_del_init(&ticket->list);
+ if (ticket->bytes && ticket->bytes < orig_bytes) {
+ u64 num_bytes = orig_bytes - ticket->bytes;
+ space_info->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags, num_bytes, 0);
+ }
+ spin_unlock(&space_info->lock);
+
+ return ret;
}
/**
* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
* @root - the root we're allocating for
- * @block_rsv - the block_rsv we're allocating for
+ * @space_info - the space info we want to allocate from
* @orig_bytes - the number of bytes we want
* @flush - whether or not we can flush to make our reservation
*
@@ -4991,81 +5125,36 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int reserve_metadata_bytes(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+static int __reserve_metadata_bytes(struct btrfs_root *root,
+ struct btrfs_space_info *space_info,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_space_info *space_info = block_rsv->space_info;
+ struct reserve_ticket ticket;
u64 used;
- u64 num_bytes = orig_bytes;
- int flush_state = FLUSH_DELAYED_ITEMS_NR;
int ret = 0;
- bool flushing = false;
-again:
- ret = 0;
- spin_lock(&space_info->lock);
- /*
- * We only want to wait if somebody other than us is flushing and we
- * are actually allowed to flush all things.
- */
- while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
- space_info->flush) {
- spin_unlock(&space_info->lock);
- /*
- * If we have a trans handle we can't wait because the flusher
- * may have to commit the transaction, which would mean we would
- * deadlock since we are waiting for the flusher to finish, but
- * hold the current transaction open.
- */
- if (current->journal_info)
- return -EAGAIN;
- ret = wait_event_killable(space_info->wait, !space_info->flush);
- /* Must have been killed, return */
- if (ret)
- return -EINTR;
-
- spin_lock(&space_info->lock);
- }
+ ASSERT(orig_bytes);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+ spin_lock(&space_info->lock);
ret = -ENOSPC;
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
/*
- * The idea here is that we've not already over-reserved the block group
- * then we can go ahead and save our reservation first and then start
- * flushing if we need to. Otherwise if we've already overcommitted
- * lets start flushing stuff first and then come back and try to make
- * our reservation.
+ * If we have enough space then hooray, make our reservation and carry
+ * on. If not see if we can overcommit, and if we can, hooray carry on.
+ * If not things get more complicated.
*/
- if (used <= space_info->total_bytes) {
- if (used + orig_bytes <= space_info->total_bytes) {
- space_info->bytes_may_use += orig_bytes;
- trace_btrfs_space_reservation(root->fs_info,
- "space_info", space_info->flags, orig_bytes, 1);
- ret = 0;
- } else {
- /*
- * Ok set num_bytes to orig_bytes since we aren't
- * overocmmitted, this way we only try and reclaim what
- * we need.
- */
- num_bytes = orig_bytes;
- }
- } else {
- /*
- * Ok we're over committed, set num_bytes to the overcommitted
- * amount plus the amount of bytes that we need for this
- * reservation.
- */
- num_bytes = used - space_info->total_bytes +
- (orig_bytes * 2);
- }
-
- if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+ if (used + orig_bytes <= space_info->total_bytes) {
+ space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ space_info->flags, orig_bytes,
+ 1);
+ ret = 0;
+ } else if (can_overcommit(root, space_info, orig_bytes, flush)) {
space_info->bytes_may_use += orig_bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
space_info->flags, orig_bytes,
@@ -5074,16 +5163,31 @@ again:
}
/*
- * Couldn't make our reservation, save our place so while we're trying
- * to reclaim space we can actually use it instead of somebody else
- * stealing it from us.
+ * If we couldn't make a reservation then setup our reservation ticket
+ * and kick the async worker if it's not already running.
*
- * We make the other tasks wait for the flush only when we can flush
- * all things.
+ * If we are a priority flusher then we just need to add our ticket to
+ * the list and we will do our own flushing further down.
*/
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
- flushing = true;
- space_info->flush = 1;
+ ticket.bytes = orig_bytes;
+ ticket.error = 0;
+ init_waitqueue_head(&ticket.wait);
+ if (flush == BTRFS_RESERVE_FLUSH_ALL) {
+ list_add_tail(&ticket.list, &space_info->tickets);
+ if (!space_info->flush) {
+ space_info->flush = 1;
+ trace_btrfs_trigger_flush(root->fs_info,
+ space_info->flags,
+ orig_bytes, flush,
+ "enospc");
+ queue_work(system_unbound_wq,
+ &root->fs_info->async_reclaim_work);
+ }
+ } else {
+ list_add_tail(&ticket.list,
+ &space_info->priority_tickets);
+ }
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
/*
@@ -5092,39 +5196,67 @@ again:
* the async reclaim as we will panic.
*/
if (!root->fs_info->log_root_recovering &&
- need_do_async_reclaim(space_info, root->fs_info, used) &&
- !work_busy(&root->fs_info->async_reclaim_work))
+ need_do_async_reclaim(space_info, root, used) &&
+ !work_busy(&root->fs_info->async_reclaim_work)) {
+ trace_btrfs_trigger_flush(root->fs_info,
+ space_info->flags,
+ orig_bytes, flush,
+ "preempt");
queue_work(system_unbound_wq,
&root->fs_info->async_reclaim_work);
+ }
}
spin_unlock(&space_info->lock);
-
if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
- goto out;
+ return ret;
- ret = flush_space(root, space_info, num_bytes, orig_bytes,
- flush_state);
- flush_state++;
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ return wait_reserve_ticket(root->fs_info, space_info, &ticket,
+ orig_bytes);
- /*
- * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
- * would happen. So skip delalloc flush.
- */
- if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
- (flush_state == FLUSH_DELALLOC ||
- flush_state == FLUSH_DELALLOC_WAIT))
- flush_state = ALLOC_CHUNK;
+ ret = 0;
+ priority_reclaim_metadata_space(root->fs_info, space_info, &ticket);
+ spin_lock(&space_info->lock);
+ if (ticket.bytes) {
+ if (ticket.bytes < orig_bytes) {
+ u64 num_bytes = orig_bytes - ticket.bytes;
+ space_info->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info", space_info->flags,
+ num_bytes, 0);
- if (!ret)
- goto again;
- else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
- flush_state < COMMIT_TRANS)
- goto again;
- else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- flush_state <= COMMIT_TRANS)
- goto again;
+ }
+ list_del_init(&ticket.list);
+ ret = -ENOSPC;
+ }
+ spin_unlock(&space_info->lock);
+ ASSERT(list_empty(&ticket.list));
+ return ret;
+}
-out:
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ *
+ * This will reserve orgi_bytes number of bytes from the space info associated
+ * with the block_rsv. If there is not enough space it will make an attempt to
+ * flush out space to make room. It will do this by flushing delalloc if
+ * possible or committing the transaction. If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+static int reserve_metadata_bytes(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ int ret;
+
+ ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
+ flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
struct btrfs_block_rsv *global_rsv =
@@ -5137,13 +5269,8 @@ out:
if (ret == -ENOSPC)
trace_btrfs_space_reservation(root->fs_info,
"space_info:enospc",
- space_info->flags, orig_bytes, 1);
- if (flushing) {
- spin_lock(&space_info->lock);
- space_info->flush = 0;
- wake_up_all(&space_info->wait);
- spin_unlock(&space_info->lock);
- }
+ block_rsv->space_info->flags,
+ orig_bytes, 1);
return ret;
}
@@ -5219,6 +5346,108 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
return 0;
}
+/*
+ * This is for space we already have accounted in space_info->bytes_may_use, so
+ * basically when we're returning space from block_rsv's.
+ */
+static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes)
+{
+ struct reserve_ticket *ticket;
+ struct list_head *head;
+ u64 used;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
+ bool check_overcommit = false;
+
+ spin_lock(&space_info->lock);
+ head = &space_info->priority_tickets;
+
+ /*
+ * If we are over our limit then we need to check and see if we can
+ * overcommit, and if we can't then we just need to free up our space
+ * and not satisfy any requests.
+ */
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+ if (used - num_bytes >= space_info->total_bytes)
+ check_overcommit = true;
+again:
+ while (!list_empty(head) && num_bytes) {
+ ticket = list_first_entry(head, struct reserve_ticket,
+ list);
+ /*
+ * We use 0 bytes because this space is already reserved, so
+ * adding the ticket space would be a double count.
+ */
+ if (check_overcommit &&
+ !can_overcommit(fs_info->extent_root, space_info, 0,
+ flush))
+ break;
+ if (num_bytes >= ticket->bytes) {
+ list_del_init(&ticket->list);
+ num_bytes -= ticket->bytes;
+ ticket->bytes = 0;
+ wake_up(&ticket->wait);
+ } else {
+ ticket->bytes -= num_bytes;
+ num_bytes = 0;
+ }
+ }
+
+ if (num_bytes && head == &space_info->priority_tickets) {
+ head = &space_info->tickets;
+ flush = BTRFS_RESERVE_FLUSH_ALL;
+ goto again;
+ }
+ space_info->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags, num_bytes, 0);
+ spin_unlock(&space_info->lock);
+}
+
+/*
+ * This is for newly allocated space that isn't accounted in
+ * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
+ * we use this helper.
+ */
+static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes)
+{
+ struct reserve_ticket *ticket;
+ struct list_head *head = &space_info->priority_tickets;
+
+again:
+ while (!list_empty(head) && num_bytes) {
+ ticket = list_first_entry(head, struct reserve_ticket,
+ list);
+ if (num_bytes >= ticket->bytes) {
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags,
+ ticket->bytes, 1);
+ list_del_init(&ticket->list);
+ num_bytes -= ticket->bytes;
+ space_info->bytes_may_use += ticket->bytes;
+ ticket->bytes = 0;
+ wake_up(&ticket->wait);
+ } else {
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags,
+ num_bytes, 1);
+ space_info->bytes_may_use += num_bytes;
+ ticket->bytes -= num_bytes;
+ num_bytes = 0;
+ }
+ }
+
+ if (num_bytes && head == &space_info->priority_tickets) {
+ head = &space_info->tickets;
+ goto again;
+ }
+}
+
static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes)
@@ -5253,18 +5482,15 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
}
spin_unlock(&dest->lock);
}
- if (num_bytes) {
- spin_lock(&space_info->lock);
- space_info->bytes_may_use -= num_bytes;
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags, num_bytes, 0);
- spin_unlock(&space_info->lock);
- }
+ if (num_bytes)
+ space_info_add_old_bytes(fs_info, space_info,
+ num_bytes);
}
}
-static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
- struct btrfs_block_rsv *dst, u64 num_bytes)
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
+ struct btrfs_block_rsv *dst, u64 num_bytes,
+ int update_size)
{
int ret;
@@ -5272,7 +5498,7 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
if (ret)
return ret;
- block_rsv_add_bytes(dst, num_bytes, 1);
+ block_rsv_add_bytes(dst, num_bytes, update_size);
return 0;
}
@@ -5379,13 +5605,6 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
- struct btrfs_block_rsv *dst_rsv,
- u64 num_bytes)
-{
- return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
-}
-
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes)
@@ -5398,48 +5617,21 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
num_bytes);
}
-/*
- * helper to calculate size of global block reservation.
- * the desired value is sum of space used by extent tree,
- * checksum tree and root tree
- */
-static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_space_info *sinfo;
- u64 num_bytes;
- u64 meta_used;
- u64 data_used;
- int csum_size = btrfs_super_csum_size(fs_info->super_copy);
-
- sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
- spin_lock(&sinfo->lock);
- data_used = sinfo->bytes_used;
- spin_unlock(&sinfo->lock);
-
- sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
- spin_lock(&sinfo->lock);
- if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
- data_used = 0;
- meta_used = sinfo->bytes_used;
- spin_unlock(&sinfo->lock);
-
- num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
- csum_size * 2;
- num_bytes += div_u64(data_used + meta_used, 50);
-
- if (num_bytes * 3 > meta_used)
- num_bytes = div_u64(meta_used, 3);
-
- return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
-}
-
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
struct btrfs_space_info *sinfo = block_rsv->space_info;
u64 num_bytes;
- num_bytes = calc_global_metadata_size(fs_info);
+ /*
+ * The global block rsv is based on the size of the extent tree, the
+ * checksum tree and the root tree. If the fs is empty we want to set
+ * it to a minimal amount for safety.
+ */
+ num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
+ btrfs_root_used(&fs_info->csum_root->root_item) +
+ btrfs_root_used(&fs_info->tree_root->root_item);
+ num_bytes = max_t(u64, num_bytes, SZ_16M);
spin_lock(&sinfo->lock);
spin_lock(&block_rsv->lock);
@@ -5554,7 +5746,13 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+ /*
+ * We always use trans->block_rsv here as we will have reserved space
+ * for our orphan when starting the transaction, using get_block_rsv()
+ * here will sometimes make us choose the wrong block rsv as we could be
+ * doing a reloc inode for a non refcounted root.
+ */
+ struct btrfs_block_rsv *src_rsv = trans->block_rsv;
struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
/*
@@ -5565,7 +5763,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
trace_btrfs_space_reservation(root->fs_info, "orphan",
btrfs_ino(inode), num_bytes, 1);
- return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+ return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
}
void btrfs_orphan_release_metadata(struct inode *inode)
@@ -5620,7 +5818,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
BTRFS_RESERVE_FLUSH_ALL);
if (ret == -ENOSPC && use_global_rsv)
- ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
if (ret && *qgroup_reserved)
btrfs_qgroup_free_meta(root, *qgroup_reserved);
@@ -5730,21 +5928,26 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
u64 to_reserve = 0;
u64 csum_bytes;
unsigned nr_extents = 0;
- int extra_reserve = 0;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
bool delalloc_lock = true;
u64 to_free = 0;
unsigned dropped;
+ bool release_extra = false;
/* If we are a free space inode we need to not flush since we will be in
* the middle of a transaction commit. We also don't need the delalloc
* mutex since we won't race with anybody. We need this mostly to make
* lockdep shut its filthy mouth.
+ *
+ * If we have a transaction open (can happen if we call truncate_block
+ * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
*/
if (btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
delalloc_lock = false;
+ } else if (current->journal_info) {
+ flush = BTRFS_RESERVE_FLUSH_LIMIT;
}
if (flush != BTRFS_RESERVE_NO_FLUSH &&
@@ -5761,24 +5964,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
BTRFS_MAX_EXTENT_SIZE - 1,
BTRFS_MAX_EXTENT_SIZE);
BTRFS_I(inode)->outstanding_extents += nr_extents;
- nr_extents = 0;
+ nr_extents = 0;
if (BTRFS_I(inode)->outstanding_extents >
BTRFS_I(inode)->reserved_extents)
- nr_extents = BTRFS_I(inode)->outstanding_extents -
+ nr_extents += BTRFS_I(inode)->outstanding_extents -
BTRFS_I(inode)->reserved_extents;
- /*
- * Add an item to reserve for updating the inode when we complete the
- * delalloc io.
- */
- if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &BTRFS_I(inode)->runtime_flags)) {
- nr_extents++;
- extra_reserve = 1;
- }
-
- to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+ /* We always want to reserve a slot for updating the inode. */
+ to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1);
to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
csum_bytes = BTRFS_I(inode)->csum_bytes;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -5790,17 +5984,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
goto out_fail;
}
- ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+ ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
goto out_fail;
}
spin_lock(&BTRFS_I(inode)->lock);
- if (extra_reserve) {
- set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &BTRFS_I(inode)->runtime_flags);
- nr_extents--;
+ if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags)) {
+ to_reserve -= btrfs_calc_trans_metadata_size(root, 1);
+ release_extra = true;
}
BTRFS_I(inode)->reserved_extents += nr_extents;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -5811,8 +6005,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (to_reserve)
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), to_reserve, 1);
- block_rsv_add_bytes(block_rsv, to_reserve, 1);
-
+ if (release_extra)
+ btrfs_block_rsv_release(root, block_rsv,
+ btrfs_calc_trans_metadata_size(root,
+ 1));
return 0;
out_fail:
@@ -6044,6 +6240,9 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
+ trace_btrfs_space_reservation(root->fs_info, "pinned",
+ cache->space_info->flags,
+ num_bytes, 1);
set_extent_dirty(info->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
@@ -6118,10 +6317,10 @@ static int pin_down_extent(struct btrfs_root *root,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
+ trace_btrfs_space_reservation(root->fs_info, "pinned",
+ cache->space_info->flags, num_bytes, 1);
set_extent_dirty(root->fs_info->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
- if (reserved)
- trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
return 0;
}
@@ -6476,6 +6675,9 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
spin_lock(&cache->lock);
cache->pinned -= len;
space_info->bytes_pinned -= len;
+
+ trace_btrfs_space_reservation(fs_info, "pinned",
+ space_info->flags, len, 0);
space_info->max_extent_size = 0;
percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
@@ -6483,17 +6685,29 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
readonly = true;
}
spin_unlock(&cache->lock);
- if (!readonly && global_rsv->space_info == space_info) {
+ if (!readonly && return_free_space &&
+ global_rsv->space_info == space_info) {
+ u64 to_add = len;
+ WARN_ON(!return_free_space);
spin_lock(&global_rsv->lock);
if (!global_rsv->full) {
- len = min(len, global_rsv->size -
- global_rsv->reserved);
- global_rsv->reserved += len;
- space_info->bytes_may_use += len;
+ to_add = min(len, global_rsv->size -
+ global_rsv->reserved);
+ global_rsv->reserved += to_add;
+ space_info->bytes_may_use += to_add;
if (global_rsv->reserved >= global_rsv->size)
global_rsv->full = 1;
+ trace_btrfs_space_reservation(fs_info,
+ "space_info",
+ space_info->flags,
+ to_add, 1);
+ len -= to_add;
}
spin_unlock(&global_rsv->lock);
+ /* Add to any tickets we may have */
+ if (len)
+ space_info_add_new_bytes(fs_info, space_info,
+ len);
}
spin_unlock(&space_info->lock);
}
@@ -7782,12 +7996,10 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
ret = btrfs_discard_extent(root, start, len, NULL);
btrfs_add_free_space(cache, start, len);
btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
+ trace_btrfs_reserved_extent_free(root, start, len);
}
btrfs_put_block_group(cache);
-
- trace_btrfs_reserved_extent_free(root, start, len);
-
return ret;
}
@@ -9791,13 +10003,15 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
space_info = list_entry(info->space_info.next,
struct btrfs_space_info,
list);
- if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
- if (WARN_ON(space_info->bytes_pinned > 0 ||
+
+ /*
+ * Do not hide this behind enospc_debug, this is actually
+ * important and indicates a real bug if this happens.
+ */
+ if (WARN_ON(space_info->bytes_pinned > 0 ||
space_info->bytes_reserved > 0 ||
- space_info->bytes_may_use > 0)) {
- dump_space_info(space_info, 0, 0);
- }
- }
+ space_info->bytes_may_use > 0))
+ dump_space_info(space_info, 0, 0);
list_del(&space_info->list);
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
struct kobject *kobj;
@@ -10005,9 +10219,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
goto error;
}
+ trace_btrfs_add_block_group(root->fs_info, cache, 0);
ret = update_space_info(info, cache->flags, found_key.offset,
btrfs_block_group_used(&cache->item),
- &space_info);
+ cache->bytes_super, &space_info);
if (ret) {
btrfs_remove_free_space_cache(cache);
spin_lock(&info->block_group_cache_lock);
@@ -10020,9 +10235,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
}
cache->space_info = space_info;
- spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_readonly += cache->bytes_super;
- spin_unlock(&cache->space_info->lock);
__link_block_group(space_info, cache);
@@ -10114,7 +10326,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_root *extent_root;
struct btrfs_block_group_cache *cache;
-
extent_root = root->fs_info->extent_root;
btrfs_set_log_full_commit(root->fs_info, trans);
@@ -10160,7 +10371,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* assigned to our block group, but don't update its counters just yet.
* We want our bg to be added to the rbtree with its ->space_info set.
*/
- ret = update_space_info(root->fs_info, cache->flags, 0, 0,
+ ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0,
&cache->space_info);
if (ret) {
btrfs_remove_free_space_cache(cache);
@@ -10179,8 +10390,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* Now that our block group has its ->space_info set and is inserted in
* the rbtree, update the space info's counters.
*/
+ trace_btrfs_add_block_group(root->fs_info, cache, 1);
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
- &cache->space_info);
+ cache->bytes_super, &cache->space_info);
if (ret) {
btrfs_remove_free_space_cache(cache);
spin_lock(&root->fs_info->block_group_cache_lock);
@@ -10193,16 +10405,11 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
}
update_global_block_rsv(root->fs_info);
- spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_readonly += cache->bytes_super;
- spin_unlock(&cache->space_info->lock);
-
__link_block_group(cache->space_info, cache);
list_add_tail(&cache->bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
-
return 0;
}
@@ -10747,21 +10954,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
mixed = 1;
flags = BTRFS_BLOCK_GROUP_SYSTEM;
- ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
if (ret)
goto out;
if (mixed) {
flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
- ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
} else {
flags = BTRFS_BLOCK_GROUP_METADATA;
- ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
if (ret)
goto out;
flags = BTRFS_BLOCK_GROUP_DATA;
- ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
}
out:
return ret;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2234e88cf674..bcfb4a27ddd4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1629,13 +1629,11 @@ again:
* managed to copy.
*/
if (num_sectors > dirty_sectors) {
- /*
- * we round down because we don't want to count
- * any partial blocks actually sent through the
- * IO machines
- */
- release_bytes = round_down(release_bytes - copied,
- root->sectorsize);
+
+ /* release everything except the sectors we dirtied */
+ release_bytes -= dirty_sectors <<
+ root->fs_info->sb->s_blocksize_bits;
+
if (copied > 0) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
@@ -2479,7 +2477,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
- min_size);
+ min_size, 0);
BUG_ON(ret);
trans->block_rsv = rsv;
@@ -2522,7 +2520,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
- rsv, min_size);
+ rsv, min_size, 0);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index df731c0ebec7..8078077d1090 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5263,7 +5263,7 @@ void btrfs_evict_inode(struct inode *inode)
if (steal_from_global) {
if (!btrfs_check_space_for_delayed_refs(trans, root))
ret = btrfs_block_rsv_migrate(global_rsv, rsv,
- min_size);
+ min_size, 0);
else
ret = -ENOSPC;
}
@@ -9116,7 +9116,7 @@ static int btrfs_truncate(struct inode *inode)
/* Migrate the slack space for the truncate to our reserve */
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
- min_size);
+ min_size, 0);
BUG_ON(ret);
/*
@@ -9156,7 +9156,7 @@ static int btrfs_truncate(struct inode *inode)
}
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
- rsv, min_size);
+ rsv, min_size, 0);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
}
@@ -9177,7 +9177,6 @@ static int btrfs_truncate(struct inode *inode)
ret = btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
}
-
out:
btrfs_free_block_rsv(root, rsv);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0477dca154ed..fc067b07e31f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2604,25 +2604,28 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
trans->block_rsv = rc->block_rsv;
rc->reserved_bytes += num_bytes;
+
+ /*
+ * We are under a transaction here so we can only do limited flushing.
+ * If we get an enospc just kick back -EAGAIN so we know to drop the
+ * transaction and try to refill when we can flush all the things.
+ */
ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
- BTRFS_RESERVE_FLUSH_ALL);
+ BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
- if (ret == -EAGAIN) {
- tmp = rc->extent_root->nodesize *
- RELOCATION_RESERVED_NODES;
- while (tmp <= rc->reserved_bytes)
- tmp <<= 1;
- /*
- * only one thread can access block_rsv at this point,
- * so we don't need hold lock to protect block_rsv.
- * we expand more reservation size here to allow enough
- * space for relocation and we will return earlier in
- * enospc case.
- */
- rc->block_rsv->size = tmp + rc->extent_root->nodesize *
- RELOCATION_RESERVED_NODES;
- }
- return ret;
+ tmp = rc->extent_root->nodesize * RELOCATION_RESERVED_NODES;
+ while (tmp <= rc->reserved_bytes)
+ tmp <<= 1;
+ /*
+ * only one thread can access block_rsv at this point,
+ * so we don't need hold lock to protect block_rsv.
+ * we expand more reservation size here to allow enough
+ * space for relocation and we will return eailer in
+ * enospc case.
+ */
+ rc->block_rsv->size = tmp + rc->extent_root->nodesize *
+ RELOCATION_RESERVED_NODES;
+ return -EAGAIN;
}
return 0;
@@ -3871,6 +3874,7 @@ static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
struct btrfs_trans_handle *trans;
+ int ret;
rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
BTRFS_BLOCK_RSV_TEMP);
@@ -3885,6 +3889,11 @@ int prepare_to_relocate(struct reloc_control *rc)
rc->reserved_bytes = 0;
rc->block_rsv->size = rc->extent_root->nodesize *
RELOCATION_RESERVED_NODES;
+ ret = btrfs_block_rsv_refill(rc->extent_root,
+ rc->block_rsv, rc->block_rsv->size,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret)
+ return ret;
rc->create_reloc_tree = 1;
set_reloc_control(rc);
@@ -4643,7 +4652,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
if (rc->merge_reloc_tree) {
ret = btrfs_block_rsv_migrate(&pending->block_rsv,
rc->block_rsv,
- rc->nodes_relocated);
+ rc->nodes_relocated, 1);
if (ret)
return ret;
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 26a9d10d75e9..d5b6f959a3c3 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1730,7 +1730,8 @@ enum {
POOL_WRITE = 2,
};
-static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
+ s64 pool, struct ceph_string *pool_ns)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1738,6 +1739,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
struct rb_node **p, *parent;
struct ceph_pool_perm *perm;
struct page **pages;
+ size_t pool_ns_len;
int err = 0, err2 = 0, have = 0;
down_read(&mdsc->pool_perm_rwsem);
@@ -1749,17 +1751,31 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
else if (pool > perm->pool)
p = &(*p)->rb_right;
else {
- have = perm->perm;
- break;
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
}
}
up_read(&mdsc->pool_perm_rwsem);
if (*p)
goto out;
- dout("__ceph_pool_perm_get pool %u no perm cached\n", pool);
+ if (pool_ns)
+ dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
+ pool, (int)pool_ns->len, pool_ns->str);
+ else
+ dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
down_write(&mdsc->pool_perm_rwsem);
+ p = &mdsc->pool_perm_tree.rb_node;
parent = NULL;
while (*p) {
parent = *p;
@@ -1769,8 +1785,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
else if (pool > perm->pool)
p = &(*p)->rb_right;
else {
- have = perm->perm;
- break;
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
}
}
if (*p) {
@@ -1788,6 +1813,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
rd_req->r_flags = CEPH_OSD_FLAG_READ;
osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
rd_req->r_base_oloc.pool = pool;
+ if (pool_ns)
+ rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns);
ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
@@ -1841,7 +1868,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
goto out_unlock;
}
- perm = kmalloc(sizeof(*perm), GFP_NOFS);
+ pool_ns_len = pool_ns ? pool_ns->len : 0;
+ perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS);
if (!perm) {
err = -ENOMEM;
goto out_unlock;
@@ -1849,6 +1877,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
perm->pool = pool;
perm->perm = have;
+ perm->pool_ns_len = pool_ns_len;
+ if (pool_ns_len > 0)
+ memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
+ perm->pool_ns[pool_ns_len] = 0;
+
rb_link_node(&perm->node, parent, p);
rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
err = 0;
@@ -1860,43 +1893,46 @@ out_unlock:
out:
if (!err)
err = have;
- dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err);
+ if (pool_ns)
+ dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
+ pool, (int)pool_ns->len, pool_ns->str, err);
+ else
+ dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
return err;
}
int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
{
- u32 pool;
+ s64 pool;
+ struct ceph_string *pool_ns;
int ret, flags;
- /* does not support pool namespace yet */
- if (ci->i_pool_ns_len)
- return -EIO;
-
if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
NOPOOLPERM))
return 0;
spin_lock(&ci->i_ceph_lock);
flags = ci->i_ceph_flags;
- pool = ceph_file_layout_pg_pool(ci->i_layout);
+ pool = ci->i_layout.pool_id;
spin_unlock(&ci->i_ceph_lock);
check:
if (flags & CEPH_I_POOL_PERM) {
if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
- dout("ceph_pool_perm_check pool %u no read perm\n",
+ dout("ceph_pool_perm_check pool %lld no read perm\n",
pool);
return -EPERM;
}
if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
- dout("ceph_pool_perm_check pool %u no write perm\n",
+ dout("ceph_pool_perm_check pool %lld no write perm\n",
pool);
return -EPERM;
}
return 0;
}
- ret = __ceph_pool_perm_get(ci, pool);
+ pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ ret = __ceph_pool_perm_get(ci, pool, pool_ns);
+ ceph_put_string(pool_ns);
if (ret < 0)
return ret;
@@ -1907,10 +1943,11 @@ check:
flags |= CEPH_I_POOL_WR;
spin_lock(&ci->i_ceph_lock);
- if (pool == ceph_file_layout_pg_pool(ci->i_layout)) {
- ci->i_ceph_flags = flags;
+ if (pool == ci->i_layout.pool_id &&
+ pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
+ ci->i_ceph_flags |= flags;
} else {
- pool = ceph_file_layout_pg_pool(ci->i_layout);
+ pool = ci->i_layout.pool_id;
flags = ci->i_ceph_flags;
}
spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 238c55b01723..5bc5d37b1217 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -71,7 +71,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
&ceph_fscache_fsid_object_def,
fsc, true);
if (!fsc->fscache)
- pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
+ pr_err("Unable to register fsid: %p fscache cookie\n", fsc);
return 0;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6f60d0a3d0f9..99115cae1652 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -40,6 +40,11 @@
* cluster to release server state.
*/
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci,
+ u64 oldest_flush_tid);
/*
* Generate readable cap strings for debugging output.
@@ -849,12 +854,14 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
*/
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
- int want = 0;
- int mode;
- for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
- if (ci->i_nr_by_mode[mode])
- want |= ceph_caps_for_mode(mode);
- return want;
+ int i, bits = 0;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (ci->i_nr_by_mode[i])
+ bits |= 1 << i;
+ }
+ if (bits == 0)
+ return 0;
+ return ceph_caps_for_mode(bits >> 1);
}
/*
@@ -991,7 +998,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
u32 seq, u64 flush_tid, u64 oldest_flush_tid,
u32 issue_seq, u32 mseq, u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
- struct timespec *ctime, u64 time_warp_seq,
+ struct timespec *ctime, u32 time_warp_seq,
kuid_t uid, kgid_t gid, umode_t mode,
u64 xattr_version,
struct ceph_buffer *xattrs_buf,
@@ -1116,8 +1123,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct inode *inode = &ci->vfs_inode;
u64 cap_id = cap->cap_id;
int held, revoking, dropping, keep;
- u64 seq, issue_seq, mseq, time_warp_seq, follows;
- u64 size, max_size;
+ u64 follows, size, max_size;
+ u32 seq, issue_seq, mseq, time_warp_seq;
struct timespec mtime, atime, ctime;
int wake = 0;
umode_t mode;
@@ -1215,6 +1222,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
return delayed;
}
+static inline int __send_flush_snap(struct inode *inode,
+ struct ceph_mds_session *session,
+ struct ceph_cap_snap *capsnap,
+ u32 mseq, u64 oldest_flush_tid)
+{
+ return send_cap_msg(session, ceph_vino(inode).ino, 0,
+ CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+ capsnap->dirty, 0, capsnap->cap_flush.tid,
+ oldest_flush_tid, 0, mseq, capsnap->size, 0,
+ &capsnap->mtime, &capsnap->atime,
+ &capsnap->ctime, capsnap->time_warp_seq,
+ capsnap->uid, capsnap->gid, capsnap->mode,
+ capsnap->xattr_version, capsnap->xattr_blob,
+ capsnap->follows, capsnap->inline_data);
+}
+
/*
* When a snapshot is taken, clients accumulate dirty metadata on
* inodes with capabilities in ceph_cap_snaps to describe the file
@@ -1222,37 +1245,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
* asynchronously back to the MDS once sync writes complete and dirty
* data is written out.
*
- * Unless @kick is true, skip cap_snaps that were already sent to
- * the MDS (i.e., during this session).
- *
* Called under i_ceph_lock. Takes s_mutex as needed.
*/
-void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession,
- int kick)
+static void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->vfs_inode;
- int mds;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap_snap *capsnap;
- u32 mseq;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
- session->s_mutex */
- u64 next_follows = 0; /* keep track of how far we've gotten through the
- i_cap_snaps list, and skip these entries next time
- around to avoid an infinite loop */
+ u64 oldest_flush_tid = 0;
+ u64 first_tid = 1, last_tid = 0;
- if (psession)
- session = *psession;
+ dout("__flush_snaps %p session %p\n", inode, session);
- dout("__flush_snaps %p\n", inode);
-retry:
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- /* avoid an infiniute loop after retry */
- if (capsnap->follows < next_follows)
- continue;
/*
* we need to wait for sync writes to complete and for dirty
* pages to be written out.
@@ -1263,97 +1271,129 @@ retry:
/* should be removed by ceph_try_drop_cap_snap() */
BUG_ON(!capsnap->need_flush);
- /* pick mds, take s_mutex */
- if (ci->i_auth_cap == NULL) {
- dout("no auth cap (migrating?), doing nothing\n");
- goto out;
- }
-
/* only flush each capsnap once */
- if (!kick && !list_empty(&capsnap->flushing_item)) {
- dout("already flushed %p, skipping\n", capsnap);
+ if (capsnap->cap_flush.tid > 0) {
+ dout(" already flushed %p, skipping\n", capsnap);
continue;
}
- mds = ci->i_auth_cap->session->s_mds;
- mseq = ci->i_auth_cap->mseq;
+ spin_lock(&mdsc->cap_dirty_lock);
+ capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
+ list_add_tail(&capsnap->cap_flush.g_list,
+ &mdsc->cap_flush_list);
+ if (oldest_flush_tid == 0)
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ if (list_empty(&ci->i_flushing_item)) {
+ list_add_tail(&ci->i_flushing_item,
+ &session->s_cap_flushing);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ list_add_tail(&capsnap->cap_flush.i_list,
+ &ci->i_cap_flush_list);
- if (session && session->s_mds != mds) {
- dout("oops, wrong session %p mutex\n", session);
- if (kick)
- goto out;
+ if (first_tid == 1)
+ first_tid = capsnap->cap_flush.tid;
+ last_tid = capsnap->cap_flush.tid;
+ }
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- session = NULL;
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
+
+ while (first_tid <= last_tid) {
+ struct ceph_cap *cap = ci->i_auth_cap;
+ struct ceph_cap_flush *cf;
+ int ret;
+
+ if (!(cap && cap->session == session)) {
+ dout("__flush_snaps %p auth cap %p not mds%d, "
+ "stop\n", inode, cap, session->s_mds);
+ break;
}
- if (!session) {
- spin_unlock(&ci->i_ceph_lock);
- mutex_lock(&mdsc->mutex);
- session = __ceph_lookup_mds_session(mdsc, mds);
- mutex_unlock(&mdsc->mutex);
- if (session) {
- dout("inverting session/ino locks on %p\n",
- session);
- mutex_lock(&session->s_mutex);
+
+ ret = -ENOENT;
+ list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->tid >= first_tid) {
+ ret = 0;
+ break;
}
- /*
- * if session == NULL, we raced against a cap
- * deletion or migration. retry, and we'll
- * get a better @mds value next time.
- */
- spin_lock(&ci->i_ceph_lock);
- goto retry;
}
+ if (ret < 0)
+ break;
- spin_lock(&mdsc->cap_dirty_lock);
- capsnap->flush_tid = ++mdsc->last_cap_flush_tid;
- spin_unlock(&mdsc->cap_dirty_lock);
+ first_tid = cf->tid + 1;
+ capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
atomic_inc(&capsnap->nref);
- if (list_empty(&capsnap->flushing_item))
- list_add_tail(&capsnap->flushing_item,
- &session->s_cap_snaps_flushing);
spin_unlock(&ci->i_ceph_lock);
- dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
- inode, capsnap, capsnap->follows, capsnap->flush_tid);
- send_cap_msg(session, ceph_vino(inode).ino, 0,
- CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
- capsnap->dirty, 0, capsnap->flush_tid, 0,
- 0, mseq, capsnap->size, 0,
- &capsnap->mtime, &capsnap->atime,
- &capsnap->ctime, capsnap->time_warp_seq,
- capsnap->uid, capsnap->gid, capsnap->mode,
- capsnap->xattr_version, capsnap->xattr_blob,
- capsnap->follows, capsnap->inline_data);
-
- next_follows = capsnap->follows + 1;
- ceph_put_cap_snap(capsnap);
+ dout("__flush_snaps %p capsnap %p tid %llu %s\n",
+ inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
+
+ ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+ oldest_flush_tid);
+ if (ret < 0) {
+ pr_err("__flush_snaps: error sending cap flushsnap, "
+ "ino (%llx.%llx) tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid, capsnap->follows);
+ }
+ ceph_put_cap_snap(capsnap);
spin_lock(&ci->i_ceph_lock);
- goto retry;
}
+}
- /* we flushed them all; remove this inode from the queue */
- spin_lock(&mdsc->snap_flush_lock);
- list_del_init(&ci->i_snap_flush_item);
- spin_unlock(&mdsc->snap_flush_lock);
+void ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *session = *psession;
+ int mds;
+ dout("ceph_flush_snaps %p\n", inode);
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
+ dout(" no capsnap needs flush, doing nothing\n");
+ goto out;
+ }
+ if (!ci->i_auth_cap) {
+ dout(" no auth cap (migrating?), doing nothing\n");
+ goto out;
+ }
-out:
- if (psession)
- *psession = session;
- else if (session) {
+ mds = ci->i_auth_cap->session->s_mds;
+ if (session && session->s_mds != mds) {
+ dout(" oops, wrong session %p mutex\n", session);
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
+ session = NULL;
+ }
+ if (!session) {
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_lock(&mdsc->mutex);
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+ if (session) {
+ dout(" inverting session/ino locks on %p\n", session);
+ mutex_lock(&session->s_mutex);
+ }
+ goto retry;
}
-}
-static void ceph_flush_snaps(struct ceph_inode_info *ci)
-{
- spin_lock(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, NULL, 0);
+ __ceph_flush_snaps(ci, session);
+out:
spin_unlock(&ci->i_ceph_lock);
+
+ if (psession) {
+ *psession = session;
+ } else {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+ /* we flushed them all; remove this inode from the queue */
+ spin_lock(&mdsc->snap_flush_lock);
+ list_del_init(&ci->i_snap_flush_item);
+ spin_unlock(&mdsc->snap_flush_lock);
}
/*
@@ -1411,52 +1451,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
return dirty;
}
-static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
- struct ceph_cap_flush *cf)
-{
- struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_cap_flush *other = NULL;
-
- while (*p) {
- parent = *p;
- other = rb_entry(parent, struct ceph_cap_flush, i_node);
-
- if (cf->tid < other->tid)
- p = &(*p)->rb_left;
- else if (cf->tid > other->tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&cf->i_node, parent, p);
- rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
-}
-
-static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
- struct ceph_cap_flush *cf)
-{
- struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_cap_flush *other = NULL;
-
- while (*p) {
- parent = *p;
- other = rb_entry(parent, struct ceph_cap_flush, g_node);
-
- if (cf->tid < other->tid)
- p = &(*p)->rb_left;
- else if (cf->tid > other->tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&cf->g_node, parent, p);
- rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
-}
-
struct ceph_cap_flush *ceph_alloc_cap_flush(void)
{
return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
@@ -1470,23 +1464,54 @@ void ceph_free_cap_flush(struct ceph_cap_flush *cf)
static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
{
- struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
- if (n) {
+ if (!list_empty(&mdsc->cap_flush_list)) {
struct ceph_cap_flush *cf =
- rb_entry(n, struct ceph_cap_flush, g_node);
+ list_first_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
return cf->tid;
}
return 0;
}
/*
+ * Remove cap_flush from the mdsc's or inode's flushing cap list.
+ * Return true if caller needs to wake up flush waiters.
+ */
+static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci,
+ struct ceph_cap_flush *cf)
+{
+ struct ceph_cap_flush *prev;
+ bool wake = cf->wake;
+ if (mdsc) {
+ /* are there older pending cap flushes? */
+ if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
+ prev = list_prev_entry(cf, g_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del(&cf->g_list);
+ } else if (ci) {
+ if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
+ prev = list_prev_entry(cf, i_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del(&cf->i_list);
+ } else {
+ BUG_ON(1);
+ }
+ return wake;
+}
+
+/*
* Add dirty inode to the flushing list. Assigned a seq number so we
* can wait for caps to flush without starving.
*
* Called under i_ceph_lock.
*/
static int __mark_caps_flushing(struct inode *inode,
- struct ceph_mds_session *session,
+ struct ceph_mds_session *session, bool wake,
u64 *flush_tid, u64 *oldest_flush_tid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -1509,26 +1534,22 @@ static int __mark_caps_flushing(struct inode *inode,
swap(cf, ci->i_prealloc_cap_flush);
cf->caps = flushing;
+ cf->wake = wake;
spin_lock(&mdsc->cap_dirty_lock);
list_del_init(&ci->i_dirty_item);
cf->tid = ++mdsc->last_cap_flush_tid;
- __add_cap_flushing_to_mdsc(mdsc, cf);
+ list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
*oldest_flush_tid = __get_oldest_flush_tid(mdsc);
if (list_empty(&ci->i_flushing_item)) {
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++;
- dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
- } else {
- list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
- dout(" inode %p now flushing (more) tid %llu\n",
- inode, cf->tid);
}
spin_unlock(&mdsc->cap_dirty_lock);
- __add_cap_flushing_to_inode(ci, cf);
+ list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
*flush_tid = cf->tid;
return flushing;
@@ -1583,10 +1604,11 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
struct rb_node *p;
- int tried_invalidate = 0;
- int delayed = 0, sent = 0, force_requeue = 0, num;
- int queue_invalidate = 0;
- int is_delayed = flags & CHECK_CAPS_NODELAY;
+ int delayed = 0, sent = 0, num;
+ bool is_delayed = flags & CHECK_CAPS_NODELAY;
+ bool queue_invalidate = false;
+ bool force_requeue = false;
+ bool tried_invalidate = false;
/* if we are unmounting, flush any unused caps immediately. */
if (mdsc->stopping)
@@ -1597,9 +1619,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
- /* flush snaps first time around only */
- if (!list_empty(&ci->i_cap_snaps))
- __ceph_flush_snaps(ci, &session, 0);
goto retry_locked;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1666,17 +1685,17 @@ retry_locked:
if (revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) {
dout("check_caps queuing invalidate\n");
- queue_invalidate = 1;
+ queue_invalidate = true;
ci->i_rdcache_revoking = ci->i_rdcache_gen;
} else {
dout("check_caps failed to invalidate pages\n");
/* we failed to invalidate pages. check these
caps again later. */
- force_requeue = 1;
+ force_requeue = true;
__cap_set_timeouts(mdsc, ci);
}
}
- tried_invalidate = 1;
+ tried_invalidate = true;
goto retry_locked;
}
@@ -1720,10 +1739,15 @@ retry_locked:
}
}
/* flush anything dirty? */
- if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
- ci->i_dirty_caps) {
- dout("flushing dirty caps\n");
- goto ack;
+ if (cap == ci->i_auth_cap) {
+ if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
+ dout("flushing dirty caps\n");
+ goto ack;
+ }
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
+ dout("flushing snap caps\n");
+ goto ack;
+ }
}
/* completed revocation? going down and there are no caps? */
@@ -1782,6 +1806,26 @@ ack:
goto retry;
}
}
+
+ /* kick flushing and flush snaps before sending normal
+ * cap message */
+ if (cap == ci->i_auth_cap &&
+ (ci->i_ceph_flags &
+ (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ }
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
+ __ceph_flush_snaps(ci, session);
+
+ goto retry_locked;
+ }
+
/* take snap_rwsem after session mutex */
if (!took_snap_rwsem) {
if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
@@ -1796,7 +1840,7 @@ ack:
}
if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
- flushing = __mark_caps_flushing(inode, session,
+ flushing = __mark_caps_flushing(inode, session, false,
&flush_tid,
&oldest_flush_tid);
} else {
@@ -1822,7 +1866,7 @@ ack:
* otherwise cancel.
*/
if (delayed && is_delayed)
- force_requeue = 1; /* __send_cap delayed release; requeue */
+ force_requeue = true; /* __send_cap delayed release; requeue */
if (!delayed && !is_delayed)
__cap_delay_cancel(mdsc, ci);
else if (!is_delayed || force_requeue)
@@ -1873,8 +1917,8 @@ retry:
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
goto out;
- flushing = __mark_caps_flushing(inode, session, &flush_tid,
- &oldest_flush_tid);
+ flushing = __mark_caps_flushing(inode, session, true,
+ &flush_tid, &oldest_flush_tid);
/* __send_cap drops i_ceph_lock */
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
@@ -1887,10 +1931,11 @@ retry:
spin_unlock(&ci->i_ceph_lock);
}
} else {
- struct rb_node *n = rb_last(&ci->i_cap_flush_tree);
- if (n) {
+ if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
- rb_entry(n, struct ceph_cap_flush, i_node);
+ list_last_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ cf->wake = true;
flush_tid = cf->tid;
}
flushing = ci->i_flushing_caps;
@@ -1910,14 +1955,13 @@ out:
static int caps_are_flushed(struct inode *inode, u64 flush_tid)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap_flush *cf;
- struct rb_node *n;
int ret = 1;
spin_lock(&ci->i_ceph_lock);
- n = rb_first(&ci->i_cap_flush_tree);
- if (n) {
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ struct ceph_cap_flush * cf =
+ list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
if (cf->tid <= flush_tid)
ret = 0;
}
@@ -1926,53 +1970,6 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
}
/*
- * Wait on any unsafe replies for the given inode. First wait on the
- * newest request, and make that the upper bound. Then, if there are
- * more requests, keep waiting on the oldest as long as it is still older
- * than the original request.
- */
-static void sync_write_wait(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_writes;
- struct ceph_osd_request *req;
- u64 last_tid;
-
- if (!S_ISREG(inode->i_mode))
- return;
-
- spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
-
- /* set upper bound as _last_ entry in chain */
- req = list_last_entry(head, struct ceph_osd_request,
- r_unsafe_item);
- last_tid = req->r_tid;
-
- do {
- ceph_osdc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
- dout("sync_write_wait on tid %llu (until %llu)\n",
- req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
- spin_lock(&ci->i_unsafe_lock);
- ceph_osdc_put_request(req);
-
- /*
- * from here on look at first entry in chain, since we
- * only want to wait for anything older than last_tid
- */
- if (list_empty(head))
- break;
- req = list_first_entry(head, struct ceph_osd_request,
- r_unsafe_item);
- } while (req->r_tid < last_tid);
-out:
- spin_unlock(&ci->i_unsafe_lock);
-}
-
-/*
* wait for any unsafe requests to complete.
*/
static int unsafe_request_wait(struct inode *inode)
@@ -2024,7 +2021,8 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
int dirty;
dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
- sync_write_wait(inode);
+
+ ceph_sync_write_wait(inode);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret < 0)
@@ -2087,87 +2085,74 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
return err;
}
-/*
- * After a recovering MDS goes active, we need to resend any caps
- * we were flushing.
- *
- * Caller holds session->s_mutex.
- */
-static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_cap_snap *capsnap;
-
- dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
- list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
- flushing_item) {
- struct ceph_inode_info *ci = capsnap->ci;
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
-
- spin_lock(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- if (cap && cap->session == session) {
- dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
- cap, capsnap);
- __ceph_flush_snaps(ci, &session, 1);
- } else {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
- }
- spin_unlock(&ci->i_ceph_lock);
- }
-}
-
-static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- struct ceph_inode_info *ci)
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci,
+ u64 oldest_flush_tid)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
struct ceph_cap_flush *cf;
- struct rb_node *n;
- int delayed = 0;
+ int ret;
u64 first_tid = 0;
- u64 oldest_flush_tid;
- spin_lock(&mdsc->cap_dirty_lock);
- oldest_flush_tid = __get_oldest_flush_tid(mdsc);
- spin_unlock(&mdsc->cap_dirty_lock);
+ list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->tid < first_tid)
+ continue;
- while (true) {
- spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
- spin_unlock(&ci->i_ceph_lock);
+ pr_err("%p auth cap %p not mds%d ???\n",
+ inode, cap, session->s_mds);
break;
}
- for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
- if (cf->tid >= first_tid)
- break;
- }
- if (!n) {
+ first_tid = cf->tid + 1;
+
+ if (cf->caps) {
+ dout("kick_flushing_caps %p cap %p tid %llu %s\n",
+ inode, cap, cf->tid, ceph_cap_string(cf->caps));
+ ci->i_ceph_flags |= CEPH_I_NODELAY;
+ ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __ceph_caps_used(ci),
+ __ceph_caps_wanted(ci),
+ cap->issued | cap->implemented,
+ cf->caps, cf->tid, oldest_flush_tid);
+ if (ret) {
+ pr_err("kick_flushing_caps: error sending "
+ "cap flush, ino (%llx.%llx) "
+ "tid %llu flushing %s\n",
+ ceph_vinop(inode), cf->tid,
+ ceph_cap_string(cf->caps));
+ }
+ } else {
+ struct ceph_cap_snap *capsnap =
+ container_of(cf, struct ceph_cap_snap,
+ cap_flush);
+ dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
+ inode, capsnap, cf->tid,
+ ceph_cap_string(capsnap->dirty));
+
+ atomic_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
- break;
- }
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+ oldest_flush_tid);
+ if (ret < 0) {
+ pr_err("kick_flushing_caps: error sending "
+ "cap flushsnap, ino (%llx.%llx) "
+ "tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid,
+ capsnap->follows);
+ }
- first_tid = cf->tid + 1;
+ ceph_put_cap_snap(capsnap);
+ }
- dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode,
- cap, cf->tid, ceph_cap_string(cf->caps));
- delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- cf->caps, cf->tid, oldest_flush_tid);
+ spin_lock(&ci->i_ceph_lock);
}
- return delayed;
}
void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
@@ -2175,8 +2160,14 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
{
struct ceph_inode_info *ci;
struct ceph_cap *cap;
+ u64 oldest_flush_tid;
dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
@@ -2196,10 +2187,11 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
*/
if ((cap->issued & ci->i_flushing_caps) !=
ci->i_flushing_caps) {
- spin_unlock(&ci->i_ceph_lock);
- if (!__kick_flushing_caps(mdsc, session, ci))
- continue;
- spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
+ } else {
+ ci->i_ceph_flags |= CEPH_I_KICK_FLUSH;
}
spin_unlock(&ci->i_ceph_lock);
@@ -2210,50 +2202,56 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci;
-
- kick_flushing_capsnaps(mdsc, session);
+ struct ceph_cap *cap;
+ u64 oldest_flush_tid;
dout("kick_flushing_caps mds%d\n", session->s_mds);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
- int delayed = __kick_flushing_caps(mdsc, session, ci);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (!(cap && cap->session == session)) {
+ pr_err("%p auth cap %p not mds%d ???\n",
+ &ci->vfs_inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
+ continue;
+ }
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
}
+ spin_unlock(&ci->i_ceph_lock);
}
}
static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct inode *inode)
+ __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *cap;
- spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s\n", inode,
ceph_cap_string(ci->i_flushing_caps));
- __ceph_flush_snaps(ci, &session, 1);
-
- if (ci->i_flushing_caps) {
- int delayed;
-
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ u64 oldest_flush_tid;
spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item,
&cap->session->s_cap_flushing);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
-
- delayed = __kick_flushing_caps(mdsc, session, ci);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
- spin_unlock(&ci->i_ceph_lock);
- }
} else {
spin_unlock(&ci->i_ceph_lock);
}
@@ -2580,16 +2578,19 @@ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
* drop cap_snap that is not associated with any snapshot.
* we don't need to send FLUSHSNAP message for it.
*/
-static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap)
+static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap)
{
if (!capsnap->need_flush &&
!capsnap->writing && !capsnap->dirty_pages) {
-
dout("dropping cap_snap %p follows %llu\n",
capsnap, capsnap->follows);
+ BUG_ON(capsnap->cap_flush.tid > 0);
ceph_put_snap_context(capsnap->context);
+ if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+
list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
ceph_put_cap_snap(capsnap);
return 1;
}
@@ -2636,7 +2637,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
struct ceph_cap_snap,
ci_item);
capsnap->writing = 0;
- if (ceph_try_drop_cap_snap(capsnap))
+ if (ceph_try_drop_cap_snap(ci, capsnap))
put++;
else if (__ceph_finish_cap_snap(ci, capsnap))
flushsnaps = 1;
@@ -2661,7 +2662,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
if (last && !flushsnaps)
ceph_check_caps(ci, 0, NULL);
else if (flushsnaps)
- ceph_flush_snaps(ci);
+ ceph_flush_snaps(ci, NULL);
if (wake)
wake_up_all(&ci->i_cap_wq);
while (put-- > 0)
@@ -2679,15 +2680,19 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc)
{
struct inode *inode = &ci->vfs_inode;
- int last = 0;
- int complete_capsnap = 0;
- int drop_capsnap = 0;
- int found = 0;
struct ceph_cap_snap *capsnap = NULL;
+ int put = 0;
+ bool last = false;
+ bool found = false;
+ bool flush_snaps = false;
+ bool complete_capsnap = false;
spin_lock(&ci->i_ceph_lock);
ci->i_wrbuffer_ref -= nr;
- last = !ci->i_wrbuffer_ref;
+ if (ci->i_wrbuffer_ref == 0) {
+ last = true;
+ put++;
+ }
if (ci->i_head_snapc == snapc) {
ci->i_wrbuffer_ref_head -= nr;
@@ -2707,15 +2712,22 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
} else {
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
if (capsnap->context == snapc) {
- found = 1;
+ found = true;
break;
}
}
BUG_ON(!found);
capsnap->dirty_pages -= nr;
if (capsnap->dirty_pages == 0) {
- complete_capsnap = 1;
- drop_capsnap = ceph_try_drop_cap_snap(capsnap);
+ complete_capsnap = true;
+ if (!capsnap->writing) {
+ if (ceph_try_drop_cap_snap(ci, capsnap)) {
+ put++;
+ } else {
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+ flush_snaps = true;
+ }
+ }
}
dout("put_wrbuffer_cap_refs on %p cap_snap %p "
" snap %lld %d/%d -> %d/%d %s%s\n",
@@ -2730,12 +2742,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
if (last) {
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
- iput(inode);
- } else if (complete_capsnap) {
- ceph_flush_snaps(ci);
- wake_up_all(&ci->i_cap_wq);
+ } else if (flush_snaps) {
+ ceph_flush_snaps(ci, NULL);
}
- if (drop_capsnap)
+ if (complete_capsnap)
+ wake_up_all(&ci->i_cap_wq);
+ while (put-- > 0)
iput(inode);
}
@@ -2779,12 +2791,11 @@ static void invalidate_aliases(struct inode *inode)
*/
static void handle_cap_grant(struct ceph_mds_client *mdsc,
struct inode *inode, struct ceph_mds_caps *grant,
- u64 inline_version,
- void *inline_data, int inline_len,
+ struct ceph_string **pns, u64 inline_version,
+ void *inline_data, u32 inline_len,
struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
- struct ceph_cap *cap, int issued,
- u32 pool_ns_len)
+ struct ceph_cap *cap, int issued)
__releases(ci->i_ceph_lock)
__releases(mdsc->snap_rwsem)
{
@@ -2895,8 +2906,18 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
/* file layout may have changed */
- ci->i_layout = grant->layout;
- ci->i_pool_ns_len = pool_ns_len;
+ s64 old_pool = ci->i_layout.pool_id;
+ struct ceph_string *old_ns;
+
+ ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
+ old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+ lockdep_is_held(&ci->i_ceph_lock));
+ rcu_assign_pointer(ci->i_layout.pool_ns, *pns);
+
+ if (ci->i_layout.pool_id != old_pool || *pns != old_ns)
+ ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
+
+ *pns = old_ns;
/* size/truncate_seq? */
queue_trunc = ceph_fill_file_size(inode, issued,
@@ -2979,13 +3000,13 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
fill_inline = true;
}
- spin_unlock(&ci->i_ceph_lock);
-
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
- kick_flushing_inode_caps(mdsc, session, inode);
- up_read(&mdsc->snap_rwsem);
if (newcaps & ~issued)
wake = true;
+ kick_flushing_inode_caps(mdsc, session, inode);
+ up_read(&mdsc->snap_rwsem);
+ } else {
+ spin_unlock(&ci->i_ceph_lock);
}
if (fill_inline)
@@ -3029,23 +3050,24 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
- struct ceph_cap_flush *cf;
- struct rb_node *n;
+ struct ceph_cap_flush *cf, *tmp_cf;
LIST_HEAD(to_remove);
unsigned seq = le32_to_cpu(m->seq);
int dirty = le32_to_cpu(m->dirty);
int cleaned = 0;
- int drop = 0;
+ bool drop = false;
+ bool wake_ci = 0;
+ bool wake_mdsc = 0;
- n = rb_first(&ci->i_cap_flush_tree);
- while (n) {
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
- n = rb_next(&cf->i_node);
+ list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
if (cf->tid == flush_tid)
cleaned = cf->caps;
+ if (cf->caps == 0) /* capsnap */
+ continue;
if (cf->tid <= flush_tid) {
- rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
- list_add_tail(&cf->list, &to_remove);
+ if (__finish_cap_flush(NULL, ci, cf))
+ wake_ci = true;
+ list_add_tail(&cf->i_list, &to_remove);
} else {
cleaned &= ~cf->caps;
if (!cleaned)
@@ -3066,31 +3088,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&to_remove)) {
- list_for_each_entry(cf, &to_remove, list)
- rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
-
- n = rb_first(&mdsc->cap_flush_tree);
- cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
- if (!cf || cf->tid > flush_tid)
- wake_up_all(&mdsc->cap_flushing_wq);
+ list_for_each_entry(cf, &to_remove, i_list) {
+ if (__finish_cap_flush(mdsc, NULL, cf))
+ wake_mdsc = true;
}
if (ci->i_flushing_caps == 0) {
- list_del_init(&ci->i_flushing_item);
- if (!list_empty(&session->s_cap_flushing))
- dout(" mds%d still flushing cap on %p\n",
- session->s_mds,
- &list_entry(session->s_cap_flushing.next,
- struct ceph_inode_info,
- i_flushing_item)->vfs_inode);
+ if (list_empty(&ci->i_cap_flush_list)) {
+ list_del_init(&ci->i_flushing_item);
+ if (!list_empty(&session->s_cap_flushing)) {
+ dout(" mds%d still flushing cap on %p\n",
+ session->s_mds,
+ &list_first_entry(&session->s_cap_flushing,
+ struct ceph_inode_info,
+ i_flushing_item)->vfs_inode);
+ }
+ }
mdsc->num_cap_flushing--;
dout(" inode %p now !flushing\n", inode);
if (ci->i_dirty_caps == 0) {
dout(" inode %p now clean\n", inode);
BUG_ON(!list_empty(&ci->i_dirty_item));
- drop = 1;
+ drop = true;
if (ci->i_wr_ref == 0 &&
ci->i_wrbuffer_ref_head == 0) {
BUG_ON(!ci->i_head_snapc);
@@ -3102,17 +3122,21 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
}
}
spin_unlock(&mdsc->cap_dirty_lock);
- wake_up_all(&ci->i_cap_wq);
out:
spin_unlock(&ci->i_ceph_lock);
while (!list_empty(&to_remove)) {
cf = list_first_entry(&to_remove,
- struct ceph_cap_flush, list);
- list_del(&cf->list);
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
ceph_free_cap_flush(cf);
}
+
+ if (wake_ci)
+ wake_up_all(&ci->i_cap_wq);
+ if (wake_mdsc)
+ wake_up_all(&mdsc->cap_flushing_wq);
if (drop)
iput(inode);
}
@@ -3131,7 +3155,9 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
u64 follows = le64_to_cpu(m->snap_follows);
struct ceph_cap_snap *capsnap;
- int drop = 0;
+ bool flushed = false;
+ bool wake_ci = false;
+ bool wake_mdsc = false;
dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
inode, ci, session->s_mds, follows);
@@ -3139,30 +3165,47 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
if (capsnap->follows == follows) {
- if (capsnap->flush_tid != flush_tid) {
+ if (capsnap->cap_flush.tid != flush_tid) {
dout(" cap_snap %p follows %lld tid %lld !="
" %lld\n", capsnap, follows,
- flush_tid, capsnap->flush_tid);
+ flush_tid, capsnap->cap_flush.tid);
break;
}
- WARN_ON(capsnap->dirty_pages || capsnap->writing);
- dout(" removing %p cap_snap %p follows %lld\n",
- inode, capsnap, follows);
- ceph_put_snap_context(capsnap->context);
- list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
- ceph_put_cap_snap(capsnap);
- wake_up_all(&mdsc->cap_flushing_wq);
- drop = 1;
+ flushed = true;
break;
} else {
dout(" skipping cap_snap %p follows %lld\n",
capsnap, capsnap->follows);
}
}
+ if (flushed) {
+ WARN_ON(capsnap->dirty_pages || capsnap->writing);
+ dout(" removing %p cap_snap %p follows %lld\n",
+ inode, capsnap, follows);
+ list_del(&capsnap->ci_item);
+ if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush))
+ wake_ci = true;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+
+ if (list_empty(&ci->i_cap_flush_list))
+ list_del_init(&ci->i_flushing_item);
+
+ if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush))
+ wake_mdsc = true;
+
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
spin_unlock(&ci->i_ceph_lock);
- if (drop)
+ if (flushed) {
+ ceph_put_snap_context(capsnap->context);
+ ceph_put_cap_snap(capsnap);
+ if (wake_ci)
+ wake_up_all(&ci->i_cap_wq);
+ if (wake_mdsc)
+ wake_up_all(&mdsc->cap_flushing_wq);
iput(inode);
+ }
}
/*
@@ -3267,7 +3310,8 @@ retry:
tcap->implemented |= issued;
if (cap == ci->i_auth_cap)
ci->i_auth_cap = tcap;
- if (ci->i_flushing_caps && ci->i_auth_cap == tcap) {
+ if (!list_empty(&ci->i_cap_flush_list) &&
+ ci->i_auth_cap == tcap) {
spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item,
&tcap->session->s_cap_flushing);
@@ -3420,20 +3464,18 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_cap *cap;
struct ceph_mds_caps *h;
struct ceph_mds_cap_peer *peer = NULL;
- struct ceph_snap_realm *realm;
+ struct ceph_snap_realm *realm = NULL;
+ struct ceph_string *pool_ns = NULL;
int mds = session->s_mds;
int op, issued;
u32 seq, mseq;
struct ceph_vino vino;
- u64 cap_id;
- u64 size, max_size;
u64 tid;
u64 inline_version = 0;
void *inline_data = NULL;
u32 inline_len = 0;
void *snaptrace;
size_t snaptrace_len;
- u32 pool_ns_len = 0;
void *p, *end;
dout("handle_caps from mds%d\n", mds);
@@ -3447,11 +3489,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
op = le32_to_cpu(h->op);
vino.ino = le64_to_cpu(h->ino);
vino.snap = CEPH_NOSNAP;
- cap_id = le64_to_cpu(h->cap_id);
seq = le32_to_cpu(h->seq);
mseq = le32_to_cpu(h->migrate_seq);
- size = le64_to_cpu(h->size);
- max_size = le64_to_cpu(h->max_size);
snaptrace = h + 1;
snaptrace_len = le32_to_cpu(h->snap_trace_len);
@@ -3490,6 +3529,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
u64 flush_tid;
u32 caller_uid, caller_gid;
u32 osd_epoch_barrier;
+ u32 pool_ns_len;
/* version >= 5 */
ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
/* version >= 6 */
@@ -3499,6 +3539,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
ceph_decode_32_safe(&p, end, caller_gid, bad);
/* version >= 8 */
ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+ if (pool_ns_len > 0) {
+ ceph_decode_need(&p, end, pool_ns_len, bad);
+ pool_ns = ceph_find_or_create_string(p, pool_ns_len);
+ p += pool_ns_len;
+ }
}
/* lookup ino */
@@ -3519,7 +3564,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
cap = ceph_get_cap(mdsc, NULL);
cap->cap_ino = vino.ino;
cap->queue_release = 1;
- cap->cap_id = cap_id;
+ cap->cap_id = le64_to_cpu(h->cap_id);
cap->mseq = mseq;
cap->seq = seq;
spin_lock(&session->s_cap_lock);
@@ -3554,10 +3599,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
handle_cap_import(mdsc, inode, h, peer, session,
&cap, &issued);
- handle_cap_grant(mdsc, inode, h,
+ handle_cap_grant(mdsc, inode, h, &pool_ns,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued,
- pool_ns_len);
+ msg->middle, session, cap, issued);
if (realm)
ceph_put_snap_realm(mdsc, realm);
goto done_unlocked;
@@ -3579,10 +3623,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_GRANT:
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
- handle_cap_grant(mdsc, inode, h,
+ handle_cap_grant(mdsc, inode, h, &pool_ns,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued,
- pool_ns_len);
+ msg->middle, session, cap, issued);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
@@ -3613,6 +3656,7 @@ done:
mutex_unlock(&session->s_mutex);
done_unlocked:
iput(inode);
+ ceph_put_string(pool_ns);
return;
bad:
@@ -3673,6 +3717,16 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
dout("flush_dirty_caps done\n");
}
+void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
+{
+ int i;
+ int bits = (fmode << 1) | 1;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (bits & (1 << i))
+ ci->i_nr_by_mode[i]++;
+ }
+}
+
/*
* Drop open file reference. If we were the last open file,
* we may need to release capabilities to the MDS (or schedule
@@ -3680,15 +3734,20 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
*/
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
{
- struct inode *inode = &ci->vfs_inode;
- int last = 0;
-
+ int i, last = 0;
+ int bits = (fmode << 1) | 1;
spin_lock(&ci->i_ceph_lock);
- dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
- ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
- BUG_ON(ci->i_nr_by_mode[fmode] == 0);
- if (--ci->i_nr_by_mode[fmode] == 0)
- last++;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (bits & (1 << i)) {
+ BUG_ON(ci->i_nr_by_mode[i] == 0);
+ if (--ci->i_nr_by_mode[i] == 0)
+ last++;
+ }
+ }
+ dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
+ &ci->vfs_inode, fmode,
+ ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
+ ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
spin_unlock(&ci->i_ceph_lock);
if (last && ci->i_vino.snap == CEPH_NOSNAP)
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 6e0fedf6713b..c64a0b794d49 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -59,7 +59,7 @@ int ceph_init_dentry(struct dentry *dentry)
di->dentry = dentry;
di->lease_session = NULL;
- dentry->d_time = jiffies;
+ di->time = jiffies;
/* avoid reordering d_fsdata setup so that the check above is safe */
smp_mb();
dentry->d_fsdata = di;
@@ -1124,7 +1124,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
spin_lock(&dentry->d_lock);
- dentry->d_time = jiffies;
+ ceph_dentry(dentry)->time = jiffies;
ceph_dentry(dentry)->lease_shared_gen = 0;
spin_unlock(&dentry->d_lock);
}
@@ -1133,7 +1133,8 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
* Check if dentry lease is valid. If not, delete the lease. Try to
* renew if the least is more than half up.
*/
-static int dentry_lease_is_valid(struct dentry *dentry)
+static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
+ struct inode *dir)
{
struct ceph_dentry_info *di;
struct ceph_mds_session *s;
@@ -1141,12 +1142,11 @@ static int dentry_lease_is_valid(struct dentry *dentry)
u32 gen;
unsigned long ttl;
struct ceph_mds_session *session = NULL;
- struct inode *dir = NULL;
u32 seq = 0;
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
- if (di->lease_session) {
+ if (di && di->lease_session) {
s = di->lease_session;
spin_lock(&s->s_gen_ttl_lock);
gen = s->s_cap_gen;
@@ -1154,17 +1154,24 @@ static int dentry_lease_is_valid(struct dentry *dentry)
spin_unlock(&s->s_gen_ttl_lock);
if (di->lease_gen == gen &&
- time_before(jiffies, dentry->d_time) &&
+ time_before(jiffies, di->time) &&
time_before(jiffies, ttl)) {
valid = 1;
if (di->lease_renew_after &&
time_after(jiffies, di->lease_renew_after)) {
- /* we should renew */
- dir = d_inode(dentry->d_parent);
- session = ceph_get_mds_session(s);
- seq = di->lease_seq;
- di->lease_renew_after = 0;
- di->lease_renew_from = jiffies;
+ /*
+ * We should renew. If we're in RCU walk mode
+ * though, we can't do that so just return
+ * -ECHILD.
+ */
+ if (flags & LOOKUP_RCU) {
+ valid = -ECHILD;
+ } else {
+ session = ceph_get_mds_session(s);
+ seq = di->lease_seq;
+ di->lease_renew_after = 0;
+ di->lease_renew_from = jiffies;
+ }
}
}
}
@@ -1207,15 +1214,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *parent;
struct inode *dir;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
+ if (flags & LOOKUP_RCU) {
+ parent = ACCESS_ONCE(dentry->d_parent);
+ dir = d_inode_rcu(parent);
+ if (!dir)
+ return -ECHILD;
+ } else {
+ parent = dget_parent(dentry);
+ dir = d_inode(parent);
+ }
dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
- parent = dget_parent(dentry);
- dir = d_inode(parent);
-
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
@@ -1224,12 +1235,16 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
} else if (d_really_is_positive(dentry) &&
ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
valid = 1;
- } else if (dentry_lease_is_valid(dentry) ||
- dir_lease_is_valid(dir, dentry)) {
- if (d_really_is_positive(dentry))
- valid = ceph_is_any_caps(d_inode(dentry));
- else
- valid = 1;
+ } else {
+ valid = dentry_lease_is_valid(dentry, flags, dir);
+ if (valid == -ECHILD)
+ return valid;
+ if (valid || dir_lease_is_valid(dir, dentry)) {
+ if (d_really_is_positive(dentry))
+ valid = ceph_is_any_caps(d_inode(dentry));
+ else
+ valid = 1;
+ }
}
if (!valid) {
@@ -1238,6 +1253,9 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
struct ceph_mds_request *req;
int op, mask, err;
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
op = ceph_snap(dir) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
@@ -1273,7 +1291,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
ceph_dir_clear_complete(dir);
}
- dput(parent);
+ if (!(flags & LOOKUP_RCU))
+ dput(parent);
return valid;
}
@@ -1286,10 +1305,14 @@ static void ceph_d_release(struct dentry *dentry)
dout("d_release %p\n", dentry);
ceph_dentry_lru_del(dentry);
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry->d_lock);
+
if (di->lease_session)
ceph_put_mds_session(di->lease_session);
kmem_cache_free(ceph_dentry_cachep, di);
- dentry->d_fsdata = NULL;
}
static int ceph_snapdir_d_revalidate(struct dentry *dentry,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 0daaf7ceedc5..0f5375d8e030 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -708,7 +708,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
}
}
- ceph_put_page_vector(osd_data->pages, num_pages, false);
+ ceph_put_page_vector(osd_data->pages, num_pages, !aio_req->write);
ceph_osdc_put_request(req);
if (rc < 0)
@@ -821,6 +821,54 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
}
}
+/*
+ * Wait on any unsafe replies for the given inode. First wait on the
+ * newest request, and make that the upper bound. Then, if there are
+ * more requests, keep waiting on the oldest as long as it is still older
+ * than the original request.
+ */
+void ceph_sync_write_wait(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct list_head *head = &ci->i_unsafe_writes;
+ struct ceph_osd_request *req;
+ u64 last_tid;
+
+ if (!S_ISREG(inode->i_mode))
+ return;
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (list_empty(head))
+ goto out;
+
+ /* set upper bound as _last_ entry in chain */
+
+ req = list_last_entry(head, struct ceph_osd_request,
+ r_unsafe_item);
+ last_tid = req->r_tid;
+
+ do {
+ ceph_osdc_get_request(req);
+ spin_unlock(&ci->i_unsafe_lock);
+
+ dout("sync_write_wait on tid %llu (until %llu)\n",
+ req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ ceph_osdc_put_request(req);
+
+ spin_lock(&ci->i_unsafe_lock);
+ /*
+ * from here on look at first entry in chain, since we
+ * only want to wait for anything older than last_tid
+ */
+ if (list_empty(head))
+ break;
+ req = list_first_entry(head, struct ceph_osd_request,
+ r_unsafe_item);
+ } while (req->r_tid < last_tid);
+out:
+ spin_unlock(&ci->i_unsafe_lock);
+}
static ssize_t
ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
@@ -964,7 +1012,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
len = ret;
}
- ceph_put_page_vector(pages, num_pages, false);
+ ceph_put_page_vector(pages, num_pages, !write);
ceph_osdc_put_request(req);
if (ret < 0)
@@ -985,6 +1033,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
}
if (aio_req) {
+ LIST_HEAD(osd_reqs);
+
if (aio_req->num_reqs == 0) {
kfree(aio_req);
return ret;
@@ -993,8 +1043,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
CEPH_CAP_FILE_RD);
- while (!list_empty(&aio_req->osd_reqs)) {
- req = list_first_entry(&aio_req->osd_reqs,
+ list_splice(&aio_req->osd_reqs, &osd_reqs);
+ while (!list_empty(&osd_reqs)) {
+ req = list_first_entry(&osd_reqs,
struct ceph_osd_request,
r_unsafe_item);
list_del_init(&req->r_unsafe_item);
@@ -1448,16 +1499,14 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
loff_t i_size;
- int ret;
+ loff_t ret;
inode_lock(inode);
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
- if (ret < 0) {
- offset = ret;
+ if (ret < 0)
goto out;
- }
}
i_size = i_size_read(inode);
@@ -1473,7 +1522,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
* write() or lseek() might have altered it
*/
if (offset == 0) {
- offset = file->f_pos;
+ ret = file->f_pos;
goto out;
}
offset += file->f_pos;
@@ -1493,11 +1542,11 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
break;
}
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+ ret = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
inode_unlock(inode);
- return offset;
+ return ret;
}
static inline void ceph_zero_partial_page(
@@ -1583,9 +1632,9 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
{
int ret = 0;
struct ceph_inode_info *ci = ceph_inode(inode);
- s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
- s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
- s32 object_size = ceph_file_layout_object_size(ci->i_layout);
+ s32 stripe_unit = ci->i_layout.stripe_unit;
+ s32 stripe_count = ci->i_layout.stripe_count;
+ s32 object_size = ci->i_layout.object_size;
u64 object_set_size = object_size * stripe_count;
u64 nearly, t;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 99bdef66213a..dd3a6dbf71eb 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -446,7 +446,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
- ci->i_pool_ns_len = 0;
+ RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
ci->i_fragtree = RB_ROOT;
mutex_init(&ci->i_fragtree_mutex);
@@ -468,7 +468,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ci->i_dirty_item);
INIT_LIST_HEAD(&ci->i_flushing_item);
ci->i_prealloc_cap_flush = NULL;
- ci->i_cap_flush_tree = RB_ROOT;
+ INIT_LIST_HEAD(&ci->i_cap_flush_list);
init_waitqueue_head(&ci->i_cap_wq);
ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
@@ -477,7 +477,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
- for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
ci->i_nr_by_mode[i] = 0;
mutex_init(&ci->i_truncate_mutex);
@@ -570,6 +570,8 @@ void ceph_destroy_inode(struct inode *inode)
if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+ ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+
call_rcu(&inode->i_rcu, ceph_i_callback);
}
@@ -583,6 +585,14 @@ int ceph_drop_inode(struct inode *inode)
return 1;
}
+void ceph_evict_inode(struct inode *inode)
+{
+ /* wait unsafe sync writes */
+ ceph_sync_write_wait(inode);
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
static inline blkcnt_t calc_inode_blocks(u64 size)
{
return (size + (1<<9) - 1) >> 9;
@@ -733,6 +743,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
int issued = 0, implemented, new_issued;
struct timespec mtime, atime, ctime;
struct ceph_buffer *xattr_blob = NULL;
+ struct ceph_string *pool_ns = NULL;
struct ceph_cap *new_cap = NULL;
int err = 0;
bool wake = false;
@@ -760,6 +771,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
iinfo->xattr_len);
}
+ if (iinfo->pool_ns_len > 0)
+ pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
+ iinfo->pool_ns_len);
+
spin_lock(&ci->i_ceph_lock);
/*
@@ -814,10 +829,18 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (new_version ||
(new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
- if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
+ s64 old_pool = ci->i_layout.pool_id;
+ struct ceph_string *old_ns;
+
+ ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
+ old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+ lockdep_is_held(&ci->i_ceph_lock));
+ rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
+
+ if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
- ci->i_layout = info->layout;
- ci->i_pool_ns_len = iinfo->pool_ns_len;
+
+ pool_ns = old_ns;
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq),
@@ -985,6 +1008,7 @@ out:
ceph_put_cap(mdsc, new_cap);
if (xattr_blob)
ceph_buffer_put(xattr_blob);
+ ceph_put_string(pool_ns);
return err;
}
@@ -1018,7 +1042,7 @@ static void update_dentry_lease(struct dentry *dentry,
goto out_unlock;
if (di->lease_gen == session->s_cap_gen &&
- time_before(ttl, dentry->d_time))
+ time_before(ttl, di->time))
goto out_unlock; /* we already have a newer lease. */
if (di->lease_session && di->lease_session != session)
@@ -1032,7 +1056,7 @@ static void update_dentry_lease(struct dentry *dentry,
di->lease_seq = le32_to_cpu(lease->seq);
di->lease_renew_after = half_ttl;
di->lease_renew_from = 0;
- dentry->d_time = ttl;
+ di->time = ttl;
out_unlock:
spin_unlock(&dentry->d_lock);
return;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index be6b1657b1af..7d752d53353a 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -21,10 +21,10 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
if (!err) {
- l.stripe_unit = ceph_file_layout_su(ci->i_layout);
- l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
- l.object_size = ceph_file_layout_object_size(ci->i_layout);
- l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+ l.stripe_unit = ci->i_layout.stripe_unit;
+ l.stripe_count = ci->i_layout.stripe_count;
+ l.object_size = ci->i_layout.object_size;
+ l.data_pool = ci->i_layout.pool_id;
l.preferred_osd = (s32)-1;
if (copy_to_user(arg, &l, sizeof(l)))
return -EFAULT;
@@ -82,19 +82,19 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
if (l.stripe_count)
nl.stripe_count = l.stripe_count;
else
- nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ nl.stripe_count = ci->i_layout.stripe_count;
if (l.stripe_unit)
nl.stripe_unit = l.stripe_unit;
else
- nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
+ nl.stripe_unit = ci->i_layout.stripe_unit;
if (l.object_size)
nl.object_size = l.object_size;
else
- nl.object_size = ceph_file_layout_object_size(ci->i_layout);
+ nl.object_size = ci->i_layout.object_size;
if (l.data_pool)
nl.data_pool = l.data_pool;
else
- nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout);
+ nl.data_pool = ci->i_layout.pool_id;
/* this is obsolete, and always -1 */
nl.preferred_osd = le64_to_cpu(-1);
@@ -183,7 +183,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
struct ceph_object_locator oloc;
- struct ceph_object_id oid;
+ CEPH_DEFINE_OID_ONSTACK(oid);
u64 len = 1, olen;
u64 tmp;
struct ceph_pg pgid;
@@ -202,8 +202,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
return -EIO;
}
dl.file_offset -= dl.object_offset;
- dl.object_size = ceph_file_layout_object_size(ci->i_layout);
- dl.block_size = ceph_file_layout_su(ci->i_layout);
+ dl.object_size = ci->i_layout.object_size;
+ dl.block_size = ci->i_layout.stripe_unit;
/* block_offset = object_offset % block_size */
tmp = dl.object_offset;
@@ -212,10 +212,13 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
- oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
+ oloc.pool = ci->i_layout.pool_id;
+ oloc.pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
ceph_oid_printf(&oid, "%s", dl.object_name);
r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
+
+ ceph_oloc_destroy(&oloc);
if (r < 0) {
up_read(&osdc->lock);
return r;
@@ -247,9 +250,8 @@ static long ceph_ioctl_lazyio(struct file *file)
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
spin_lock(&ci->i_ceph_lock);
- ci->i_nr_by_mode[fi->fmode]--;
fi->fmode |= CEPH_FILE_MODE_LAZY;
- ci->i_nr_by_mode[fi->fmode]++;
+ ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4e8678a612b6..fa59a85226b2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -48,7 +48,7 @@
struct ceph_reconnect_state {
int nr_caps;
struct ceph_pagelist *pagelist;
- bool flock;
+ unsigned msg_version;
};
static void __wake_requests(struct ceph_mds_client *mdsc,
@@ -100,12 +100,15 @@ static int parse_reply_info_in(void **p, void *end,
} else
info->inline_version = CEPH_INLINE_NONE;
+ info->pool_ns_len = 0;
+ info->pool_ns_data = NULL;
if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
- ceph_decode_need(p, end, info->pool_ns_len, bad);
- *p += info->pool_ns_len;
- } else {
- info->pool_ns_len = 0;
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
}
return 0;
@@ -469,7 +472,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
INIT_LIST_HEAD(&s->s_cap_flushing);
- INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
dout("register_session mds%d\n", mds);
if (mds >= mdsc->max_sessions) {
@@ -1145,19 +1147,17 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
invalidate = true;
- while (true) {
- struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
- if (!n)
- break;
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
- rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
- list_add(&cf->list, &to_remove);
+ while (!list_empty(&ci->i_cap_flush_list)) {
+ cf = list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
+ list_add(&cf->i_list, &to_remove);
}
spin_lock(&mdsc->cap_dirty_lock);
- list_for_each_entry(cf, &to_remove, list)
- rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+ list_for_each_entry(cf, &to_remove, i_list)
+ list_del(&cf->g_list);
if (!list_empty(&ci->i_dirty_item)) {
pr_warn_ratelimited(
@@ -1181,7 +1181,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_unlock(&mdsc->cap_dirty_lock);
if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
- list_add(&ci->i_prealloc_cap_flush->list, &to_remove);
+ list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
ci->i_prealloc_cap_flush = NULL;
}
}
@@ -1189,8 +1189,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
while (!list_empty(&to_remove)) {
struct ceph_cap_flush *cf;
cf = list_first_entry(&to_remove,
- struct ceph_cap_flush, list);
- list_del(&cf->list);
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
ceph_free_cap_flush(cf);
}
@@ -1212,6 +1212,8 @@ static void remove_session_caps(struct ceph_mds_session *session)
dout("remove_session_caps on %p\n", session);
iterate_session_caps(session, remove_session_caps_cb, fsc);
+ wake_up_all(&fsc->mdsc->cap_flushing_wq);
+
spin_lock(&session->s_cap_lock);
if (session->s_nr_caps > 0) {
struct inode *inode;
@@ -1478,35 +1480,21 @@ static int trim_caps(struct ceph_mds_client *mdsc,
return 0;
}
-static int check_capsnap_flush(struct ceph_inode_info *ci,
- u64 want_snap_seq)
-{
- int ret = 1;
- spin_lock(&ci->i_ceph_lock);
- if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
- struct ceph_cap_snap *capsnap =
- list_first_entry(&ci->i_cap_snaps,
- struct ceph_cap_snap, ci_item);
- ret = capsnap->follows >= want_snap_seq;
- }
- spin_unlock(&ci->i_ceph_lock);
- return ret;
-}
-
static int check_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid)
{
- struct rb_node *n;
- struct ceph_cap_flush *cf;
int ret = 1;
spin_lock(&mdsc->cap_dirty_lock);
- n = rb_first(&mdsc->cap_flush_tree);
- cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
- if (cf && cf->tid <= want_flush_tid) {
- dout("check_caps_flush still flushing tid %llu <= %llu\n",
- cf->tid, want_flush_tid);
- ret = 0;
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_first_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ if (cf->tid <= want_flush_tid) {
+ dout("check_caps_flush still flushing tid "
+ "%llu <= %llu\n", cf->tid, want_flush_tid);
+ ret = 0;
+ }
}
spin_unlock(&mdsc->cap_dirty_lock);
return ret;
@@ -1518,54 +1506,9 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
* returns true if we've flushed through want_flush_tid
*/
static void wait_caps_flush(struct ceph_mds_client *mdsc,
- u64 want_flush_tid, u64 want_snap_seq)
+ u64 want_flush_tid)
{
- int mds;
-
- dout("check_caps_flush want %llu snap want %llu\n",
- want_flush_tid, want_snap_seq);
- mutex_lock(&mdsc->mutex);
- for (mds = 0; mds < mdsc->max_sessions; ) {
- struct ceph_mds_session *session = mdsc->sessions[mds];
- struct inode *inode = NULL;
-
- if (!session) {
- mds++;
- continue;
- }
- get_session(session);
- mutex_unlock(&mdsc->mutex);
-
- mutex_lock(&session->s_mutex);
- if (!list_empty(&session->s_cap_snaps_flushing)) {
- struct ceph_cap_snap *capsnap =
- list_first_entry(&session->s_cap_snaps_flushing,
- struct ceph_cap_snap,
- flushing_item);
- struct ceph_inode_info *ci = capsnap->ci;
- if (!check_capsnap_flush(ci, want_snap_seq)) {
- dout("check_cap_flush still flushing snap %p "
- "follows %lld <= %lld to mds%d\n",
- &ci->vfs_inode, capsnap->follows,
- want_snap_seq, mds);
- inode = igrab(&ci->vfs_inode);
- }
- }
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
-
- if (inode) {
- wait_event(mdsc->cap_flushing_wq,
- check_capsnap_flush(ceph_inode(inode),
- want_snap_seq));
- iput(inode);
- } else {
- mds++;
- }
-
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
+ dout("check_caps_flush want %llu\n", want_flush_tid);
wait_event(mdsc->cap_flushing_wq,
check_caps_flush(mdsc, want_flush_tid));
@@ -2163,6 +2106,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
mds = __choose_mds(mdsc, req);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+ if (mdsc->mdsmap_err) {
+ err = mdsc->mdsmap_err;
+ dout("do_request mdsmap err %d\n", err);
+ goto finish;
+ }
dout("do_request no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
goto out;
@@ -2292,14 +2240,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
- /* deny access to directories with pool_ns layouts */
- if (req->r_inode && S_ISDIR(req->r_inode->i_mode) &&
- ceph_inode(req->r_inode)->i_pool_ns_len)
- return -EIO;
- if (req->r_locked_dir &&
- ceph_inode(req->r_locked_dir)->i_pool_ns_len)
- return -EIO;
-
/* issue */
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
@@ -2791,13 +2731,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_mds_cap_reconnect v2;
struct ceph_mds_cap_reconnect_v1 v1;
} rec;
- size_t reclen;
struct ceph_inode_info *ci;
struct ceph_reconnect_state *recon_state = arg;
struct ceph_pagelist *pagelist = recon_state->pagelist;
char *path;
int pathlen, err;
u64 pathbase;
+ u64 snap_follows;
struct dentry *dentry;
ci = cap->ci;
@@ -2820,9 +2760,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
path = NULL;
pathlen = 0;
}
- err = ceph_pagelist_encode_string(pagelist, path, pathlen);
- if (err)
- goto out_free;
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
@@ -2830,14 +2767,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
cap->mseq = 0; /* and migrate_seq */
cap->cap_gen = cap->session->s_cap_gen;
- if (recon_state->flock) {
+ if (recon_state->msg_version >= 2) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v2.pathbase = cpu_to_le64(pathbase);
rec.v2.flock_len = 0;
- reclen = sizeof(rec.v2);
} else {
rec.v1.cap_id = cpu_to_le64(cap->cap_id);
rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
@@ -2847,13 +2783,23 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v1.pathbase = cpu_to_le64(pathbase);
- reclen = sizeof(rec.v1);
+ }
+
+ if (list_empty(&ci->i_cap_snaps)) {
+ snap_follows = 0;
+ } else {
+ struct ceph_cap_snap *capsnap =
+ list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap, ci_item);
+ snap_follows = capsnap->follows;
}
spin_unlock(&ci->i_ceph_lock);
- if (recon_state->flock) {
+ if (recon_state->msg_version >= 2) {
int num_fcntl_locks, num_flock_locks;
struct ceph_filelock *flocks;
+ size_t struct_len, total_len = 0;
+ u8 struct_v = 0;
encode_again:
ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
@@ -2872,20 +2818,51 @@ encode_again:
goto encode_again;
goto out_free;
}
+
+ if (recon_state->msg_version >= 3) {
+ /* version, compat_version and struct_len */
+ total_len = 2 * sizeof(u8) + sizeof(u32);
+ struct_v = 2;
+ }
/*
* number of encoded locks is stable, so copy to pagelist
*/
- rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
- (num_fcntl_locks+num_flock_locks) *
- sizeof(struct ceph_filelock));
- err = ceph_pagelist_append(pagelist, &rec, reclen);
- if (!err)
- err = ceph_locks_to_pagelist(flocks, pagelist,
- num_fcntl_locks,
- num_flock_locks);
+ struct_len = 2 * sizeof(u32) +
+ (num_fcntl_locks + num_flock_locks) *
+ sizeof(struct ceph_filelock);
+ rec.v2.flock_len = cpu_to_le32(struct_len);
+
+ struct_len += sizeof(rec.v2);
+ struct_len += sizeof(u32) + pathlen;
+
+ if (struct_v >= 2)
+ struct_len += sizeof(u64); /* snap_follows */
+
+ total_len += struct_len;
+ err = ceph_pagelist_reserve(pagelist, total_len);
+
+ if (!err) {
+ if (recon_state->msg_version >= 3) {
+ ceph_pagelist_encode_8(pagelist, struct_v);
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, struct_len);
+ }
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
+ ceph_locks_to_pagelist(flocks, pagelist,
+ num_fcntl_locks,
+ num_flock_locks);
+ if (struct_v >= 2)
+ ceph_pagelist_encode_64(pagelist, snap_follows);
+ }
kfree(flocks);
} else {
- err = ceph_pagelist_append(pagelist, &rec, reclen);
+ size_t size = sizeof(u32) + pathlen + sizeof(rec.v1);
+ err = ceph_pagelist_reserve(pagelist, size);
+ if (!err) {
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
+ }
}
recon_state->nr_caps++;
@@ -2976,7 +2953,12 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
recon_state.nr_caps = 0;
recon_state.pagelist = pagelist;
- recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
+ if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
+ recon_state.msg_version = 3;
+ else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK)
+ recon_state.msg_version = 2;
+ else
+ recon_state.msg_version = 1;
err = iterate_session_caps(session, encode_caps_cb, &recon_state);
if (err < 0)
goto fail;
@@ -3005,8 +2987,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
goto fail;
}
- if (recon_state.flock)
- reply->hdr.version = cpu_to_le16(2);
+ reply->hdr.version = cpu_to_le16(recon_state.msg_version);
/* raced with cap release? */
if (s_nr_caps != recon_state.nr_caps) {
@@ -3231,7 +3212,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
msecs_to_jiffies(le32_to_cpu(h->duration_ms));
di->lease_seq = seq;
- dentry->d_time = di->lease_renew_from + duration;
+ di->time = di->lease_renew_from + duration;
di->lease_renew_after = di->lease_renew_from +
(duration >> 1);
di->lease_renew_from = 0;
@@ -3297,47 +3278,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
}
/*
- * Preemptively release a lease we expect to invalidate anyway.
- * Pass @inode always, @dentry is optional.
- */
-void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
- struct dentry *dentry)
-{
- struct ceph_dentry_info *di;
- struct ceph_mds_session *session;
- u32 seq;
-
- BUG_ON(inode == NULL);
- BUG_ON(dentry == NULL);
-
- /* is dentry lease valid? */
- spin_lock(&dentry->d_lock);
- di = ceph_dentry(dentry);
- if (!di || !di->lease_session ||
- di->lease_session->s_mds < 0 ||
- di->lease_gen != di->lease_session->s_cap_gen ||
- !time_before(jiffies, dentry->d_time)) {
- dout("lease_release inode %p dentry %p -- "
- "no lease\n",
- inode, dentry);
- spin_unlock(&dentry->d_lock);
- return;
- }
-
- /* we do have a lease on this dentry; note mds and seq */
- session = ceph_get_mds_session(di->lease_session);
- seq = di->lease_seq;
- __ceph_mdsc_drop_dentry_lease(dentry);
- spin_unlock(&dentry->d_lock);
-
- dout("lease_release inode %p dentry %p to mds%d\n",
- inode, dentry, session->s_mds);
- ceph_mdsc_lease_send_msg(session, inode, dentry,
- CEPH_MDS_LEASE_RELEASE, seq);
- ceph_put_mds_session(session);
-}
-
-/*
* drop all leases (and dentry refs) in preparation for umount
*/
static void drop_leases(struct ceph_mds_client *mdsc)
@@ -3470,7 +3410,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
mdsc->last_cap_flush_tid = 1;
- mdsc->cap_flush_tree = RB_ROOT;
+ INIT_LIST_HEAD(&mdsc->cap_flush_list);
INIT_LIST_HEAD(&mdsc->cap_dirty);
INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
mdsc->num_cap_flushing = 0;
@@ -3585,7 +3525,7 @@ restart:
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
- u64 want_tid, want_flush, want_snap;
+ u64 want_tid, want_flush;
if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return;
@@ -3598,17 +3538,19 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
ceph_flush_dirty_caps(mdsc);
spin_lock(&mdsc->cap_dirty_lock);
want_flush = mdsc->last_cap_flush_tid;
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_last_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ cf->wake = true;
+ }
spin_unlock(&mdsc->cap_dirty_lock);
- down_read(&mdsc->snap_rwsem);
- want_snap = mdsc->last_snap_seq;
- up_read(&mdsc->snap_rwsem);
-
- dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
- want_tid, want_flush, want_snap);
+ dout("sync want tid %lld flush_seq %lld\n",
+ want_tid, want_flush);
wait_unsafe_requests(mdsc, want_tid);
- wait_caps_flush(mdsc, want_flush, want_snap);
+ wait_caps_flush(mdsc, want_flush);
}
/*
@@ -3729,11 +3671,86 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
dout("mdsc_destroy %p done\n", mdsc);
}
+void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+ struct ceph_fs_client *fsc = mdsc->fsc;
+ const char *mds_namespace = fsc->mount_options->mds_namespace;
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ u32 epoch;
+ u32 map_len;
+ u32 num_fs;
+ u32 mount_fscid = (u32)-1;
+ u8 struct_v, struct_cv;
+ int err = -EINVAL;
+
+ ceph_decode_need(&p, end, sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+
+ dout("handle_fsmap epoch %u\n", epoch);
+
+ ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
+ struct_v = ceph_decode_8(&p);
+ struct_cv = ceph_decode_8(&p);
+ map_len = ceph_decode_32(&p);
+
+ ceph_decode_need(&p, end, sizeof(u32) * 3, bad);
+ p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */
+
+ num_fs = ceph_decode_32(&p);
+ while (num_fs-- > 0) {
+ void *info_p, *info_end;
+ u32 info_len;
+ u8 info_v, info_cv;
+ u32 fscid, namelen;
+
+ ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
+ info_v = ceph_decode_8(&p);
+ info_cv = ceph_decode_8(&p);
+ info_len = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, info_len, bad);
+ info_p = p;
+ info_end = p + info_len;
+ p = info_end;
+
+ ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
+ fscid = ceph_decode_32(&info_p);
+ namelen = ceph_decode_32(&info_p);
+ ceph_decode_need(&info_p, info_end, namelen, bad);
+
+ if (mds_namespace &&
+ strlen(mds_namespace) == namelen &&
+ !strncmp(mds_namespace, (char *)info_p, namelen)) {
+ mount_fscid = fscid;
+ break;
+ }
+ }
+
+ ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
+ if (mount_fscid != (u32)-1) {
+ fsc->client->monc.fs_cluster_id = mount_fscid;
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+ 0, true);
+ ceph_monc_renew_subs(&fsc->client->monc);
+ } else {
+ err = -ENOENT;
+ goto err_out;
+ }
+ return;
+bad:
+ pr_err("error decoding fsmap\n");
+err_out:
+ mutex_lock(&mdsc->mutex);
+ mdsc->mdsmap_err = -ENOENT;
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+ mutex_unlock(&mdsc->mutex);
+ return;
+}
/*
* handle mds map update.
*/
-void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
{
u32 epoch;
u32 maplen;
@@ -3840,7 +3857,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
switch (type) {
case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(mdsc, msg);
+ ceph_mdsc_handle_mdsmap(mdsc, msg);
+ break;
+ case CEPH_MSG_FS_MAP_USER:
+ ceph_mdsc_handle_fsmap(mdsc, msg);
break;
case CEPH_MSG_CLIENT_SESSION:
handle_session(s, msg);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e7d38aac7109..6b3679737d4a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -45,6 +45,7 @@ struct ceph_mds_reply_info_in {
u32 inline_len;
char *inline_data;
u32 pool_ns_len;
+ char *pool_ns_data;
};
struct ceph_mds_reply_dir_entry {
@@ -151,7 +152,6 @@ struct ceph_mds_session {
/* protected by mutex */
struct list_head s_cap_flushing; /* inodes w/ flushing caps */
- struct list_head s_cap_snaps_flushing;
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
@@ -275,8 +275,10 @@ struct ceph_mds_request {
struct ceph_pool_perm {
struct rb_node node;
- u32 pool;
int perm;
+ s64 pool;
+ size_t pool_ns_len;
+ char pool_ns[];
};
/*
@@ -290,6 +292,7 @@ struct ceph_mds_client {
struct completion safe_umount_waiters;
wait_queue_head_t session_close_wq;
struct list_head waiting_for_map;
+ int mdsmap_err;
struct ceph_mds_session **sessions; /* NULL for mds if no session */
atomic_t num_sessions;
@@ -321,7 +324,7 @@ struct ceph_mds_client {
spinlock_t snap_flush_lock;
u64 last_cap_flush_tid;
- struct rb_root cap_flush_tree;
+ struct list_head cap_flush_list;
struct list_head cap_dirty; /* inodes with dirty caps */
struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */
@@ -382,10 +385,6 @@ extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
- struct inode *inode,
- struct dentry *dn);
-
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct inode *dir);
@@ -420,8 +419,10 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
struct dentry *dentry, char action,
u32 seq);
-extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg);
+extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
+extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
extern struct ceph_mds_session *
ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 9caaa7ffc93f..9ff5219d849e 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -520,9 +520,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
ihold(inode);
atomic_set(&capsnap->nref, 1);
- capsnap->ci = ci;
INIT_LIST_HEAD(&capsnap->ci_item);
- INIT_LIST_HEAD(&capsnap->flushing_item);
capsnap->follows = old_snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL);
@@ -551,7 +549,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
ci->i_wrbuffer_ref_head = 0;
capsnap->context = old_snapc;
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
- old_snapc = NULL;
if (used & CEPH_CAP_FILE_WR) {
dout("queue_cap_snap %p cap_snap %p snapc %p"
@@ -563,6 +560,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
__ceph_finish_cap_snap(ci, capsnap);
}
capsnap = NULL;
+ old_snapc = NULL;
update_snapc:
if (ci->i_head_snapc) {
@@ -603,6 +601,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->dirty_pages);
return 0;
}
+
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
inode, capsnap, capsnap->context,
capsnap->context->seq, ceph_cap_string(capsnap->dirty),
@@ -799,9 +799,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
inode = &ci->vfs_inode;
ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
- spin_lock(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, &session, 0);
- spin_unlock(&ci->i_ceph_lock);
+ ceph_flush_snaps(ci, &session);
iput(inode);
spin_lock(&mdsc->snap_flush_lock);
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 91e02481ce06..e247f6f0feb7 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -108,7 +108,6 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
* mount options
*/
enum {
- Opt_mds_namespace,
Opt_wsize,
Opt_rsize,
Opt_rasize,
@@ -121,6 +120,7 @@ enum {
Opt_last_int,
/* int args above */
Opt_snapdirname,
+ Opt_mds_namespace,
Opt_last_string,
/* string args above */
Opt_dirstat,
@@ -144,7 +144,6 @@ enum {
};
static match_table_t fsopt_tokens = {
- {Opt_mds_namespace, "mds_namespace=%d"},
{Opt_wsize, "wsize=%d"},
{Opt_rsize, "rsize=%d"},
{Opt_rasize, "rasize=%d"},
@@ -156,6 +155,7 @@ static match_table_t fsopt_tokens = {
{Opt_congestion_kb, "write_congestion_kb=%d"},
/* int args above */
{Opt_snapdirname, "snapdirname=%s"},
+ {Opt_mds_namespace, "mds_namespace=%s"},
/* string args above */
{Opt_dirstat, "dirstat"},
{Opt_nodirstat, "nodirstat"},
@@ -212,11 +212,14 @@ static int parse_fsopt_token(char *c, void *private)
if (!fsopt->snapdir_name)
return -ENOMEM;
break;
-
- /* misc */
case Opt_mds_namespace:
- fsopt->mds_namespace = intval;
+ fsopt->mds_namespace = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ if (!fsopt->mds_namespace)
+ return -ENOMEM;
break;
+ /* misc */
case Opt_wsize:
fsopt->wsize = intval;
break;
@@ -302,6 +305,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
{
dout("destroy_mount_options %p\n", args);
kfree(args->snapdir_name);
+ kfree(args->mds_namespace);
kfree(args->server_path);
kfree(args);
}
@@ -333,6 +337,9 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
if (ret)
return ret;
+ ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
+ if (ret)
+ return ret;
ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
if (ret)
@@ -376,7 +383,6 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
fsopt->congestion_kb = default_congestion_kb();
- fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE;
/*
* Distinguish the server list from the path in "dev_name".
@@ -469,8 +475,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noacl");
#endif
- if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE)
- seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace);
+ if (fsopt->mds_namespace)
+ seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace);
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -509,9 +515,11 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
switch (type) {
case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(fsc->mdsc, msg);
+ ceph_mdsc_handle_mdsmap(fsc->mdsc, msg);
+ return 0;
+ case CEPH_MSG_FS_MAP_USER:
+ ceph_mdsc_handle_fsmap(fsc->mdsc, msg);
return 0;
-
default:
return -1;
}
@@ -543,8 +551,14 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
goto fail;
}
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->monc.fs_cluster_id = fsopt->mds_namespace;
- ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
+
+ if (fsopt->mds_namespace == NULL) {
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+ 0, true);
+ } else {
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP,
+ 0, false);
+ }
fsc->mount_options = fsopt;
@@ -672,8 +686,8 @@ static int __init init_caches(void)
if (ceph_dentry_cachep == NULL)
goto bad_dentry;
- ceph_file_cachep = KMEM_CACHE(ceph_file_info,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
+
if (ceph_file_cachep == NULL)
goto bad_file;
@@ -731,6 +745,7 @@ static const struct super_operations ceph_super_ops = {
.destroy_inode = ceph_destroy_inode,
.write_inode = ceph_write_inode,
.drop_inode = ceph_drop_inode,
+ .evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
.show_options = ceph_show_options,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 0168b49fb6ad..3e3fa9163059 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -62,7 +62,6 @@ struct ceph_mount_options {
int cap_release_safety;
int max_readdir; /* max readdir result (entires) */
int max_readdir_bytes; /* max readdir result (bytes) */
- int mds_namespace;
/*
* everything above this point can be memcmp'd; everything below
@@ -70,6 +69,7 @@ struct ceph_mount_options {
*/
char *snapdir_name; /* default ".snap" */
+ char *mds_namespace; /* default NULL */
char *server_path; /* default "/" */
};
@@ -147,6 +147,14 @@ struct ceph_cap {
#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
+struct ceph_cap_flush {
+ u64 tid;
+ int caps; /* 0 means capsnap */
+ bool wake; /* wake up flush waiters when finish ? */
+ struct list_head g_list; // global
+ struct list_head i_list; // per inode
+};
+
/*
* Snapped cap state that is pending flush to mds. When a snapshot occurs,
* we first complete any in-process sync writes and writeback any dirty
@@ -154,10 +162,11 @@ struct ceph_cap {
*/
struct ceph_cap_snap {
atomic_t nref;
- struct ceph_inode_info *ci;
- struct list_head ci_item, flushing_item;
+ struct list_head ci_item;
+
+ struct ceph_cap_flush cap_flush;
- u64 follows, flush_tid;
+ u64 follows;
int issued, dirty;
struct ceph_snap_context *context;
@@ -186,16 +195,6 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
}
}
-struct ceph_cap_flush {
- u64 tid;
- int caps;
- struct rb_node g_node; // global
- union {
- struct rb_node i_node; // inode
- struct list_head list;
- };
-};
-
/*
* The frag tree describes how a directory is fragmented, potentially across
* multiple metadata servers. It is also used to indicate points where
@@ -246,7 +245,7 @@ struct ceph_dentry_info {
unsigned long lease_renew_after, lease_renew_from;
struct list_head lru;
struct dentry *dentry;
- u64 time;
+ unsigned long time;
u64 offset;
};
@@ -287,7 +286,6 @@ struct ceph_inode_info {
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
- size_t i_pool_ns_len;
char *i_symlink;
/* for dirs */
@@ -311,7 +309,7 @@ struct ceph_inode_info {
* overlapping, pipelined cap flushes to the mds. we can probably
* reduce the tid to 8 bits if we're concerned about inode size. */
struct ceph_cap_flush *i_prealloc_cap_flush;
- struct rb_root i_cap_flush_tree;
+ struct list_head i_cap_flush_list;
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
@@ -322,7 +320,7 @@ struct ceph_inode_info {
dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
- int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
+ int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */
struct mutex i_truncate_mutex;
u32 i_truncate_seq; /* last truncate to smaller size */
@@ -471,6 +469,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
+#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count,
@@ -750,6 +750,7 @@ extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
extern void ceph_destroy_inode(struct inode *inode);
extern int ceph_drop_inode(struct inode *inode);
+extern void ceph_evict_inode(struct inode *inode);
extern struct inode *ceph_get_inode(struct super_block *sb,
struct ceph_vino vino);
@@ -890,9 +891,8 @@ extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc);
-extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession,
- int again);
+extern void ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session);
extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
@@ -907,10 +907,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page);
/* for counting open files by mode */
-static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
-{
- ci->i_nr_by_mode[mode]++;
-}
+extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
/* addr.c */
@@ -931,6 +928,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
extern int ceph_release(struct inode *inode, struct file *filp);
extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
char *data, size_t len);
+extern void ceph_sync_write_wait(struct inode *inode);
/* dir.c */
extern const struct file_operations ceph_dir_fops;
extern const struct file_operations ceph_snapdir_fops;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 4870b29df224..adc231892b0d 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -57,81 +57,88 @@ struct ceph_vxattr {
static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
{
- size_t s;
- char *p = (char *)&ci->i_layout;
-
- for (s = 0; s < sizeof(ci->i_layout); s++, p++)
- if (*p)
- return true;
- return false;
+ struct ceph_file_layout *fl = &ci->i_layout;
+ return (fl->stripe_unit > 0 || fl->stripe_count > 0 ||
+ fl->object_size > 0 || fl->pool_id >= 0 ||
+ rcu_dereference_raw(fl->pool_ns) != NULL);
}
static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
size_t size)
{
- int ret;
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
- s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ struct ceph_string *pool_ns;
+ s64 pool = ci->i_layout.pool_id;
const char *pool_name;
+ const char *ns_field = " pool_namespace=";
char buf[128];
+ size_t len, total_len = 0;
+ int ret;
+
+ pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) {
- size_t len = strlen(pool_name);
- ret = snprintf(buf, sizeof(buf),
- "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
- if (!size) {
- ret += len;
- } else if (ret + len > size) {
- ret = -ERANGE;
- } else {
- memcpy(val, buf, ret);
+ len = snprintf(buf, sizeof(buf),
+ "stripe_unit=%u stripe_count=%u object_size=%u pool=",
+ ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+ ci->i_layout.object_size);
+ total_len = len + strlen(pool_name);
+ } else {
+ len = snprintf(buf, sizeof(buf),
+ "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld",
+ ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+ ci->i_layout.object_size, (unsigned long long)pool);
+ total_len = len;
+ }
+
+ if (pool_ns)
+ total_len += strlen(ns_field) + pool_ns->len;
+
+ if (!size) {
+ ret = total_len;
+ } else if (total_len > size) {
+ ret = -ERANGE;
+ } else {
+ memcpy(val, buf, len);
+ ret = len;
+ if (pool_name) {
+ len = strlen(pool_name);
memcpy(val + ret, pool_name, len);
ret += len;
}
- } else {
- ret = snprintf(buf, sizeof(buf),
- "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
- (unsigned long long)pool);
- if (size) {
- if (ret <= size)
- memcpy(val, buf, ret);
- else
- ret = -ERANGE;
+ if (pool_ns) {
+ len = strlen(ns_field);
+ memcpy(val + ret, ns_field, len);
+ ret += len;
+ memcpy(val + ret, pool_ns->str, pool_ns->len);
+ ret += pool_ns->len;
}
}
up_read(&osdc->lock);
+ ceph_put_string(pool_ns);
return ret;
}
static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_su(ci->i_layout));
+ return snprintf(val, size, "%u", ci->i_layout.stripe_unit);
}
static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+ return snprintf(val, size, "%u", ci->i_layout.stripe_count);
}
static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+ return snprintf(val, size, "%u", ci->i_layout.object_size);
}
static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
@@ -140,7 +147,7 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
int ret;
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
- s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ s64 pool = ci->i_layout.pool_id;
const char *pool_name;
down_read(&osdc->lock);
@@ -153,6 +160,18 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
return ret;
}
+static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ int ret = 0;
+ struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ if (ns) {
+ ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str);
+ ceph_put_string(ns);
+ }
+ return ret;
+}
+
/* directories */
static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -241,6 +260,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
XATTR_LAYOUT_FIELD(dir, layout, object_size),
XATTR_LAYOUT_FIELD(dir, layout, pool),
+ XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
XATTR_NAME_CEPH(dir, entries),
XATTR_NAME_CEPH(dir, files),
XATTR_NAME_CEPH(dir, subdirs),
@@ -268,6 +288,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
XATTR_LAYOUT_FIELD(file, layout, stripe_count),
XATTR_LAYOUT_FIELD(file, layout, object_size),
XATTR_LAYOUT_FIELD(file, layout, pool),
+ XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
{ .name = NULL, 0 } /* Required table terminator */
};
static size_t ceph_file_vxattrs_name_size; /* total size of all names */
diff --git a/fs/exec.c b/fs/exec.c
index ca239fc86d8d..a1789cd684bf 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -866,7 +866,8 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
goto out;
}
- *buf = vmalloc(i_size);
+ if (id != READING_FIRMWARE_PREALLOC_BUFFER)
+ *buf = vmalloc(i_size);
if (!*buf) {
ret = -ENOMEM;
goto out;
@@ -897,8 +898,10 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
out_free:
if (ret < 0) {
- vfree(*buf);
- *buf = NULL;
+ if (id != READING_FIRMWARE_PREALLOC_BUFFER) {
+ vfree(*buf);
+ *buf = NULL;
+ }
}
out:
diff --git a/fs/inode.c b/fs/inode.c
index 9cef4e16aeda..ad445542c285 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(inc_nlink);
void address_space_init_once(struct address_space *mapping)
{
memset(mapping, 0, sizeof(*mapping));
- INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+ INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT);
spin_lock_init(&mapping->tree_lock);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 8664417955a2..6abdda209642 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
CFLAGS_nfstrace.o += -I$(src)
nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
- direct.o pagelist.o read.o symlink.o unlink.o \
+ io.o direct.o pagelist.o read.o symlink.o unlink.o \
write.o namespace.o mount_clnt.o nfstrace.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index e5b89675263e..a69ef4e9c24c 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -65,8 +65,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
if (!p)
return -EIO;
b->simple.nr_sigs = be32_to_cpup(p++);
- if (!b->simple.nr_sigs) {
- dprintk("no signature\n");
+ if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) {
+ dprintk("Bad signature count: %d\n", b->simple.nr_sigs);
return -EIO;
}
@@ -89,7 +89,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
memcpy(&b->simple.sigs[i].sig, p,
b->simple.sigs[i].sig_len);
- b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
+ b->simple.len += 8 + 4 + \
+ (XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2);
}
break;
case PNFS_BLOCK_VOLUME_SLICE:
@@ -104,7 +105,12 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
p = xdr_inline_decode(xdr, 4);
if (!p)
return -EIO;
+
b->concat.volumes_count = be32_to_cpup(p++);
+ if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
+ dprintk("Too many volumes: %d\n", b->concat.volumes_count);
+ return -EIO;
+ }
p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
if (!p)
@@ -116,8 +122,13 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
p = xdr_inline_decode(xdr, 8 + 4);
if (!p)
return -EIO;
+
p = xdr_decode_hyper(p, &b->stripe.chunk_size);
b->stripe.volumes_count = be32_to_cpup(p++);
+ if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
+ dprintk("Too many volumes: %d\n", b->stripe.volumes_count);
+ return -EIO;
+ }
p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
if (!p)
@@ -224,18 +235,20 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
+ struct block_device *bdev;
dev_t dev;
dev = bl_resolve_deviceid(server, v, gfp_mask);
if (!dev)
return -EIO;
- d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
- if (IS_ERR(d->bdev)) {
+ bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
+ if (IS_ERR(bdev)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
- MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
- return PTR_ERR(d->bdev);
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
+ return PTR_ERR(bdev);
}
+ d->bdev = bdev;
d->len = i_size_read(d->bdev->bd_inode);
@@ -287,44 +300,71 @@ bl_validate_designator(struct pnfs_block_volume *v)
}
}
+/*
+ * Try to open the udev path for the WWN. At least on Debian the udev
+ * by-id path will always point to the dm-multipath device if one exists.
+ */
+static struct block_device *
+bl_open_udev_path(struct pnfs_block_volume *v)
+{
+ struct block_device *bdev;
+ const char *devname;
+
+ devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN",
+ v->scsi.designator_len, v->scsi.designator);
+ if (!devname)
+ return ERR_PTR(-ENOMEM);
+
+ bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
+ if (IS_ERR(bdev)) {
+ pr_warn("pNFS: failed to open device %s (%ld)\n",
+ devname, PTR_ERR(bdev));
+ }
+
+ kfree(devname);
+ return bdev;
+}
+
+/*
+ * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the
+ * wwn- links will only point to the first discovered SCSI device there.
+ */
+static struct block_device *
+bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v)
+{
+ struct block_device *bdev;
+ const char *devname;
+
+ devname = kasprintf(GFP_KERNEL,
+ "/dev/disk/by-id/dm-uuid-mpath-%d%*phN",
+ v->scsi.designator_type,
+ v->scsi.designator_len, v->scsi.designator);
+ if (!devname)
+ return ERR_PTR(-ENOMEM);
+
+ bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
+ kfree(devname);
+ return bdev;
+}
+
static int
bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
+ struct block_device *bdev;
const struct pr_ops *ops;
- const char *devname;
int error;
if (!bl_validate_designator(v))
return -EINVAL;
- switch (v->scsi.designator_len) {
- case 8:
- devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
- v->scsi.designator);
- break;
- case 12:
- devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
- v->scsi.designator);
- break;
- case 16:
- devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
- v->scsi.designator);
- break;
- default:
- return -EINVAL;
- }
-
- d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
- if (IS_ERR(d->bdev)) {
- pr_warn("pNFS: failed to open device %s (%ld)\n",
- devname, PTR_ERR(d->bdev));
- kfree(devname);
- return PTR_ERR(d->bdev);
- }
-
- kfree(devname);
+ bdev = bl_open_dm_mpath_udev_path(v);
+ if (IS_ERR(bdev))
+ bdev = bl_open_udev_path(v);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+ d->bdev = bdev;
d->len = i_size_read(d->bdev->bd_inode);
d->map = bl_map_simple;
@@ -352,7 +392,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return 0;
out_blkdev_put:
- blkdev_put(d->bdev, FMODE_READ);
+ blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE);
return error;
}
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 720b3ff55fa9..992bcb19c11e 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -121,6 +121,16 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
return be;
}
+static void __ext_put_deviceids(struct list_head *head)
+{
+ struct pnfs_block_extent *be, *tmp;
+
+ list_for_each_entry_safe(be, tmp, head, be_list) {
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ }
+}
+
static void
__ext_tree_insert(struct rb_root *root,
struct pnfs_block_extent *new, bool merge_ok)
@@ -163,7 +173,8 @@ free_new:
}
static int
-__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
+__ext_tree_remove(struct rb_root *root,
+ sector_t start, sector_t end, struct list_head *tmp)
{
struct pnfs_block_extent *be;
sector_t len1 = 0, len2 = 0;
@@ -223,8 +234,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
struct pnfs_block_extent *next = ext_tree_next(be);
rb_erase(&be->be_node, root);
- nfs4_put_deviceid_node(be->be_device);
- kfree(be);
+ list_add_tail(&be->be_list, tmp);
be = next;
}
@@ -350,16 +360,18 @@ int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
sector_t start, sector_t end)
{
int err, err2;
+ LIST_HEAD(tmp);
spin_lock(&bl->bl_ext_lock);
- err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+ err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
if (rw) {
- err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
+ err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
if (!err)
err = err2;
}
spin_unlock(&bl->bl_ext_lock);
+ __ext_put_deviceids(&tmp);
return err;
}
@@ -396,12 +408,13 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
sector_t end = start + len;
struct pnfs_block_extent *be;
int err = 0;
+ LIST_HEAD(tmp);
spin_lock(&bl->bl_ext_lock);
/*
* First remove all COW extents or holes from written to range.
*/
- err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+ err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
if (err)
goto out;
@@ -459,6 +472,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
}
out:
spin_unlock(&bl->bl_ext_lock);
+
+ __ext_put_deviceids(&tmp);
return err;
}
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index aaa2e8d3df6f..c92a75e066a6 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -119,27 +119,30 @@ out:
* hashed by filehandle.
*/
static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
- struct nfs_fh *fh, nfs4_stateid *stateid)
+ struct nfs_fh *fh)
{
struct nfs_server *server;
+ struct nfs_inode *nfsi;
struct inode *ino;
struct pnfs_layout_hdr *lo;
+restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry(lo, &server->layouts, plh_layouts) {
- if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid))
+ nfsi = NFS_I(lo->plh_inode);
+ if (nfs_compare_fh(fh, &nfsi->fh))
continue;
- if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
+ if (nfsi->layout != lo)
continue;
ino = igrab(lo->plh_inode);
if (!ino)
break;
spin_lock(&ino->i_lock);
/* Is this layout in the process of being freed? */
- if (NFS_I(ino)->layout != lo) {
+ if (nfsi->layout != lo) {
spin_unlock(&ino->i_lock);
iput(ino);
- break;
+ goto restart;
}
pnfs_get_layout_hdr(lo);
spin_unlock(&ino->i_lock);
@@ -151,13 +154,13 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
}
static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
- struct nfs_fh *fh, nfs4_stateid *stateid)
+ struct nfs_fh *fh)
{
struct pnfs_layout_hdr *lo;
spin_lock(&clp->cl_lock);
rcu_read_lock();
- lo = get_layout_by_fh_locked(clp, fh, stateid);
+ lo = get_layout_by_fh_locked(clp, fh);
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
@@ -167,17 +170,39 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
/*
* Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
*/
-static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo,
+static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new)
{
u32 oldseq, newseq;
- oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+ /* Is the stateid still not initialised? */
+ if (!pnfs_layout_is_valid(lo))
+ return NFS4ERR_DELAY;
+
+ /* Mismatched stateid? */
+ if (!nfs4_stateid_match_other(&lo->plh_stateid, new))
+ return NFS4ERR_BAD_STATEID;
+
newseq = be32_to_cpu(new->seqid);
+ /* Are we already in a layout recall situation? */
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
+ lo->plh_return_seq != 0) {
+ if (newseq < lo->plh_return_seq)
+ return NFS4ERR_OLD_STATEID;
+ if (newseq > lo->plh_return_seq)
+ return NFS4ERR_DELAY;
+ goto out;
+ }
+ /* Check that the stateid matches what we think it should be. */
+ oldseq = be32_to_cpu(lo->plh_stateid.seqid);
if (newseq > oldseq + 1)
- return false;
- return true;
+ return NFS4ERR_DELAY;
+ /* Crazy server! */
+ if (newseq <= oldseq)
+ return NFS4ERR_OLD_STATEID;
+out:
+ return NFS_OK;
}
static u32 initiate_file_draining(struct nfs_client *clp,
@@ -188,7 +213,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
LIST_HEAD(free_me_list);
- lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
+ lo = get_layout_by_fh(clp, &args->cbl_fh);
if (!lo) {
trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
&args->cbl_stateid, -rv);
@@ -196,18 +221,15 @@ static u32 initiate_file_draining(struct nfs_client *clp,
}
ino = lo->plh_inode;
+ pnfs_layoutcommit_inode(ino, false);
+
spin_lock(&ino->i_lock);
- if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) {
- rv = NFS4ERR_DELAY;
+ rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
+ if (rv != NFS_OK)
goto unlock;
- }
pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
- spin_unlock(&ino->i_lock);
-
- pnfs_layoutcommit_inode(ino, false);
- spin_lock(&ino->i_lock);
/*
* Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
*/
@@ -223,11 +245,13 @@ static u32 initiate_file_draining(struct nfs_client *clp,
goto unlock;
}
+ /* Embrace your forgetfulness! */
+ rv = NFS4ERR_NOMATCHING_LAYOUT;
+
if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
&args->cbl_range);
}
- pnfs_mark_layout_returned_if_empty(lo);
unlock:
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me_list);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index d81f96aacd51..656f68f7fe53 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -925,7 +925,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
if (hdr_arg.minorversion == 0) {
cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident);
if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
- return rpc_drop_reply;
+ goto out_invalidcred;
}
cps.minorversion = hdr_arg.minorversion;
@@ -953,6 +953,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
nfs_put_client(cps.clp);
dprintk("%s: done, status = %u\n", __func__, ntohl(status));
return rpc_success;
+
+out_invalidcred:
+ pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
+ return rpc_autherr_badcred;
}
/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 487c5607d52f..003ebce4bbc4 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -367,8 +367,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
*/
struct nfs_client *
nfs_get_client(const struct nfs_client_initdata *cl_init,
- const struct rpc_timeout *timeparms,
- const char *ip_addr,
rpc_authflavor_t authflavour)
{
struct nfs_client *clp, *new = NULL;
@@ -399,7 +397,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
&nn->nfs_client_list);
spin_unlock(&nn->nfs_client_lock);
new->cl_flags = cl_init->init_flags;
- return rpc_ops->init_client(new, timeparms, ip_addr);
+ return rpc_ops->init_client(new, cl_init);
}
spin_unlock(&nn->nfs_client_lock);
@@ -470,7 +468,7 @@ EXPORT_SYMBOL_GPL(nfs_init_timeout_values);
* Create an RPC client handle
*/
int nfs_create_rpc_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
+ const struct nfs_client_initdata *cl_init,
rpc_authflavor_t flavor)
{
struct rpc_clnt *clnt = NULL;
@@ -479,8 +477,9 @@ int nfs_create_rpc_client(struct nfs_client *clp,
.protocol = clp->cl_proto,
.address = (struct sockaddr *)&clp->cl_addr,
.addrsize = clp->cl_addrlen,
- .timeout = timeparms,
+ .timeout = cl_init->timeparms,
.servername = clp->cl_hostname,
+ .nodename = cl_init->nodename,
.program = &nfs_program,
.version = clp->rpc_ops->version,
.authflavor = flavor,
@@ -591,14 +590,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);
* nfs_init_client - Initialise an NFS2 or NFS3 client
*
* @clp: nfs_client to initialise
- * @timeparms: timeout parameters for underlying RPC transport
- * @ip_addr: IP presentation address (not used)
+ * @cl_init: Initialisation parameters
*
* Returns pointer to an NFS client, or an ERR_PTR value.
*/
struct nfs_client *nfs_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr)
+ const struct nfs_client_initdata *cl_init)
{
int error;
@@ -612,7 +609,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp,
* Create a client RPC handle for doing FSSTAT with UNIX auth only
* - RFC 2623, sec 2.3.2
*/
- error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
+ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
if (error < 0)
goto error;
nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -633,6 +630,7 @@ static int nfs_init_server(struct nfs_server *server,
const struct nfs_parsed_mount_data *data,
struct nfs_subversion *nfs_mod)
{
+ struct rpc_timeout timeparms;
struct nfs_client_initdata cl_init = {
.hostname = data->nfs_server.hostname,
.addr = (const struct sockaddr *)&data->nfs_server.address,
@@ -640,8 +638,8 @@ static int nfs_init_server(struct nfs_server *server,
.nfs_mod = nfs_mod,
.proto = data->nfs_server.protocol,
.net = data->net,
+ .timeparms = &timeparms,
};
- struct rpc_timeout timeparms;
struct nfs_client *clp;
int error;
@@ -653,7 +651,7 @@ static int nfs_init_server(struct nfs_server *server,
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
+ clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX);
if (IS_ERR(clp)) {
dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
return PTR_ERR(clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index baaa38859899..177fefb26c18 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2252,21 +2252,37 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st
return NULL;
}
-static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_access_entry *cache;
- int err = -ENOENT;
+ bool retry = true;
+ int err;
spin_lock(&inode->i_lock);
- if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
- goto out_zap;
- cache = nfs_access_search_rbtree(inode, cred);
- if (cache == NULL)
- goto out;
- if (!nfs_have_delegated_attributes(inode) &&
- !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
- goto out_stale;
+ for(;;) {
+ if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+ goto out_zap;
+ cache = nfs_access_search_rbtree(inode, cred);
+ err = -ENOENT;
+ if (cache == NULL)
+ goto out;
+ /* Found an entry, is our attribute cache valid? */
+ if (!nfs_attribute_cache_expired(inode) &&
+ !(nfsi->cache_validity & NFS_INO_INVALID_ATTR))
+ break;
+ err = -ECHILD;
+ if (!may_block)
+ goto out;
+ if (!retry)
+ goto out_zap;
+ spin_unlock(&inode->i_lock);
+ err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (err)
+ return err;
+ spin_lock(&inode->i_lock);
+ retry = false;
+ }
res->jiffies = cache->jiffies;
res->cred = cache->cred;
res->mask = cache->mask;
@@ -2275,12 +2291,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
out:
spin_unlock(&inode->i_lock);
return err;
-out_stale:
- rb_erase(&cache->rb_node, &nfsi->access_cache);
- list_del(&cache->lru);
- spin_unlock(&inode->i_lock);
- nfs_access_free_entry(cache);
- return -ENOENT;
out_zap:
spin_unlock(&inode->i_lock);
nfs_access_zap_cache(inode);
@@ -2307,13 +2317,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred,
cache = NULL;
if (cache == NULL)
goto out;
- if (!nfs_have_delegated_attributes(inode) &&
- !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+ err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode);
+ if (err)
goto out;
res->jiffies = cache->jiffies;
res->cred = cache->cred;
res->mask = cache->mask;
- err = 0;
out:
rcu_read_unlock();
return err;
@@ -2402,18 +2411,19 @@ EXPORT_SYMBOL_GPL(nfs_access_set_mask);
static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
{
struct nfs_access_entry cache;
+ bool may_block = (mask & MAY_NOT_BLOCK) == 0;
int status;
trace_nfs_access_enter(inode);
status = nfs_access_get_cached_rcu(inode, cred, &cache);
if (status != 0)
- status = nfs_access_get_cached(inode, cred, &cache);
+ status = nfs_access_get_cached(inode, cred, &cache, may_block);
if (status == 0)
goto out_cached;
status = -ECHILD;
- if (mask & MAY_NOT_BLOCK)
+ if (!may_block)
goto out;
/* Be clever: ask server to check for all possible rights */
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e6210ead71d0..72b7d13ee3c6 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -196,6 +196,12 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
WARN_ON_ONCE(verfp->committed < 0);
}
+static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1,
+ const struct nfs_writeverf *v2)
+{
+ return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier);
+}
+
/*
* nfs_direct_cmp_hdr_verf - compare verifier for pgio header
* @dreq - direct request possibly spanning multiple servers
@@ -215,7 +221,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
nfs_direct_set_hdr_verf(dreq, hdr);
return 0;
}
- return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
+ return nfs_direct_cmp_verf(verfp, &hdr->verf);
}
/*
@@ -238,7 +244,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
if (verfp->committed < 0)
return 1;
- return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
+ return nfs_direct_cmp_verf(verfp, &data->verf);
}
/**
@@ -366,22 +372,10 @@ out:
* Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
* the iocb is still valid here if this is a synchronous request.
*/
-static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
+static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
struct inode *inode = dreq->inode;
- if (dreq->iocb && write) {
- loff_t pos = dreq->iocb->ki_pos + dreq->count;
-
- spin_lock(&inode->i_lock);
- if (i_size_read(inode) < pos)
- i_size_write(inode, pos);
- spin_unlock(&inode->i_lock);
- }
-
- if (write)
- nfs_zap_mapping(inode, inode->i_mapping);
-
inode_dio_end(inode);
if (dreq->iocb) {
@@ -436,7 +430,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
}
out_put:
if (put_dreq(dreq))
- nfs_direct_complete(dreq, false);
+ nfs_direct_complete(dreq);
hdr->release(hdr);
}
@@ -542,7 +536,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
}
if (put_dreq(dreq))
- nfs_direct_complete(dreq, false);
+ nfs_direct_complete(dreq);
return 0;
}
@@ -583,17 +577,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
if (!count)
goto out;
- inode_lock(inode);
- result = nfs_sync_mapping(mapping);
- if (result)
- goto out_unlock;
-
task_io_account_read(count);
result = -ENOMEM;
dreq = nfs_direct_req_alloc();
if (dreq == NULL)
- goto out_unlock;
+ goto out;
dreq->inode = inode;
dreq->bytes_left = dreq->max_count = count;
@@ -608,10 +597,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
+ nfs_start_io_direct(inode);
+
NFS_I(inode)->read_io += count;
result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
- inode_unlock(inode);
+ nfs_end_io_direct(inode);
if (!result) {
result = nfs_direct_wait(dreq);
@@ -619,13 +610,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
iocb->ki_pos += result;
}
- nfs_direct_req_release(dreq);
- return result;
-
out_release:
nfs_direct_req_release(dreq);
-out_unlock:
- inode_unlock(inode);
out:
return result;
}
@@ -657,6 +643,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
dreq->count = 0;
+ dreq->verf.committed = NFS_INVALID_STABLE_HOW;
+ nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
for (i = 0; i < dreq->mirror_count; i++)
dreq->mirrors[i].count = 0;
get_dreq(dreq);
@@ -775,7 +763,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
nfs_direct_write_reschedule(dreq);
break;
default:
- nfs_direct_complete(dreq, true);
+ nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
+ nfs_direct_complete(dreq);
}
}
@@ -991,6 +980,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t result = -EINVAL;
+ size_t count;
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
@@ -1001,34 +991,24 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
file, iov_iter_count(iter), (long long) iocb->ki_pos);
- nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES,
- iov_iter_count(iter));
+ result = generic_write_checks(iocb, iter);
+ if (result <= 0)
+ return result;
+ count = result;
+ nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
pos = iocb->ki_pos;
end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
- inode_lock(inode);
-
- result = nfs_sync_mapping(mapping);
- if (result)
- goto out_unlock;
-
- if (mapping->nrpages) {
- result = invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
- if (result)
- goto out_unlock;
- }
-
- task_io_account_write(iov_iter_count(iter));
+ task_io_account_write(count);
result = -ENOMEM;
dreq = nfs_direct_req_alloc();
if (!dreq)
- goto out_unlock;
+ goto out;
dreq->inode = inode;
- dreq->bytes_left = dreq->max_count = iov_iter_count(iter);
+ dreq->bytes_left = dreq->max_count = count;
dreq->io_start = pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
@@ -1040,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
+ nfs_start_io_direct(inode);
+
result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
if (mapping->nrpages) {
@@ -1047,30 +1029,19 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
pos >> PAGE_SHIFT, end);
}
- inode_unlock(inode);
+ nfs_end_io_direct(inode);
if (!result) {
result = nfs_direct_wait(dreq);
if (result > 0) {
- struct inode *inode = mapping->host;
-
iocb->ki_pos = pos + result;
- spin_lock(&inode->i_lock);
- if (i_size_read(inode) < iocb->ki_pos)
- i_size_write(inode, iocb->ki_pos);
- spin_unlock(&inode->i_lock);
-
/* XXX: should check the generic_write_sync retval */
generic_write_sync(iocb, result);
}
}
- nfs_direct_req_release(dreq);
- return result;
-
out_release:
nfs_direct_req_release(dreq);
-out_unlock:
- inode_unlock(inode);
+out:
return result;
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 717a8d6af52d..7d620970f2e1 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
iocb->ki_filp,
iov_iter_count(to), (unsigned long) iocb->ki_pos);
- result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping);
+ nfs_start_io_read(inode);
+ result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
if (!result) {
result = generic_file_read_iter(iocb, to);
if (result > 0)
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
}
+ nfs_end_io_read(inode);
return result;
}
EXPORT_SYMBOL_GPL(nfs_file_read);
@@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
filp, (unsigned long) count, (unsigned long long) *ppos);
- res = nfs_revalidate_mapping_protected(inode, filp->f_mapping);
+ nfs_start_io_read(inode);
+ res = nfs_revalidate_mapping(inode, filp->f_mapping);
if (!res) {
res = generic_file_splice_read(filp, ppos, pipe, count, flags);
if (res > 0)
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
}
+ nfs_end_io_read(inode);
return res;
}
EXPORT_SYMBOL_GPL(nfs_file_splice_read);
@@ -272,16 +276,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
trace_nfs_fsync_enter(inode);
- inode_dio_wait(inode);
do {
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret != 0)
break;
- inode_lock(inode);
ret = nfs_file_fsync_commit(file, start, end, datasync);
if (!ret)
ret = pnfs_sync_inode(inode, !!datasync);
- inode_unlock(inode);
/*
* If nfs_file_fsync_commit detected a server reboot, then
* resend all dirty pages that might have been covered by
@@ -359,19 +360,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
file, mapping->host->i_ino, len, (long long) pos);
start:
- /*
- * Prevent starvation issues if someone is doing a consistency
- * sync-to-disk
- */
- ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
- nfs_wait_bit_killable, TASK_KILLABLE);
- if (ret)
- return ret;
- /*
- * Wait for O_DIRECT to complete
- */
- inode_dio_wait(mapping->host);
-
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -432,7 +420,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
return status;
NFS_I(mapping->host)->write_io += copied;
- if (nfs_ctx_key_to_expire(ctx)) {
+ if (nfs_ctx_key_to_expire(ctx, mapping->host)) {
status = nfs_wb_all(mapping->host);
if (status < 0)
return status;
@@ -470,31 +458,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset,
*/
static int nfs_release_page(struct page *page, gfp_t gfp)
{
- struct address_space *mapping = page->mapping;
-
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
- /* Always try to initiate a 'commit' if relevant, but only
- * wait for it if the caller allows blocking. Even then,
- * only wait 1 second and only if the 'bdi' is not congested.
- * Waiting indefinitely can cause deadlocks when the NFS
- * server is on this machine, when a new TCP connection is
- * needed and in other rare cases. There is no particular
- * need to wait extensively here. A short wait has the
- * benefit that someone else can worry about the freezer.
- */
- if (mapping) {
- struct nfs_server *nfss = NFS_SERVER(mapping->host);
- nfs_commit_inode(mapping->host, 0);
- if (gfpflags_allow_blocking(gfp) &&
- !bdi_write_congested(&nfss->backing_dev_info)) {
- wait_on_page_bit_killable_timeout(page, PG_private,
- HZ);
- if (PagePrivate(page))
- set_bdi_congested(&nfss->backing_dev_info,
- BLK_RW_ASYNC);
- }
- }
/* If PagePrivate() is set, then the page is not freeable */
if (PagePrivate(page))
return 0;
@@ -604,6 +569,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
filp, filp->f_mapping->host->i_ino,
(long long)page_offset(page));
+ sb_start_pagefault(inode->i_sb);
+
/* make sure the cache has finished storing the page */
nfs_fscache_wait_on_page_write(NFS_I(inode), page);
@@ -630,6 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
out_unlock:
unlock_page(page);
out:
+ sb_end_pagefault(inode->i_sb);
return ret;
}
@@ -645,7 +613,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode)
ctx = nfs_file_open_context(filp);
if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
- nfs_ctx_key_to_expire(ctx))
+ nfs_ctx_key_to_expire(ctx, inode))
return 1;
return 0;
}
@@ -656,23 +624,17 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
unsigned long written = 0;
ssize_t result;
- size_t count = iov_iter_count(from);
result = nfs_key_timeout_notify(file, inode);
if (result)
return result;
- if (iocb->ki_flags & IOCB_DIRECT) {
- result = generic_write_checks(iocb, from);
- if (result <= 0)
- return result;
+ if (iocb->ki_flags & IOCB_DIRECT)
return nfs_file_direct_write(iocb, from);
- }
dprintk("NFS: write(%pD2, %zu@%Ld)\n",
- file, count, (long long) iocb->ki_pos);
+ file, iov_iter_count(from), (long long) iocb->ki_pos);
- result = -EBUSY;
if (IS_SWAPFILE(inode))
goto out_swapfile;
/*
@@ -684,28 +646,33 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
- result = count;
- if (!count)
+ nfs_start_io_write(inode);
+ result = generic_write_checks(iocb, from);
+ if (result > 0) {
+ current->backing_dev_info = inode_to_bdi(inode);
+ result = generic_perform_write(file, from, iocb->ki_pos);
+ current->backing_dev_info = NULL;
+ }
+ nfs_end_io_write(inode);
+ if (result <= 0)
goto out;
- result = generic_file_write_iter(iocb, from);
- if (result > 0)
- written = result;
+ written = generic_write_sync(iocb, result);
+ iocb->ki_pos += written;
/* Return error values */
- if (result >= 0 && nfs_need_check_write(file, inode)) {
+ if (nfs_need_check_write(file, inode)) {
int err = vfs_fsync(file, 0);
if (err < 0)
result = err;
}
- if (result > 0)
- nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
+ nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
out:
return result;
out_swapfile:
printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
- goto out;
+ return -EBUSY;
}
EXPORT_SYMBOL_GPL(nfs_file_write);
@@ -780,11 +747,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
}
static int
-is_time_granular(struct timespec *ts) {
- return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
-}
-
-static int
do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
{
struct inode *inode = filp->f_mapping->host;
@@ -817,12 +779,8 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
* This makes locking act as a cache coherency point.
*/
nfs_sync_mapping(filp->f_mapping);
- if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {
- if (is_time_granular(&NFS_SERVER(inode)->time_delta))
- __nfs_revalidate_inode(NFS_SERVER(inode), inode);
- else
- nfs_zap_caches(inode);
- }
+ if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ nfs_zap_mapping(inode, filp->f_mapping);
out:
return status;
}
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index aa59757389dc..a3fc48ba4931 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -255,13 +255,16 @@ static int filelayout_read_done_cb(struct rpc_task *task,
static void
filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
{
+ loff_t end_offs = 0;
if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
- hdr->res.verf->committed != NFS_DATA_SYNC)
+ hdr->res.verf->committed == NFS_FILE_SYNC)
return;
+ if (hdr->res.verf->committed == NFS_DATA_SYNC)
+ end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
- pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
- hdr->mds_offset + hdr->res.count);
+ /* Note: if the write is unstable, don't set end_offs until commit */
+ pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
}
@@ -354,6 +357,12 @@ static int filelayout_write_done_cb(struct rpc_task *task,
}
filelayout_set_layoutcommit(hdr);
+
+ /* zero out the fattr */
+ hdr->fattr.valid = 0;
+ if (task->tk_status >= 0)
+ nfs_writeback_update_inode(hdr);
+
return 0;
}
@@ -375,8 +384,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
return -EAGAIN;
}
- if (data->verf.committed == NFS_UNSTABLE)
- pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
+ pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
return 0;
}
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 0e8018bc9880..e6206eaf2bdf 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1325,15 +1325,16 @@ ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
* we always send layoutcommit after DS writes.
*/
static void
-ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
+ff_layout_set_layoutcommit(struct inode *inode,
+ struct pnfs_layout_segment *lseg,
+ loff_t end_offset)
{
- if (!ff_layout_need_layoutcommit(hdr->lseg))
+ if (!ff_layout_need_layoutcommit(lseg))
return;
- pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
- hdr->mds_offset + hdr->res.count);
- dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
- (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
+ pnfs_set_layoutcommit(inode, lseg, end_offset);
+ dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino,
+ (unsigned long long) NFS_I(inode)->layout->plh_lwb);
}
static bool
@@ -1469,6 +1470,7 @@ static void ff_layout_read_release(void *data)
static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ loff_t end_offs = 0;
int err;
trace_nfs4_pnfs_write(hdr, task->tk_status);
@@ -1494,7 +1496,10 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
if (hdr->res.verf->committed == NFS_FILE_SYNC ||
hdr->res.verf->committed == NFS_DATA_SYNC)
- ff_layout_set_layoutcommit(hdr);
+ end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
+
+ /* Note: if the write is unstable, don't set end_offs until commit */
+ ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
/* zero out fattr since we don't care DS attr at all */
hdr->fattr.valid = 0;
@@ -1530,9 +1535,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
return -EAGAIN;
}
- if (data->verf.committed == NFS_UNSTABLE
- && ff_layout_need_layoutcommit(data->lseg))
- pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
+ ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
return 0;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index dda689d7a8a7..bf4ec5ecc97e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -662,9 +662,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
if (S_ISREG(inode->i_mode)) {
- inode_lock(inode);
- err = nfs_sync_inode(inode);
- inode_unlock(inode);
+ err = filemap_write_and_wait(inode->i_mapping);
if (err)
goto out;
}
@@ -879,7 +877,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
struct nfs_inode *nfsi = NFS_I(inode);
spin_lock(&inode->i_lock);
- list_add(&ctx->list, &nfsi->open_files);
+ if (ctx->mode & FMODE_WRITE)
+ list_add(&ctx->list, &nfsi->open_files);
+ else
+ list_add_tail(&ctx->list, &nfsi->open_files);
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
@@ -972,6 +973,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
if (NFS_STALE(inode))
goto out;
+ /* pNFS: Attributes aren't updated until we layoutcommit */
+ if (S_ISREG(inode->i_mode)) {
+ status = pnfs_sync_inode(inode, false);
+ if (status)
+ goto out;
+ }
+
status = -ENOMEM;
fattr = nfs_alloc_fattr();
if (fattr == NULL)
@@ -1122,14 +1130,12 @@ out:
}
/**
- * __nfs_revalidate_mapping - Revalidate the pagecache
+ * nfs_revalidate_mapping - Revalidate the pagecache
* @inode - pointer to host inode
* @mapping - pointer to mapping
- * @may_lock - take inode->i_mutex?
*/
-static int __nfs_revalidate_mapping(struct inode *inode,
- struct address_space *mapping,
- bool may_lock)
+int nfs_revalidate_mapping(struct inode *inode,
+ struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
unsigned long *bitlock = &nfsi->flags;
@@ -1178,12 +1184,7 @@ static int __nfs_revalidate_mapping(struct inode *inode,
nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
- if (may_lock) {
- inode_lock(inode);
- ret = nfs_invalidate_mapping(inode, mapping);
- inode_unlock(inode);
- } else
- ret = nfs_invalidate_mapping(inode, mapping);
+ ret = nfs_invalidate_mapping(inode, mapping);
trace_nfs_invalidate_mapping_exit(inode, ret);
clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
@@ -1193,27 +1194,28 @@ out:
return ret;
}
-/**
- * nfs_revalidate_mapping - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
- */
-int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+static bool nfs_file_has_writers(struct nfs_inode *nfsi)
{
- return __nfs_revalidate_mapping(inode, mapping, false);
+ struct inode *inode = &nfsi->vfs_inode;
+
+ assert_spin_locked(&inode->i_lock);
+
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (list_empty(&nfsi->open_files))
+ return false;
+ /* Note: This relies on nfsi->open_files being ordered with writers
+ * being placed at the head of the list.
+ * See nfs_inode_attach_open_context()
+ */
+ return (list_first_entry(&nfsi->open_files,
+ struct nfs_open_context,
+ list)->mode & FMODE_WRITE) == FMODE_WRITE;
}
-/**
- * nfs_revalidate_mapping_protected - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
- *
- * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex
- * while invalidating the mapping.
- */
-int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping)
+static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
{
- return __nfs_revalidate_mapping(inode, mapping, true);
+ return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi);
}
static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
@@ -1280,22 +1282,24 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
return -EIO;
- if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
- inode->i_version != fattr->change_attr)
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ if (!nfs_file_has_buffered_writers(nfsi)) {
+ /* Verify a few of the more important attributes */
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr)
+ invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE;
- /* Verify a few of the more important attributes */
- if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
- invalid |= NFS_INO_INVALID_ATTR;
+ if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
+ invalid |= NFS_INO_INVALID_ATTR;
- if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
- cur_size = i_size_read(inode);
- new_isize = nfs_size_to_loff_t(fattr->size);
- if (cur_size != new_isize)
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime))
+ invalid |= NFS_INO_INVALID_ATTR;
+
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+ cur_size = i_size_read(inode);
+ new_isize = nfs_size_to_loff_t(fattr->size);
+ if (cur_size != new_isize)
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ }
}
- if (nfsi->nrequests != 0)
- invalid &= ~NFS_INO_REVAL_PAGECACHE;
/* Have any file permissions changed? */
if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
@@ -1470,28 +1474,12 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
}
-/*
- * Don't trust the change_attribute, mtime, ctime or size if
- * a pnfs LAYOUTCOMMIT is outstanding
- */
-static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode,
- struct nfs_fattr *fattr)
-{
- if (pnfs_layoutcommit_outstanding(inode))
- fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE |
- NFS_ATTR_FATTR_MTIME |
- NFS_ATTR_FATTR_CTIME |
- NFS_ATTR_FATTR_SIZE);
-}
-
static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
{
int ret;
trace_nfs_refresh_inode_enter(inode);
- nfs_inode_attrs_handle_layoutcommit(inode, fattr);
-
if (nfs_inode_attrs_need_update(inode, fattr))
ret = nfs_update_inode(inode, fattr);
else
@@ -1527,7 +1515,7 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode);
static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
{
- unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ unsigned long invalid = NFS_INO_INVALID_ATTR;
/*
* Don't revalidate the pagecache if we hold a delegation, but do
@@ -1676,6 +1664,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
unsigned long invalid = 0;
unsigned long now = jiffies;
unsigned long save_cache_validity;
+ bool have_writers = nfs_file_has_buffered_writers(nfsi);
bool cache_revalidated = true;
dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
@@ -1725,17 +1714,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Do atomic weak cache consistency updates */
invalid |= nfs_wcc_update_inode(inode, fattr);
+ if (pnfs_layoutcommit_outstanding(inode)) {
+ nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR;
+ cache_revalidated = false;
+ }
+
/* More cache consistency checks */
if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
if (inode->i_version != fattr->change_attr) {
dprintk("NFS: change_attr change on server for file %s/%ld\n",
inode->i_sb->s_id, inode->i_ino);
- invalid |= NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_DATA
- | NFS_INO_INVALID_ACCESS
- | NFS_INO_INVALID_ACL;
- if (S_ISDIR(inode->i_mode))
- nfs_force_lookup_revalidate(inode);
+ /* Could it be a race with writeback? */
+ if (!have_writers) {
+ invalid |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
+ if (S_ISDIR(inode->i_mode))
+ nfs_force_lookup_revalidate(inode);
+ }
inode->i_version = fattr->change_attr;
}
} else {
@@ -1768,9 +1765,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (new_isize != cur_isize) {
/* Do we perhaps have any outstanding writes, or has
* the file grown beyond our last write? */
- if ((nfsi->nrequests == 0) || new_isize > cur_isize) {
+ if (nfsi->nrequests == 0 || new_isize > cur_isize) {
i_size_write(inode, new_isize);
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+ if (!have_writers)
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
}
dprintk("NFS: isize change on server for file %s/%ld "
"(%Ld to %Ld)\n",
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5ea04d87fc65..7ce5e023c3c3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -66,13 +66,16 @@ struct nfs_clone_mount {
struct nfs_client_initdata {
unsigned long init_flags;
- const char *hostname;
- const struct sockaddr *addr;
+ const char *hostname; /* Hostname of the server */
+ const struct sockaddr *addr; /* Address of the server */
+ const char *nodename; /* Hostname of the client */
+ const char *ip_addr; /* IP address of the client */
size_t addrlen;
struct nfs_subversion *nfs_mod;
int proto;
u32 minorversion;
struct net *net;
+ const struct rpc_timeout *timeparms;
};
/*
@@ -147,9 +150,8 @@ extern void nfs_umount(const struct nfs_mount_request *info);
extern const struct rpc_program nfs_program;
extern void nfs_clients_init(struct net *net);
extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
-int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t);
+int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
- const struct rpc_timeout *, const char *,
rpc_authflavor_t);
int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
void nfs_server_insert_lists(struct nfs_server *);
@@ -184,7 +186,7 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
rpc_authflavor_t);
extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
-extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr,
int ds_addrlen, int ds_proto,
unsigned int ds_timeo,
@@ -193,7 +195,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
rpc_authflavor_t au_flavor);
extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
struct inode *);
-extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo,
unsigned int ds_retrans, rpc_authflavor_t au_flavor);
@@ -338,8 +340,7 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
/* proc.c */
void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr);
+ const struct nfs_client_initdata *);
/* dir.c */
extern void nfs_force_use_readdirplus(struct inode *dir);
@@ -411,6 +412,19 @@ extern void __exit unregister_nfs_fs(void);
extern bool nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);
+/* io.c */
+extern void nfs_start_io_read(struct inode *inode);
+extern void nfs_end_io_read(struct inode *inode);
+extern void nfs_start_io_write(struct inode *inode);
+extern void nfs_end_io_write(struct inode *inode);
+extern void nfs_start_io_direct(struct inode *inode);
+extern void nfs_end_io_direct(struct inode *inode);
+
+static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
+{
+ return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0;
+}
+
/* namespace.c */
#define NFS_PATH_CANONICAL 1
extern char *nfs_path(char **p, struct dentry *dentry,
@@ -496,9 +510,29 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
struct inode *inode,
struct nfs_direct_req *dreq);
int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
-bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode);
void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
+int nfs_filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend);
+
+#ifdef CONFIG_NFS_V4_1
+static inline
+void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
+{
+ int i;
+
+ for (i = 0; i < cinfo->nbuckets; i++)
+ cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+}
+#else
+static inline
+void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
+{
+}
+#endif
+
+
#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
struct page *, struct page *, enum migrate_mode);
@@ -506,6 +540,13 @@ extern int nfs_migrate_page(struct address_space *,
#define nfs_migrate_page NULL
#endif
+static inline int
+nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
+ const struct nfs_write_verifier *v2)
+{
+ return memcmp(v1->data, v2->data, sizeof(v1->data));
+}
+
/* unlink.c */
extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
@@ -521,8 +562,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
/* nfs4proc.c */
extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr);
+ const struct nfs_client_initdata *);
extern int nfs40_walk_client_list(struct nfs_client *clp,
struct nfs_client **result,
struct rpc_cred *cred);
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
new file mode 100644
index 000000000000..1fc5d1ce327e
--- /dev/null
+++ b/fs/nfs/io.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016 Trond Myklebust
+ *
+ * I/O and data path helper functionality.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+#include <linux/nfs_fs.h>
+
+#include "internal.h"
+
+/* Call with exclusively locked inode->i_rwsem */
+static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
+{
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ inode_dio_wait(inode);
+ }
+}
+
+/**
+ * nfs_start_io_read - declare the file is being used for buffered reads
+ * @inode - file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+void
+nfs_start_io_read(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ /* Be an optimist! */
+ down_read(&inode->i_rwsem);
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0)
+ return;
+ up_read(&inode->i_rwsem);
+ /* Slow path.... */
+ down_write(&inode->i_rwsem);
+ nfs_block_o_direct(nfsi, inode);
+ downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_read - declare that the buffered read operation is done
+ * @inode - file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_read(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
+
+/**
+ * nfs_start_io_write - declare the file is being used for buffered writes
+ * @inode - file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+void
+nfs_start_io_write(struct inode *inode)
+{
+ down_write(&inode->i_rwsem);
+ nfs_block_o_direct(NFS_I(inode), inode);
+}
+
+/**
+ * nfs_end_io_write - declare that the buffered write operation is done
+ * @inode - file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_write(struct inode *inode)
+{
+ up_write(&inode->i_rwsem);
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
+{
+ if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ set_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ nfs_wb_all(inode);
+ }
+}
+
+/**
+ * nfs_end_io_direct - declare the file is being used for direct i/o
+ * @inode - file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+void
+nfs_start_io_direct(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ /* Be an optimist! */
+ down_read(&inode->i_rwsem);
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0)
+ return;
+ up_read(&inode->i_rwsem);
+ /* Slow path.... */
+ down_write(&inode->i_rwsem);
+ nfs_block_buffered(nfsi, inode);
+ downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_direct - declare that the direct i/o operation is done
+ * @inode - file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_direct(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 9e9fa347a948..ee753547fb0a 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -76,19 +76,23 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
* low timeout interval so that if a connection is lost, we retry through
* the MDS.
*/
-struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
rpc_authflavor_t au_flavor)
{
+ struct rpc_timeout ds_timeout;
+ struct nfs_client *mds_clp = mds_srv->nfs_client;
struct nfs_client_initdata cl_init = {
.addr = ds_addr,
.addrlen = ds_addrlen,
+ .nodename = mds_clp->cl_rpcclient->cl_nodename,
+ .ip_addr = mds_clp->cl_ipaddr,
.nfs_mod = &nfs_v3,
.proto = ds_proto,
.net = mds_clp->cl_net,
+ .timeparms = &ds_timeout,
};
- struct rpc_timeout ds_timeout;
struct nfs_client *clp;
char buf[INET6_ADDRSTRLEN + 1];
@@ -97,10 +101,12 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
+ if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
+ set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
/* Use the MDS nfs_client cl_ipaddr. */
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
- clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
- au_flavor);
+ clp = nfs_get_client(&cl_init, au_flavor);
return clp;
}
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index aa03ed09ba06..33da841a21bb 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -113,15 +113,17 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
return -EOPNOTSUPP;
- nfs_wb_all(inode);
inode_lock(inode);
+ err = nfs_sync_inode(inode);
+ if (err)
+ goto out_unlock;
err = nfs42_proc_fallocate(&msg, filep, offset, len);
if (err == 0)
truncate_pagecache_range(inode, offset, (offset + len) -1);
if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
-
+out_unlock:
inode_unlock(inode);
return err;
}
@@ -154,11 +156,20 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
if (status)
return status;
+ status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping,
+ pos_src, pos_src + (loff_t)count - 1);
+ if (status)
+ return status;
+
status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
dst_lock, FMODE_WRITE);
if (status)
return status;
+ status = nfs_sync_inode(dst_inode);
+ if (status)
+ return status;
+
status = nfs4_call_sync(server->client, server, &msg,
&args.seq_args, &res.seq_res, 0);
if (status == -ENOTSUPP)
@@ -258,7 +269,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep,
if (status)
return status;
- nfs_wb_all(inode);
+ status = nfs_filemap_write_and_wait_range(inode->i_mapping,
+ offset, LLONG_MAX);
+ if (status)
+ return status;
+
status = nfs4_call_sync(server->client, server, &msg,
&args.seq_args, &res.seq_res, 0);
if (status == -ENOTSUPP)
@@ -336,8 +351,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
* Mark the bad layout state as invalid, then retry
* with the current stateid.
*/
- set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
+ pnfs_mark_layout_stateid_invalid(lo, &head);
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
} else
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 6dc6f2aea0d6..8b2605882a20 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -330,13 +330,21 @@ static int decode_write_response(struct xdr_stream *xdr,
struct nfs42_write_res *res)
{
__be32 *p;
- int stateids;
p = xdr_inline_decode(xdr, 4 + 8 + 4);
if (unlikely(!p))
goto out_overflow;
- stateids = be32_to_cpup(p++);
+ /*
+ * We never use asynchronous mode, so warn if a server returns
+ * a stateid.
+ */
+ if (unlikely(*p != 0)) {
+ pr_err_once("%s: server has set unrequested "
+ "asynchronous mode\n", __func__);
+ return -EREMOTEIO;
+ }
+ p++;
p = xdr_decode_hyper(p, &res->count);
res->verifier.committed = be32_to_cpup(p);
return decode_verifier(xdr, &res->verifier.verifier);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 768456fa1b17..4be567a54958 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -185,6 +185,7 @@ struct nfs4_state {
struct nfs4_exception {
struct nfs4_state *state;
struct inode *inode;
+ nfs4_stateid *stateid;
long timeout;
unsigned char delay : 1,
recovering : 1,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 10410e8b5853..8d7d08d4f95f 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -349,10 +349,10 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
* Returns pointer to an NFS client, or an ERR_PTR value.
*/
struct nfs_client *nfs4_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr)
+ const struct nfs_client_initdata *cl_init)
{
char buf[INET6_ADDRSTRLEN + 1];
+ const char *ip_addr = cl_init->ip_addr;
struct nfs_client *old;
int error;
@@ -370,9 +370,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
- error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
+ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I);
if (error == -EINVAL)
- error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
+ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
if (error < 0)
goto error;
@@ -793,10 +793,12 @@ static int nfs4_set_client(struct nfs_server *server,
.hostname = hostname,
.addr = addr,
.addrlen = addrlen,
+ .ip_addr = ip_addr,
.nfs_mod = &nfs_v4,
.proto = proto,
.minorversion = minorversion,
.net = net,
+ .timeparms = timeparms,
};
struct nfs_client *clp;
int error;
@@ -809,7 +811,7 @@ static int nfs4_set_client(struct nfs_server *server,
set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
+ clp = nfs_get_client(&cl_init, authflavour);
if (IS_ERR(clp)) {
error = PTR_ERR(clp);
goto error;
@@ -842,20 +844,24 @@ error:
* low timeout interval so that if a connection is lost, we retry through
* the MDS.
*/
-struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
u32 minor_version, rpc_authflavor_t au_flavor)
{
+ struct rpc_timeout ds_timeout;
+ struct nfs_client *mds_clp = mds_srv->nfs_client;
struct nfs_client_initdata cl_init = {
.addr = ds_addr,
.addrlen = ds_addrlen,
+ .nodename = mds_clp->cl_rpcclient->cl_nodename,
+ .ip_addr = mds_clp->cl_ipaddr,
.nfs_mod = &nfs_v4,
.proto = ds_proto,
.minorversion = minor_version,
.net = mds_clp->cl_net,
+ .timeparms = &ds_timeout,
};
- struct rpc_timeout ds_timeout;
struct nfs_client *clp;
char buf[INET6_ADDRSTRLEN + 1];
@@ -863,14 +869,16 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
+ if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
+ __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
/*
* Set an authflavor equual to the MDS value. Use the MDS nfs_client
* cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
* (section 13.1 RFC 5661).
*/
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
- clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
- au_flavor);
+ clp = nfs_get_client(&cl_init, au_flavor);
dprintk("<-- %s %p\n", __func__, clp);
return clp;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 014b0e41ace5..d085ad794884 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -66,7 +66,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (openflags & O_TRUNC) {
attr.ia_valid |= ATTR_SIZE;
attr.ia_size = 0;
- nfs_sync_inode(inode);
+ filemap_write_and_wait(inode->i_mapping);
}
inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
@@ -133,21 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t count, unsigned int flags)
{
- struct inode *in_inode = file_inode(file_in);
- struct inode *out_inode = file_inode(file_out);
- int ret;
-
- if (in_inode == out_inode)
+ if (file_inode(file_in) == file_inode(file_out))
return -EINVAL;
- /* flush any pending writes */
- ret = nfs_sync_inode(in_inode);
- if (ret)
- return ret;
- ret = nfs_sync_inode(out_inode);
- if (ret)
- return ret;
-
return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ff416d0e24bc..da5c9e58e907 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -363,6 +363,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state *state = exception->state;
+ const nfs4_stateid *stateid = exception->stateid;
struct inode *inode = exception->inode;
int ret = errorcode;
@@ -376,9 +377,18 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (inode && nfs_async_inode_return_delegation(inode,
- NULL) == 0)
- goto wait_on_recovery;
+ if (inode) {
+ int err;
+
+ err = nfs_async_inode_return_delegation(inode,
+ stateid);
+ if (err == 0)
+ goto wait_on_recovery;
+ if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) {
+ exception->retry = 1;
+ break;
+ }
+ }
if (state == NULL)
break;
ret = nfs4_schedule_stateid_recovery(server, state);
@@ -427,6 +437,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
case -NFS4ERR_DELAY:
nfs_inc_server_stats(server, NFSIOS_DELAY);
case -NFS4ERR_GRACE:
+ case -NFS4ERR_LAYOUTTRYLATER:
case -NFS4ERR_RECALLCONFLICT:
exception->delay = 1;
return 0;
@@ -2669,28 +2680,17 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
return res;
}
-static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
- struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state, struct nfs4_label *ilabel,
- struct nfs4_label *olabel)
+static int _nfs4_do_setattr(struct inode *inode,
+ struct nfs_setattrargs *arg,
+ struct nfs_setattrres *res,
+ struct rpc_cred *cred,
+ struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(inode);
- struct nfs_setattrargs arg = {
- .fh = NFS_FH(inode),
- .iap = sattr,
- .server = server,
- .bitmask = server->attr_bitmask,
- .label = ilabel,
- };
- struct nfs_setattrres res = {
- .fattr = fattr,
- .label = olabel,
- .server = server,
- };
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
- .rpc_argp = &arg,
- .rpc_resp = &res,
+ .rpc_argp = arg,
+ .rpc_resp = res,
.rpc_cred = cred,
};
struct rpc_cred *delegation_cred = NULL;
@@ -2699,17 +2699,13 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
bool truncate;
int status;
- arg.bitmask = nfs4_bitmask(server, ilabel);
- if (ilabel)
- arg.bitmask = nfs4_bitmask(server, olabel);
-
- nfs_fattr_init(fattr);
+ nfs_fattr_init(res->fattr);
/* Servers should only apply open mode checks for file size changes */
- truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false;
+ truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
fmode = truncate ? FMODE_WRITE : FMODE_READ;
- if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) {
+ if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) {
/* Use that stateid */
} else if (truncate && state != NULL) {
struct nfs_lockowner lockowner = {
@@ -2719,19 +2715,19 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
if (!nfs4_valid_open_stateid(state))
return -EBADF;
if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
- &arg.stateid, &delegation_cred) == -EIO)
+ &arg->stateid, &delegation_cred) == -EIO)
return -EBADF;
} else
- nfs4_stateid_copy(&arg.stateid, &zero_stateid);
+ nfs4_stateid_copy(&arg->stateid, &zero_stateid);
if (delegation_cred)
msg.rpc_cred = delegation_cred;
- status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+ status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1);
put_rpccred(delegation_cred);
if (status == 0 && state != NULL)
renew_lease(server, timestamp);
- trace_nfs4_setattr(inode, &arg.stateid, status);
+ trace_nfs4_setattr(inode, &arg->stateid, status);
return status;
}
@@ -2741,13 +2737,31 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs4_label *olabel)
{
struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_setattrargs arg = {
+ .fh = NFS_FH(inode),
+ .iap = sattr,
+ .server = server,
+ .bitmask = server->attr_bitmask,
+ .label = ilabel,
+ };
+ struct nfs_setattrres res = {
+ .fattr = fattr,
+ .label = olabel,
+ .server = server,
+ };
struct nfs4_exception exception = {
.state = state,
.inode = inode,
+ .stateid = &arg.stateid,
};
int err;
+
+ arg.bitmask = nfs4_bitmask(server, ilabel);
+ if (ilabel)
+ arg.bitmask = nfs4_bitmask(server, olabel);
+
do {
- err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
+ err = _nfs4_do_setattr(inode, &arg, &res, cred, state);
switch (err) {
case -NFS4ERR_OPENMODE:
if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -3267,13 +3281,6 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
return status;
}
-static int nfs4_do_find_root_sec(struct nfs_server *server,
- struct nfs_fh *fhandle, struct nfs_fsinfo *info)
-{
- int mv = server->nfs_client->cl_minorversion;
- return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info);
-}
-
/**
* nfs4_proc_get_rootfh - get file handle for server's pseudoroot
* @server: initialized nfs_server handle
@@ -3293,7 +3300,8 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
status = nfs4_lookup_root(server, fhandle, info);
if (auth_probe || status == NFS4ERR_WRONGSEC)
- status = nfs4_do_find_root_sec(server, fhandle, info);
+ status = server->nfs_client->cl_mvops->find_root_sec(server,
+ fhandle, info);
if (status == 0)
status = nfs4_server_capabilities(server, fhandle);
@@ -4392,7 +4400,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
struct rpc_message *msg)
{
hdr->timestamp = jiffies;
- hdr->pgio_done_cb = nfs4_read_done_cb;
+ if (!hdr->pgio_done_cb)
+ hdr->pgio_done_cb = nfs4_read_done_cb;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
}
@@ -7869,11 +7878,13 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
struct inode *inode = lgp->args.inode;
struct nfs_server *server = NFS_SERVER(inode);
struct pnfs_layout_hdr *lo;
- int status = task->tk_status;
+ int nfs4err = task->tk_status;
+ int err, status = 0;
+ LIST_HEAD(head);
dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
- switch (status) {
+ switch (nfs4err) {
case 0:
goto out;
@@ -7905,45 +7916,42 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
status = -EOVERFLOW;
goto out;
}
- /* Fallthrough */
+ status = -EBUSY;
+ break;
case -NFS4ERR_RECALLCONFLICT:
- nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT,
- exception);
status = -ERECALLCONFLICT;
- goto out;
+ break;
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
exception->timeout = 0;
spin_lock(&inode->i_lock);
- if (nfs4_stateid_match(&lgp->args.stateid,
+ lo = NFS_I(inode)->layout;
+ /* If the open stateid was bad, then recover it. */
+ if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
+ nfs4_stateid_match_other(&lgp->args.stateid,
&lgp->args.ctx->state->stateid)) {
spin_unlock(&inode->i_lock);
- /* If the open stateid was bad, then recover it. */
exception->state = lgp->args.ctx->state;
break;
}
- lo = NFS_I(inode)->layout;
- if (lo && !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) &&
- nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) {
- LIST_HEAD(head);
-
- /*
- * Mark the bad layout state as invalid, then retry
- * with the current stateid.
- */
- set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
- spin_unlock(&inode->i_lock);
- pnfs_free_lseg_list(&head);
- status = -EAGAIN;
- goto out;
- } else
- spin_unlock(&inode->i_lock);
- }
- status = nfs4_handle_exception(server, status, exception);
- if (exception->retry)
+ /*
+ * Mark the bad layout state as invalid, then retry
+ */
+ pnfs_mark_layout_stateid_invalid(lo, &head);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
status = -EAGAIN;
+ goto out;
+ }
+
+ err = nfs4_handle_exception(server, nfs4err, exception);
+ if (!status) {
+ if (exception->retry)
+ status = -EAGAIN;
+ else
+ status = err;
+ }
out:
dprintk("<-- %s\n", __func__);
return status;
@@ -8129,8 +8137,7 @@ static void nfs4_layoutreturn_release(void *calldata)
spin_lock(&lo->plh_inode->i_lock);
pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
be32_to_cpu(lrp->args.stateid.seqid));
- pnfs_mark_layout_returned_if_empty(lo);
- if (lrp->res.lrs_present)
+ if (lrp->res.lrs_present && pnfs_layout_is_valid(lo))
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
pnfs_clear_layoutreturn_waitbit(lo);
spin_unlock(&lo->plh_inode->i_lock);
@@ -8835,7 +8842,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
#endif
};
-ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
+static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
{
ssize_t error, error2;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 661e753fe1c9..7bd3a5c09d31 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1985,9 +1985,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */
*p = cpu_to_be32(0); /* reclaim */
encode_nfs4_stateid(xdr, &args->stateid);
- p = reserve_space(xdr, 20);
- *p++ = cpu_to_be32(1); /* newoffset = TRUE */
- p = xdr_encode_hyper(p, args->lastbytewritten);
+ if (args->lastbytewritten != U64_MAX) {
+ p = reserve_space(xdr, 20);
+ *p++ = cpu_to_be32(1); /* newoffset = TRUE */
+ p = xdr_encode_hyper(p, args->lastbytewritten);
+ } else {
+ p = reserve_space(xdr, 12);
+ *p++ = cpu_to_be32(0); /* newoffset = FALSE */
+ }
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 31c7763b94d5..2ca9167bc97d 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -37,7 +37,6 @@
{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
{ 1 << NFS_INO_STALE, "STALE" }, \
{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
- { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
{ 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
{ 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0fbe734cc38c..70806cae0d36 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -259,7 +259,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
* is required.
* Note that caller must hold inode->i_lock.
*/
-static int
+int
pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
struct list_head *lseg_list)
{
@@ -334,14 +334,17 @@ pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
}
static void
-init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
+pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
+ const struct pnfs_layout_range *range,
+ const nfs4_stateid *stateid)
{
INIT_LIST_HEAD(&lseg->pls_list);
INIT_LIST_HEAD(&lseg->pls_lc_list);
atomic_set(&lseg->pls_refcount, 1);
- smp_mb();
set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
lseg->pls_layout = lo;
+ lseg->pls_range = *range;
+ lseg->pls_seq = be32_to_cpu(stateid->seqid);
}
static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
@@ -486,15 +489,6 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
(end2 == NFS4_MAX_UINT64 || end2 > start1);
}
-static bool
-should_free_lseg(const struct pnfs_layout_range *lseg_range,
- const struct pnfs_layout_range *recall_range)
-{
- return (recall_range->iomode == IOMODE_ANY ||
- lseg_range->iomode == recall_range->iomode) &&
- pnfs_lseg_range_intersecting(lseg_range, recall_range);
-}
-
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
struct list_head *tmp_list)
{
@@ -533,6 +527,27 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
return (s32)(s1 - s2) > 0;
}
+static bool
+pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
+ const struct pnfs_layout_range *recall_range)
+{
+ return (recall_range->iomode == IOMODE_ANY ||
+ lseg_range->iomode == recall_range->iomode) &&
+ pnfs_lseg_range_intersecting(lseg_range, recall_range);
+}
+
+static bool
+pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
+ const struct pnfs_layout_range *recall_range,
+ u32 seq)
+{
+ if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
+ return false;
+ if (recall_range == NULL)
+ return true;
+ return pnfs_should_free_range(&lseg->pls_range, recall_range);
+}
+
/**
* pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
* @lo: layout header containing the lsegs
@@ -562,10 +577,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
if (list_empty(&lo->plh_segs))
return 0;
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
- if (!recall_range ||
- should_free_lseg(&lseg->pls_range, recall_range)) {
- if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq))
- continue;
+ if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
dprintk("%s: freeing lseg %p iomode %d seq %u"
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode, lseg->pls_seq,
@@ -761,24 +773,25 @@ void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
bool update_barrier)
{
- u32 oldseq, newseq, new_barrier;
- int empty = list_empty(&lo->plh_segs);
+ u32 oldseq, newseq, new_barrier = 0;
+ bool invalid = !pnfs_layout_is_valid(lo);
oldseq = be32_to_cpu(lo->plh_stateid.seqid);
newseq = be32_to_cpu(new->seqid);
- if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
+ if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) {
nfs4_stateid_copy(&lo->plh_stateid, new);
- if (update_barrier) {
- new_barrier = be32_to_cpu(new->seqid);
- } else {
- /* Because of wraparound, we want to keep the barrier
- * "close" to the current seqids.
- */
- new_barrier = newseq - atomic_read(&lo->plh_outstanding);
- }
- if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
- lo->plh_barrier = new_barrier;
+ /*
+ * Because of wraparound, we want to keep the barrier
+ * "close" to the current seqids.
+ */
+ new_barrier = newseq - atomic_read(&lo->plh_outstanding);
}
+ if (update_barrier)
+ new_barrier = be32_to_cpu(new->seqid);
+ else if (new_barrier == 0)
+ return;
+ if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
+ lo->plh_barrier = new_barrier;
}
static bool
@@ -873,15 +886,37 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}
+static void
+pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
+{
+ lo->plh_return_iomode = 0;
+ lo->plh_return_seq = 0;
+ clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+}
+
static bool
-pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
+ nfs4_stateid *stateid,
+ enum pnfs_iomode *iomode)
{
if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
return false;
- lo->plh_return_iomode = 0;
- lo->plh_return_seq = 0;
pnfs_get_layout_hdr(lo);
- clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
+ if (stateid != NULL) {
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ if (lo->plh_return_seq != 0)
+ stateid->seqid = cpu_to_be32(lo->plh_return_seq);
+ }
+ if (iomode != NULL)
+ *iomode = lo->plh_return_iomode;
+ pnfs_clear_layoutreturn_info(lo);
+ return true;
+ }
+ if (stateid != NULL)
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ if (iomode != NULL)
+ *iomode = IOMODE_ANY;
return true;
}
@@ -949,10 +984,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
enum pnfs_iomode iomode;
bool send;
- nfs4_stateid_copy(&stateid, &lo->plh_stateid);
- stateid.seqid = cpu_to_be32(lo->plh_return_seq);
- iomode = lo->plh_return_iomode;
- send = pnfs_prepare_layoutreturn(lo);
+ send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
spin_unlock(&inode->i_lock);
if (send) {
/* Send an async layoutreturn so we dont deadlock */
@@ -989,7 +1021,6 @@ _pnfs_return_layout(struct inode *ino)
dprintk("NFS: %s no layout to return\n", __func__);
goto out;
}
- nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
/* Reference matched in nfs4_layoutreturn_release */
pnfs_get_layout_hdr(lo);
empty = list_empty(&lo->plh_segs);
@@ -1012,8 +1043,7 @@ _pnfs_return_layout(struct inode *ino)
goto out_put_layout_hdr;
}
- set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- send = pnfs_prepare_layoutreturn(lo);
+ send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
if (send)
@@ -1080,11 +1110,10 @@ bool pnfs_roc(struct inode *ino)
goto out_noroc;
}
- nfs4_stateid_copy(&stateid, &lo->plh_stateid);
/* always send layoutreturn if being marked so */
- if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED,
- &lo->plh_flags))
- layoutreturn = pnfs_prepare_layoutreturn(lo);
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ layoutreturn = pnfs_prepare_layoutreturn(lo,
+ &stateid, NULL);
list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
/* If we are sending layoutreturn, invalidate all valid lsegs */
@@ -1132,7 +1161,6 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
spin_lock(&ino->i_lock);
lo = NFS_I(ino)->layout;
- pnfs_mark_layout_returned_if_empty(lo);
if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
lo->plh_barrier = barrier;
spin_unlock(&ino->i_lock);
@@ -1505,7 +1533,7 @@ pnfs_update_layout(struct inode *ino,
struct pnfs_layout_segment *lseg = NULL;
nfs4_stateid stateid;
long timeout = 0;
- unsigned long giveup = jiffies + rpc_get_timeout(server->client);
+ unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
bool first;
if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
@@ -1645,33 +1673,44 @@ lookup_again:
lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+ atomic_dec(&lo->plh_outstanding);
if (IS_ERR(lseg)) {
switch(PTR_ERR(lseg)) {
- case -ERECALLCONFLICT:
+ case -EBUSY:
if (time_after(jiffies, giveup))
lseg = NULL;
- /* Fallthrough */
- case -EAGAIN:
- pnfs_put_layout_hdr(lo);
- if (first)
- pnfs_clear_first_layoutget(lo);
- if (lseg) {
- trace_pnfs_update_layout(ino, pos, count,
- iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
- goto lookup_again;
+ break;
+ case -ERECALLCONFLICT:
+ /* Huh? We hold no layouts, how is there a recall? */
+ if (first) {
+ lseg = NULL;
+ break;
}
+ /* Destroy the existing layout and start over */
+ if (time_after(jiffies, giveup))
+ pnfs_destroy_layout(NFS_I(ino));
/* Fallthrough */
+ case -EAGAIN:
+ break;
default:
if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
lseg = NULL;
}
+ goto out_put_layout_hdr;
+ }
+ if (lseg) {
+ if (first)
+ pnfs_clear_first_layoutget(lo);
+ trace_pnfs_update_layout(ino, pos, count,
+ iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
+ pnfs_put_layout_hdr(lo);
+ goto lookup_again;
}
} else {
pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
}
- atomic_dec(&lo->plh_outstanding);
out_put_layout_hdr:
if (first)
pnfs_clear_first_layoutget(lo);
@@ -1735,9 +1774,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
return lseg;
}
- init_lseg(lo, lseg);
- lseg->pls_range = res->range;
- lseg->pls_seq = be32_to_cpu(res->stateid.seqid);
+ pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
spin_lock(&ino->i_lock);
if (pnfs_layoutgets_blocked(lo)) {
@@ -1758,16 +1795,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
* inode invalid, and don't bother validating the stateid
* sequence number.
*/
- pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0);
+ pnfs_mark_layout_stateid_invalid(lo, &free_me);
nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
}
- clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-
pnfs_get_lseg(lseg);
pnfs_layout_insert_lseg(lo, lseg, &free_me);
+ if (!pnfs_layout_is_valid(lo)) {
+ pnfs_clear_layoutreturn_info(lo);
+ clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ }
+
if (res->return_on_close)
set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
@@ -1787,14 +1827,14 @@ static void
pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
u32 seq)
{
- if (lo->plh_return_iomode == iomode)
- return;
- if (lo->plh_return_iomode != 0)
+ if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
iomode = IOMODE_ANY;
lo->plh_return_iomode = iomode;
set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
- if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
+ if (seq != 0) {
+ WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
lo->plh_return_seq = seq;
+ }
}
/**
@@ -1824,7 +1864,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
- if (should_free_lseg(&lseg->pls_range, return_range)) {
+ if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
dprintk("%s: marking lseg %p iomode %d "
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode,
@@ -1855,19 +1895,17 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
bool return_now = false;
spin_lock(&inode->i_lock);
- pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq);
+ pnfs_set_plh_return_info(lo, range.iomode, 0);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- if (!pnfs_mark_matching_lsegs_return(lo, &free_me,
- &range, lseg->pls_seq)) {
+ if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) {
nfs4_stateid stateid;
- enum pnfs_iomode iomode = lo->plh_return_iomode;
+ enum pnfs_iomode iomode;
- nfs4_stateid_copy(&stateid, &lo->plh_stateid);
- return_now = pnfs_prepare_layoutreturn(lo);
+ return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
spin_unlock(&inode->i_lock);
if (return_now)
pnfs_send_layoutreturn(lo, &stateid, iomode, false);
@@ -2382,7 +2420,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
nfs_fattr_init(&data->fattr);
data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
data->res.fattr = &data->fattr;
- data->args.lastbytewritten = end_pos - 1;
+ if (end_pos != 0)
+ data->args.lastbytewritten = end_pos - 1;
+ else
+ data->args.lastbytewritten = U64_MAX;
data->res.server = NFS_SERVER(inode);
if (ld->prepare_layoutcommit) {
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index b21bd0bee784..31d99b2927b0 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -268,6 +268,8 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
const struct pnfs_layout_range *recall_range,
u32 seq);
+int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *lseg_list);
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -375,6 +377,11 @@ static inline bool nfs_have_layout(struct inode *inode)
return NFS_I(inode)->layout != NULL;
}
+static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo)
+{
+ return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0;
+}
+
static inline struct nfs4_deviceid_node *
nfs4_get_deviceid(struct nfs4_deviceid_node *d)
{
@@ -545,19 +552,6 @@ pnfs_calc_offset_length(u64 offset, u64 end)
return 1 + end - offset;
}
-/**
- * pnfs_mark_layout_returned_if_empty - marks the layout as returned
- * @lo: layout header
- *
- * Note: Caller must hold inode->i_lock
- */
-static inline void
-pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo)
-{
- if (list_empty(&lo->plh_segs))
- set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-}
-
static inline void
pnfs_copy_range(struct pnfs_layout_range *dst,
const struct pnfs_layout_range *src)
@@ -629,6 +623,13 @@ pnfs_sync_inode(struct inode *inode, bool datasync)
}
static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+ return false;
+}
+
+
+static inline bool
pnfs_roc(struct inode *ino)
{
return false;
@@ -716,13 +717,6 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
return false;
}
-static inline bool
-pnfs_layoutcommit_outstanding(struct inode *inode)
-{
- return false;
-}
-
-
static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
{
return NULL;
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index b38e3c0dc790..f3468b57a32a 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -595,7 +595,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
}
static struct nfs_client *(*get_v3_ds_connect)(
- struct nfs_client *mds_clp,
+ struct nfs_server *mds_srv,
const struct sockaddr *ds_addr,
int ds_addrlen,
int ds_proto,
@@ -654,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
rpc_clnt_test_and_add_xprt, NULL);
} else
- clp = get_v3_ds_connect(mds_srv->nfs_client,
+ clp = get_v3_ds_connect(mds_srv,
(struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP,
timeo, retrans, au_flavor);
@@ -690,7 +690,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
dprintk("%s: DS %s: trying address %s\n",
__func__, ds->ds_remotestr, da->da_remotestr);
- clp = nfs4_set_ds_client(mds_srv->nfs_client,
+ clp = nfs4_set_ds_client(mds_srv,
(struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP,
timeo, retrans, minor_version,
@@ -940,6 +940,13 @@ EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
int
pnfs_nfs_generic_sync(struct inode *inode, bool datasync)
{
+ int ret;
+
+ if (!pnfs_layoutcommit_outstanding(inode))
+ return 0;
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (ret < 0)
+ return ret;
if (datasync)
return 0;
return pnfs_layoutcommit_inode(inode, true);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2137e0202f25..18d446e1a82b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1684,6 +1684,7 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
{
rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
unsigned int i;
+ int use_auth_null = false;
/*
* If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1691,14 +1692,21 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
*
* AUTH_NULL has a special meaning when it's in the server list - it
* means that the server will ignore the rpc creds, so any flavor
- * can be used.
+ * can be used but still use the sec= that was specified.
*/
for (i = 0; i < count; i++) {
flavor = server_authlist[i];
- if (nfs_auth_info_match(&args->auth_info, flavor) ||
- flavor == RPC_AUTH_NULL)
+ if (nfs_auth_info_match(&args->auth_info, flavor))
goto out;
+
+ if (flavor == RPC_AUTH_NULL)
+ use_auth_null = true;
+ }
+
+ if (use_auth_null) {
+ flavor = RPC_AUTH_NULL;
+ goto out;
}
dfprintk(MOUNT,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 593fa21a02c0..3a6724c6eb5f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -625,7 +625,7 @@ static int nfs_writepage_locked(struct page *page,
int err;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
- nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
+ nfs_pageio_init_write(&pgio, inode, 0,
false, &nfs_async_write_completion_ops);
err = nfs_do_writepage(page, wbc, &pgio, launder);
nfs_pageio_complete(&pgio);
@@ -657,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- unsigned long *bitlock = &NFS_I(inode)->flags;
struct nfs_pageio_descriptor pgio;
int err;
- /* Stop dirtying of new pages while we sync */
- err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING,
- nfs_wait_bit_killable, TASK_KILLABLE);
- if (err)
- goto out_err;
-
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
@@ -674,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
nfs_pageio_complete(&pgio);
- clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
- smp_mb__after_atomic();
- wake_up_bit(bitlock, NFS_INO_FLUSHING);
-
if (err < 0)
goto out_err;
err = pgio.pg_error;
@@ -1195,9 +1184,11 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode)
/*
* Test if the open context credential key is marked to expire soon.
*/
-bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
{
- return rpcauth_cred_key_to_expire(ctx->cred);
+ struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
+
+ return rpcauth_cred_key_to_expire(auth, ctx->cred);
}
/*
@@ -1289,6 +1280,9 @@ int nfs_updatepage(struct file *file, struct page *page,
dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n",
file, count, (long long)(page_file_offset(page) + offset));
+ if (!count)
+ goto out;
+
if (nfs_can_extend_write(file, page, inode)) {
count = max(count + offset, nfs_page_length(page));
offset = 0;
@@ -1299,7 +1293,7 @@ int nfs_updatepage(struct file *file, struct page *page,
nfs_set_pageerror(page);
else
__set_page_dirty_nobuffers(page);
-
+out:
dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
status, (long long)i_size_read(inode));
return status;
@@ -1800,7 +1794,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* Okay, COMMIT succeeded, apparently. Check the verifier
* returned by the server against all stored verfs. */
- if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) {
+ if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) {
/* We have a match */
nfs_inode_remove_request(req);
dprintk(" OK\n");
@@ -1924,6 +1918,24 @@ out_mark_dirty:
EXPORT_SYMBOL_GPL(nfs_write_inode);
/*
+ * Wrapper for filemap_write_and_wait_range()
+ *
+ * Needed for pNFS in order to ensure data becomes visible to the
+ * client.
+ */
+int nfs_filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
+{
+ int ret;
+
+ ret = filemap_write_and_wait_range(mapping, lstart, lend);
+ if (ret == 0)
+ ret = pnfs_sync_inode(mapping->host, true);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range);
+
+/*
* flush the inode to disk.
*/
int nfs_wb_all(struct inode *inode)
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 1a85d94f5b25..2c90e285d7c6 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -622,10 +622,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
lock = nilfs_mdt_bgl_lock(inode, group);
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
- nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu",
- (unsigned long long)req->pr_entry_nr,
- (unsigned long)inode->i_ino);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)req->pr_entry_nr);
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
@@ -663,10 +663,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
lock = nilfs_mdt_bgl_lock(inode, group);
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
- nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu",
- (unsigned long long)req->pr_entry_nr,
- (unsigned long)inode->i_ino);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)req->pr_entry_nr);
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
@@ -772,10 +772,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
do {
if (!nilfs_clear_bit_atomic(lock, group_offset,
bitmap)) {
- nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu",
- (unsigned long long)entry_nrs[j],
- (unsigned long)inode->i_ino);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)entry_nrs[j]);
} else {
n++;
}
@@ -816,12 +816,11 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
for (k = 0; k < nempties; k++) {
ret = nilfs_palloc_delete_entry_block(inode,
last_nrs[k]);
- if (ret && ret != -ENOENT) {
- nilfs_warning(inode->i_sb, __func__,
- "failed to delete block of entry %llu: ino=%lu, err=%d",
- (unsigned long long)last_nrs[k],
- (unsigned long)inode->i_ino, ret);
- }
+ if (ret && ret != -ENOENT)
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "error %d deleting block that object (entry=%llu, ino=%lu) belongs to",
+ ret, (unsigned long long)last_nrs[k],
+ inode->i_ino);
}
desc_kaddr = kmap_atomic(desc_bh->b_page);
@@ -835,12 +834,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
if (nfree == nilfs_palloc_entries_per_group(inode)) {
ret = nilfs_palloc_delete_bitmap_block(inode, group);
- if (ret && ret != -ENOENT) {
- nilfs_warning(inode->i_sb, __func__,
- "failed to delete bitmap block of group %lu: ino=%lu, err=%d",
- group,
- (unsigned long)inode->i_ino, ret);
- }
+ if (ret && ret != -ENOENT)
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "error %d deleting bitmap block of group=%lu, ino=%lu",
+ ret, group, inode->i_ino);
}
}
return 0;
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index f2a7877e0c8c..01fb1831ca25 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -41,8 +41,8 @@ static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
struct inode *inode = bmap->b_inode;
if (err == -EINVAL) {
- nilfs_error(inode->i_sb, fname,
- "broken bmap (inode number=%lu)", inode->i_ino);
+ __nilfs_error(inode->i_sb, fname,
+ "broken bmap (inode number=%lu)", inode->i_ino);
err = -EIO;
}
return err;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b6a4c8f93ac8..2b6ffbe5997a 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -22,7 +22,7 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
+#include <linux/nilfs2_ondisk.h> /* nilfs_binfo, nilfs_inode, etc */
#include "alloc.h"
#include "dat.h"
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 4cca998ec7a0..d5c23da43513 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -41,7 +41,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
struct inode *inode = NILFS_BTNC_I(btnc);
struct buffer_head *bh;
- bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+ bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
if (unlikely(!bh))
return NULL;
@@ -70,7 +70,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
struct page *page;
int err;
- bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+ bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
if (unlikely(!bh))
return -ENOMEM;
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 982d1e3df3a5..2e315f9f2e51 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -339,12 +339,14 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
* nilfs_btree_node_broken - verify consistency of btree node
* @node: btree node block to be examined
* @size: node size (in bytes)
+ * @inode: host inode of btree
* @blocknr: block number
*
* Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
*/
static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
- size_t size, sector_t blocknr)
+ size_t size, struct inode *inode,
+ sector_t blocknr)
{
int level, flags, nchildren;
int ret = 0;
@@ -358,9 +360,10 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
(flags & NILFS_BTREE_NODE_ROOT) ||
nchildren < 0 ||
nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
- printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): "
- "level = %d, flags = 0x%x, nchildren = %d\n",
- (unsigned long long)blocknr, level, flags, nchildren);
+ nilfs_msg(inode->i_sb, KERN_CRIT,
+ "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d",
+ inode->i_ino, (unsigned long long)blocknr, level,
+ flags, nchildren);
ret = 1;
}
return ret;
@@ -369,12 +372,12 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
/**
* nilfs_btree_root_broken - verify consistency of btree root node
* @node: btree root node to be examined
- * @ino: inode number
+ * @inode: host inode of btree
*
* Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
*/
static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
- unsigned long ino)
+ struct inode *inode)
{
int level, flags, nchildren;
int ret = 0;
@@ -387,8 +390,9 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
level >= NILFS_BTREE_LEVEL_MAX ||
nchildren < 0 ||
nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
- pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n",
- ino, level, flags, nchildren);
+ nilfs_msg(inode->i_sb, KERN_CRIT,
+ "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d",
+ inode->i_ino, level, flags, nchildren);
ret = 1;
}
return ret;
@@ -396,13 +400,15 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
int nilfs_btree_broken_node_block(struct buffer_head *bh)
{
+ struct inode *inode;
int ret;
if (buffer_nilfs_checked(bh))
return 0;
+ inode = bh->b_page->mapping->host;
ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data,
- bh->b_size, bh->b_blocknr);
+ bh->b_size, inode, bh->b_blocknr);
if (likely(!ret))
set_buffer_nilfs_checked(bh);
return ret;
@@ -448,13 +454,15 @@ nilfs_btree_get_node(const struct nilfs_bmap *btree,
return node;
}
-static int
-nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
+static int nilfs_btree_bad_node(const struct nilfs_bmap *btree,
+ struct nilfs_btree_node *node, int level)
{
if (unlikely(nilfs_btree_node_get_level(node) != level)) {
dump_stack();
- printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n",
- nilfs_btree_node_get_level(node), level);
+ nilfs_msg(btree->b_inode->i_sb, KERN_CRIT,
+ "btree level mismatch (ino=%lu): %d != %d",
+ btree->b_inode->i_ino,
+ nilfs_btree_node_get_level(node), level);
return 1;
}
return 0;
@@ -509,6 +517,9 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
out_no_wait:
if (!buffer_uptodate(bh)) {
+ nilfs_msg(btree->b_inode->i_sb, KERN_ERR,
+ "I/O error reading b-tree node block (ino=%lu, blocknr=%llu)",
+ btree->b_inode->i_ino, (unsigned long long)ptr);
brelse(bh);
return -EIO;
}
@@ -568,7 +579,7 @@ static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree,
return ret;
node = nilfs_btree_get_nonroot_node(path, level);
- if (nilfs_btree_bad_node(node, level))
+ if (nilfs_btree_bad_node(btree, node, level))
return -EINVAL;
if (!found)
found = nilfs_btree_node_lookup(node, key, &index);
@@ -616,7 +627,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
if (ret < 0)
return ret;
node = nilfs_btree_get_nonroot_node(path, level);
- if (nilfs_btree_bad_node(node, level))
+ if (nilfs_btree_bad_node(btree, node, level))
return -EINVAL;
index = nilfs_btree_node_get_nchildren(node) - 1;
ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
@@ -2072,8 +2083,10 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree,
ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
if (ret < 0) {
if (unlikely(ret == -ENOENT))
- printk(KERN_CRIT "%s: key = %llu, level == %d\n",
- __func__, (unsigned long long)key, level);
+ nilfs_msg(btree->b_inode->i_sb, KERN_CRIT,
+ "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d",
+ btree->b_inode->i_ino,
+ (unsigned long long)key, level);
goto out;
}
@@ -2110,12 +2123,11 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
level >= NILFS_BTREE_LEVEL_MAX) {
dump_stack();
- printk(KERN_WARNING
- "%s: invalid btree level: %d (key=%llu, ino=%lu, "
- "blocknr=%llu)\n",
- __func__, level, (unsigned long long)key,
- NILFS_BMAP_I(btree)->vfs_inode.i_ino,
- (unsigned long long)bh->b_blocknr);
+ nilfs_msg(btree->b_inode->i_sb, KERN_WARNING,
+ "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)",
+ level, (unsigned long long)key,
+ btree->b_inode->i_ino,
+ (unsigned long long)bh->b_blocknr);
return;
}
@@ -2394,8 +2406,7 @@ int nilfs_btree_init(struct nilfs_bmap *bmap)
__nilfs_btree_init(bmap);
- if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap),
- bmap->b_inode->i_ino))
+ if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode))
ret = -EIO;
return ret;
}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index df1a25faa83b..2184e47fa4bf 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -22,7 +22,7 @@
#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/list.h>
-#include <linux/nilfs2_fs.h>
+#include <linux/nilfs2_ondisk.h> /* nilfs_btree_node */
#include "btnode.h"
#include "bmap.h"
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 8a3d3b65af3f..a15a1601e931 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -21,7 +21,6 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/errno.h>
-#include <linux/nilfs2_fs.h>
#include "mdt.h"
#include "cpfile.h"
@@ -332,9 +331,9 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
int ret, ncps, nicps, nss, count, i;
if (unlikely(start == 0 || start > end)) {
- printk(KERN_ERR "%s: invalid range of checkpoint numbers: "
- "[%llu, %llu)\n", __func__,
- (unsigned long long)start, (unsigned long long)end);
+ nilfs_msg(cpfile->i_sb, KERN_ERR,
+ "cannot delete checkpoints: invalid range [%llu, %llu)",
+ (unsigned long long)start, (unsigned long long)end);
return -EINVAL;
}
@@ -386,9 +385,9 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
cpfile, cno);
if (ret == 0)
continue;
- printk(KERN_ERR
- "%s: cannot delete block\n",
- __func__);
+ nilfs_msg(cpfile->i_sb, KERN_ERR,
+ "error %d deleting checkpoint block",
+ ret);
break;
}
}
@@ -991,14 +990,12 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
int err;
if (cpsize > sb->s_blocksize) {
- printk(KERN_ERR
- "NILFS: too large checkpoint size: %zu bytes.\n",
- cpsize);
+ nilfs_msg(sb, KERN_ERR,
+ "too large checkpoint size: %zu bytes", cpsize);
return -EINVAL;
} else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) {
- printk(KERN_ERR
- "NILFS: too small checkpoint size: %zu bytes.\n",
- cpsize);
+ nilfs_msg(sb, KERN_ERR,
+ "too small checkpoint size: %zu bytes", cpsize);
return -EINVAL;
}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 0249744ae234..6eca972f9673 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -21,7 +21,8 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
+#include <linux/nilfs2_api.h> /* nilfs_cpstat */
+#include <linux/nilfs2_ondisk.h> /* nilfs_inode, nilfs_checkpoint */
int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 7367610ea807..dffedb2f8817 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -349,10 +349,11 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
kaddr = kmap_atomic(entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
- printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__,
- (unsigned long long)vblocknr,
- (unsigned long long)le64_to_cpu(entry->de_start),
- (unsigned long long)le64_to_cpu(entry->de_end));
+ nilfs_msg(dat->i_sb, KERN_CRIT,
+ "%s: invalid vblocknr = %llu, [%llu, %llu)",
+ __func__, (unsigned long long)vblocknr,
+ (unsigned long long)le64_to_cpu(entry->de_start),
+ (unsigned long long)le64_to_cpu(entry->de_end));
kunmap_atomic(kaddr);
brelse(entry_bh);
return -EINVAL;
@@ -479,14 +480,12 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
int err;
if (entry_size > sb->s_blocksize) {
- printk(KERN_ERR
- "NILFS: too large DAT entry size: %zu bytes.\n",
- entry_size);
+ nilfs_msg(sb, KERN_ERR, "too large DAT entry size: %zu bytes",
+ entry_size);
return -EINVAL;
} else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) {
- printk(KERN_ERR
- "NILFS: too small DAT entry size: %zu bytes.\n",
- entry_size);
+ nilfs_msg(sb, KERN_ERR, "too small DAT entry size: %zu bytes",
+ entry_size);
return -EINVAL;
}
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index abbfdabcabea..57dc6cf466d0 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -22,6 +22,7 @@
#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
+#include <linux/nilfs2_ondisk.h> /* nilfs_inode, nilfs_checkpoint */
struct nilfs_palloc_req;
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index e506f4f7120a..908ebbf0ac7e 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -42,6 +42,28 @@
#include "nilfs.h"
#include "page.h"
+static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen)
+{
+ unsigned int len = le16_to_cpu(dlen);
+
+#if (PAGE_SIZE >= 65536)
+ if (len == NILFS_MAX_REC_LEN)
+ return 1 << 16;
+#endif
+ return len;
+}
+
+static inline __le16 nilfs_rec_len_to_disk(unsigned int len)
+{
+#if (PAGE_SIZE >= 65536)
+ if (len == (1 << 16))
+ return cpu_to_le16(NILFS_MAX_REC_LEN);
+
+ BUG_ON(len > (1 << 16));
+#endif
+ return cpu_to_le16(len);
+}
+
/*
* nilfs uses block-sized chunks. Arguably, sector-sized ones would be
* more robust, but we have what we have
@@ -140,10 +162,9 @@ out:
/* Too bad, we had an error */
Ebadsize:
- nilfs_error(sb, "nilfs_check_page",
+ nilfs_error(sb,
"size of directory #%lu is not a multiple of chunk size",
- dir->i_ino
- );
+ dir->i_ino);
goto fail;
Eshort:
error = "rec_len is smaller than minimal";
@@ -157,19 +178,18 @@ Enamelen:
Espan:
error = "directory entry across blocks";
bad_entry:
- nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - "
- "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
- (unsigned long) le64_to_cpu(p->inode),
+ nilfs_error(sb,
+ "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+ dir->i_ino, error, (page->index << PAGE_SHIFT) + offs,
+ (unsigned long)le64_to_cpu(p->inode),
rec_len, p->name_len);
goto fail;
Eend:
p = (struct nilfs_dir_entry *)(kaddr + offs);
- nilfs_error(sb, "nilfs_check_page",
- "entry in directory #%lu spans the page boundary"
- "offset=%lu, inode=%lu",
- dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
- (unsigned long) le64_to_cpu(p->inode));
+ nilfs_error(sb,
+ "entry in directory #%lu spans the page boundary offset=%lu, inode=%lu",
+ dir->i_ino, (page->index << PAGE_SHIFT) + offs,
+ (unsigned long)le64_to_cpu(p->inode));
fail:
SetPageError(page);
return false;
@@ -267,8 +287,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
struct page *page = nilfs_get_page(inode, n);
if (IS_ERR(page)) {
- nilfs_error(sb, __func__, "bad page in #%lu",
- inode->i_ino);
+ nilfs_error(sb, "bad page in #%lu", inode->i_ino);
ctx->pos += PAGE_SIZE - offset;
return -EIO;
}
@@ -278,8 +297,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
NILFS_DIR_REC_LEN(1);
for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) {
if (de->rec_len == 0) {
- nilfs_error(sb, __func__,
- "zero-length directory entry");
+ nilfs_error(sb, "zero-length directory entry");
nilfs_put_page(page);
return -EIO;
}
@@ -345,7 +363,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
kaddr += nilfs_last_byte(dir, n) - reclen;
while ((char *) de <= kaddr) {
if (de->rec_len == 0) {
- nilfs_error(dir->i_sb, __func__,
+ nilfs_error(dir->i_sb,
"zero-length directory entry");
nilfs_put_page(page);
goto out;
@@ -360,7 +378,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
n = 0;
/* next page is past the blocks we've got */
if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) {
- nilfs_error(dir->i_sb, __func__,
+ nilfs_error(dir->i_sb,
"dir %lu size %lld exceeds block count %llu",
dir->i_ino, dir->i_size,
(unsigned long long)dir->i_blocks);
@@ -469,7 +487,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
goto got_it;
}
if (de->rec_len == 0) {
- nilfs_error(dir->i_sb, __func__,
+ nilfs_error(dir->i_sb,
"zero-length directory entry");
err = -EIO;
goto out_unlock;
@@ -541,7 +559,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
while ((char *)de < (char *)dir) {
if (de->rec_len == 0) {
- nilfs_error(inode->i_sb, __func__,
+ nilfs_error(inode->i_sb,
"zero-length directory entry");
err = -EIO;
goto out;
@@ -628,7 +646,7 @@ int nilfs_empty_dir(struct inode *inode)
while ((char *)de <= kaddr) {
if (de->rec_len == 0) {
- nilfs_error(inode->i_sb, __func__,
+ nilfs_error(inode->i_sb,
"zero-length directory entry (kaddr=%p, de=%p)",
kaddr, de);
goto not_empty;
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 251a44928405..96e3ed0d9652 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -337,14 +337,16 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
key = nilfs_bmap_data_get_key(bmap, *bh);
if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
- printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
- (unsigned long long)key);
+ nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT,
+ "%s (ino=%lu): invalid key: %llu", __func__,
+ bmap->b_inode->i_ino, (unsigned long long)key);
return -EINVAL;
}
ptr = nilfs_direct_get_ptr(bmap, key);
if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
- printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
- (unsigned long long)ptr);
+ nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT,
+ "%s (ino=%lu): invalid pointer: %llu", __func__,
+ bmap->b_inode->i_ino, (unsigned long long)ptr);
return -EINVAL;
}
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index 3015a6e78724..cfe85e848bba 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -24,16 +24,6 @@
#include "bmap.h"
-/**
- * struct nilfs_direct_node - direct node
- * @dn_flags: flags
- * @dn_pad: padding
- */
-struct nilfs_direct_node {
- __u8 dn_flags;
- __u8 pad[7];
-};
-
#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1)
#define NILFS_DIRECT_KEY_MIN 0
#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e9148f94d696..853a831dcde0 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -148,8 +148,15 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
{
wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
+ if (!buffer_uptodate(bh)) {
+ struct inode *inode = bh->b_page->mapping->host;
+
+ nilfs_msg(inode->i_sb, KERN_ERR,
+ "I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)",
+ buffer_nilfs_node(bh) ? "node" : "data",
+ inode->i_ino, (unsigned long long)bh->b_blocknr);
return -EIO;
+ }
if (buffer_dirty(bh))
return -EEXIST;
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 1d2b1805327a..b8fa45c20c63 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -145,15 +145,14 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
int err;
if (unlikely(!NILFS_VALID_INODE(sb, ino))) {
- nilfs_error(sb, __func__, "bad inode number: %lu",
- (unsigned long) ino);
+ nilfs_error(sb, "bad inode number: %lu", (unsigned long)ino);
return -EINVAL;
}
err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
if (unlikely(err))
- nilfs_warning(sb, __func__, "unable to read inode: %lu",
- (unsigned long) ino);
+ nilfs_msg(sb, KERN_WARNING, "error %d reading inode: ino=%lu",
+ err, (unsigned long)ino);
return err;
}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 23ad2f091e76..188b94fe0ec5 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -23,7 +23,6 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
#include "mdt.h"
#include "alloc.h"
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index a0ebdb17e912..af04f553d7c9 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -112,13 +112,10 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
* However, the page having this block must
* be locked in this case.
*/
- printk(KERN_WARNING
- "nilfs_get_block: a race condition "
- "while inserting a data block. "
- "(inode number=%lu, file block "
- "offset=%llu)\n",
- inode->i_ino,
- (unsigned long long)blkoff);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "%s (ino=%lu): a race condition while inserting a data block at offset=%llu",
+ __func__, inode->i_ino,
+ (unsigned long long)blkoff);
err = 0;
}
nilfs_transaction_abort(inode->i_sb);
@@ -359,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
root = NILFS_I(dir)->i_root;
ii = NILFS_I(inode);
- ii->i_state = 1 << NILFS_I_NEW;
+ ii->i_state = BIT(NILFS_I_NEW);
ii->i_root = root;
err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
@@ -558,7 +555,7 @@ static int nilfs_iget_set(struct inode *inode, void *opaque)
inode->i_ino = args->ino;
if (args->for_gc) {
- NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
+ NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE);
NILFS_I(inode)->i_cno = args->cno;
NILFS_I(inode)->i_root = NULL;
} else {
@@ -726,9 +723,9 @@ repeat:
goto repeat;
failed:
- nilfs_warning(ii->vfs_inode.i_sb, __func__,
- "failed to truncate bmap (ino=%lu, err=%d)",
- ii->vfs_inode.i_ino, ret);
+ nilfs_msg(ii->vfs_inode.i_sb, KERN_WARNING,
+ "error %d truncating bmap (ino=%lu)", ret,
+ ii->vfs_inode.i_ino);
}
void nilfs_truncate(struct inode *inode)
@@ -939,9 +936,9 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty)
* This will happen when somebody is freeing
* this inode.
*/
- nilfs_warning(inode->i_sb, __func__,
- "cannot get inode (ino=%lu)",
- inode->i_ino);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "cannot set file dirty (ino=%lu): the file is being freed",
+ inode->i_ino);
spin_unlock(&nilfs->ns_inode_lock);
return -EINVAL; /*
* NILFS_I_DIRTY may remain for
@@ -962,8 +959,9 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
err = nilfs_load_inode_block(inode, &ibh);
if (unlikely(err)) {
- nilfs_warning(inode->i_sb, __func__,
- "failed to reget inode block.");
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "cannot mark inode dirty (ino=%lu): error %d loading inode block",
+ inode->i_ino, err);
return err;
}
nilfs_update_inode(inode, ibh, flags);
@@ -989,8 +987,8 @@ void nilfs_dirty_inode(struct inode *inode, int flags)
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
if (is_bad_inode(inode)) {
- nilfs_warning(inode->i_sb, __func__,
- "tried to mark bad_inode dirty. ignored.");
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "tried to mark bad_inode dirty. ignored.");
dump_stack();
return;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 358b57e2cdf9..f1d7989459fd 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -25,7 +25,6 @@
#include <linux/compat.h> /* compat_ptr() */
#include <linux/mount.h> /* mnt_want_write_file(), mnt_drop_write_file() */
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
#include "nilfs.h"
#include "segment.h"
#include "bmap.h"
@@ -584,27 +583,25 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
if (unlikely(ret < 0)) {
if (ret == -ENOENT)
- printk(KERN_CRIT
- "%s: invalid virtual block address (%s): "
- "ino=%llu, cno=%llu, offset=%llu, "
- "blocknr=%llu, vblocknr=%llu\n",
- __func__, vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
+ nilfs_msg(inode->i_sb, KERN_CRIT,
+ "%s: invalid virtual block address (%s): ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
return ret;
}
if (unlikely(!list_empty(&bh->b_assoc_buffers))) {
- printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, "
- "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n",
- __func__, vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
+ nilfs_msg(inode->i_sb, KERN_CRIT,
+ "%s: conflicting %s buffer: ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
brelse(bh);
return -EEXIST;
}
@@ -854,8 +851,8 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
return 0;
failed:
- printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
- msg, ret);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR, "error %d preparing GC: %s", ret,
+ msg);
return ret;
}
@@ -963,10 +960,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
}
ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
- if (ret < 0)
- printk(KERN_ERR "NILFS: GC failed during preparation: "
- "cannot read source blocks: err=%d\n", ret);
- else {
+ if (ret < 0) {
+ nilfs_msg(inode->i_sb, KERN_ERR,
+ "error %d preparing GC: cannot read source blocks",
+ ret);
+ } else {
if (nilfs_sb_need_update(nilfs))
set_nilfs_discontinued(nilfs);
ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 0d7b71fbeff8..d56d3a5bea88 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -207,8 +207,12 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
out_no_wait:
err = -EIO;
- if (!buffer_uptodate(first_bh))
+ if (!buffer_uptodate(first_bh)) {
+ nilfs_msg(inode->i_sb, KERN_ERR,
+ "I/O error reading meta-data file (ino=%lu, block-offset=%lu)",
+ inode->i_ino, block);
goto failed_bh;
+ }
out:
*out_bh = first_bh;
return 0;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1ec8ae5995a5..dbcf1dc93a51 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -283,9 +283,9 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
goto out;
if (!inode->i_nlink) {
- nilfs_warning(inode->i_sb, __func__,
- "deleting nonexistent file (%lu), %d",
- inode->i_ino, inode->i_nlink);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "deleting nonexistent file (ino=%lu), %d",
+ inode->i_ino, inode->i_nlink);
set_nlink(inode, 1);
}
err = nilfs_delete_entry(de, page);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index b1d48bc0532d..33f8c8fc96e8 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -23,7 +23,8 @@
#include <linux/buffer_head.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
-#include <linux/nilfs2_fs.h>
+#include <linux/nilfs2_api.h>
+#include <linux/nilfs2_ondisk.h>
#include "the_nilfs.h"
#include "bmap.h"
@@ -119,20 +120,19 @@ enum {
/*
* Macros to check inode numbers
*/
-#define NILFS_MDT_INO_BITS \
- ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \
- 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \
- 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
+#define NILFS_MDT_INO_BITS \
+ (BIT(NILFS_DAT_INO) | BIT(NILFS_CPFILE_INO) | \
+ BIT(NILFS_SUFILE_INO) | BIT(NILFS_IFILE_INO) | \
+ BIT(NILFS_ATIME_INO) | BIT(NILFS_SKETCH_INO))
-#define NILFS_SYS_INO_BITS \
- ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
+#define NILFS_SYS_INO_BITS (BIT(NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
#define NILFS_MDT_INODE(sb, ino) \
- ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
+ ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & BIT(ino)))
#define NILFS_VALID_INODE(sb, ino) \
- ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
+ ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & BIT(ino)))
/**
* struct nilfs_transaction_info: context information for synchronization
@@ -299,10 +299,36 @@ static inline int nilfs_mark_inode_dirty_sync(struct inode *inode)
/* super.c */
extern struct inode *nilfs_alloc_inode(struct super_block *);
extern void nilfs_destroy_inode(struct inode *);
+
extern __printf(3, 4)
-void nilfs_error(struct super_block *, const char *, const char *, ...);
+void __nilfs_msg(struct super_block *sb, const char *level,
+ const char *fmt, ...);
extern __printf(3, 4)
-void nilfs_warning(struct super_block *, const char *, const char *, ...);
+void __nilfs_error(struct super_block *sb, const char *function,
+ const char *fmt, ...);
+
+#ifdef CONFIG_PRINTK
+
+#define nilfs_msg(sb, level, fmt, ...) \
+ __nilfs_msg(sb, level, fmt, ##__VA_ARGS__)
+#define nilfs_error(sb, fmt, ...) \
+ __nilfs_error(sb, __func__, fmt, ##__VA_ARGS__)
+
+#else
+
+#define nilfs_msg(sb, level, fmt, ...) \
+ do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ (void)(sb); \
+ } while (0)
+#define nilfs_error(sb, fmt, ...) \
+ do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __nilfs_error(sb, "", " "); \
+ } while (0)
+
+#endif /* CONFIG_PRINTK */
+
extern struct nilfs_super_block *
nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
extern int nilfs_store_magic_and_option(struct super_block *,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index d97ba5f11b77..f11a3ad2df0c 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -30,9 +30,9 @@
#include "mdt.h"
-#define NILFS_BUFFER_INHERENT_BITS \
- ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
- (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked))
+#define NILFS_BUFFER_INHERENT_BITS \
+ (BIT(BH_Uptodate) | BIT(BH_Mapped) | BIT(BH_NILFS_Node) | \
+ BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked))
static struct buffer_head *
__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
@@ -85,9 +85,9 @@ void nilfs_forget_buffer(struct buffer_head *bh)
{
struct page *page = bh->b_page;
const unsigned long clear_bits =
- (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
- 1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
- 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
+ (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
+ BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
+ BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected));
lock_buffer(bh);
set_mask_bits(&bh->b_state, clear_bits, 0);
@@ -124,17 +124,17 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
dbh->b_bdev = sbh->b_bdev;
bh = dbh;
- bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped));
+ bits = sbh->b_state & (BIT(BH_Uptodate) | BIT(BH_Mapped));
while ((bh = bh->b_this_page) != dbh) {
lock_buffer(bh);
bits &= bh->b_state;
unlock_buffer(bh);
}
- if (bits & (1UL << BH_Uptodate))
+ if (bits & BIT(BH_Uptodate))
SetPageUptodate(dpage);
else
ClearPageUptodate(dpage);
- if (bits & (1UL << BH_Mapped))
+ if (bits & BIT(BH_Mapped))
SetPageMappedToDisk(dpage);
else
ClearPageMappedToDisk(dpage);
@@ -215,7 +215,7 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
create_empty_buffers(dst, sbh->b_size, 0);
if (copy_dirty)
- mask |= (1UL << BH_Dirty);
+ mask |= BIT(BH_Dirty);
dbh = dbufs = page_buffers(dst);
do {
@@ -403,11 +403,10 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
BUG_ON(!PageLocked(page));
- if (!silent) {
- nilfs_warning(sb, __func__,
- "discard page: offset %lld, ino %lu",
- page_offset(page), inode->i_ino);
- }
+ if (!silent)
+ nilfs_msg(sb, KERN_WARNING,
+ "discard dirty page: offset=%lld, ino=%lu",
+ page_offset(page), inode->i_ino);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
@@ -415,18 +414,18 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
if (page_has_buffers(page)) {
struct buffer_head *bh, *head;
const unsigned long clear_bits =
- (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
- 1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
- 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
+ (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
+ BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
+ BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected));
bh = head = page_buffers(page);
do {
lock_buffer(bh);
- if (!silent) {
- nilfs_warning(sb, __func__,
- "discard block %llu, size %zu",
- (u64)bh->b_blocknr, bh->b_size);
- }
+ if (!silent)
+ nilfs_msg(sb, KERN_WARNING,
+ "discard dirty block: blocknr=%llu, size=%zu",
+ (u64)bh->b_blocknr, bh->b_size);
+
set_mask_bits(&bh->b_state, clear_bits, 0);
unlock_buffer(bh);
} while (bh = bh->b_this_page, bh != head);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d893dc912b62..5139efed1888 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -54,38 +54,37 @@ struct nilfs_recovery_block {
};
-static int nilfs_warn_segment_error(int err)
+static int nilfs_warn_segment_error(struct super_block *sb, int err)
{
+ const char *msg = NULL;
+
switch (err) {
case NILFS_SEG_FAIL_IO:
- printk(KERN_WARNING
- "NILFS warning: I/O error on loading last segment\n");
+ nilfs_msg(sb, KERN_ERR, "I/O error reading segment");
return -EIO;
case NILFS_SEG_FAIL_MAGIC:
- printk(KERN_WARNING
- "NILFS warning: Segment magic number invalid\n");
+ msg = "Magic number mismatch";
break;
case NILFS_SEG_FAIL_SEQ:
- printk(KERN_WARNING
- "NILFS warning: Sequence number mismatch\n");
+ msg = "Sequence number mismatch";
break;
case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
- printk(KERN_WARNING
- "NILFS warning: Checksum error in super root\n");
+ msg = "Checksum error in super root";
break;
case NILFS_SEG_FAIL_CHECKSUM_FULL:
- printk(KERN_WARNING
- "NILFS warning: Checksum error in segment payload\n");
+ msg = "Checksum error in segment payload";
break;
case NILFS_SEG_FAIL_CONSISTENCY:
- printk(KERN_WARNING
- "NILFS warning: Inconsistent segment\n");
+ msg = "Inconsistency found";
break;
case NILFS_SEG_NO_SUPER_ROOT:
- printk(KERN_WARNING
- "NILFS warning: No super root in the last segment\n");
+ msg = "No super root in the last segment";
break;
+ default:
+ nilfs_msg(sb, KERN_ERR, "unrecognized segment error %d", err);
+ return -EINVAL;
}
+ nilfs_msg(sb, KERN_WARNING, "invalid segment: %s", msg);
return -EINVAL;
}
@@ -178,7 +177,7 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
brelse(bh_sr);
failed:
- return nilfs_warn_segment_error(ret);
+ return nilfs_warn_segment_error(nilfs->ns_sb, ret);
}
/**
@@ -553,11 +552,10 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
put_page(page);
failed_inode:
- printk(KERN_WARNING
- "NILFS warning: error recovering data block "
- "(err=%d, ino=%lu, block-offset=%llu)\n",
- err, (unsigned long)rb->ino,
- (unsigned long long)rb->blkoff);
+ nilfs_msg(sb, KERN_WARNING,
+ "error %d recovering data block (ino=%lu, block-offset=%llu)",
+ err, (unsigned long)rb->ino,
+ (unsigned long long)rb->blkoff);
if (!err2)
err2 = err;
next:
@@ -680,8 +678,8 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
}
if (nsalvaged_blocks) {
- printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
- sb->s_id, nsalvaged_blocks);
+ nilfs_msg(sb, KERN_INFO, "salvaged %lu blocks",
+ nsalvaged_blocks);
ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
}
out:
@@ -692,10 +690,9 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
confused:
err = -EINVAL;
failed:
- printk(KERN_ERR
- "NILFS (device %s): Error roll-forwarding "
- "(err=%d, pseg block=%llu). ",
- sb->s_id, err, (unsigned long long)pseg_start);
+ nilfs_msg(sb, KERN_ERR,
+ "error %d roll-forwarding partial segment at blocknr = %llu",
+ err, (unsigned long long)pseg_start);
goto out;
}
@@ -715,9 +712,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
set_buffer_dirty(bh);
err = sync_dirty_buffer(bh);
if (unlikely(err))
- printk(KERN_WARNING
- "NILFS warning: buffer sync write failed during "
- "post-cleaning of recovery.\n");
+ nilfs_msg(nilfs->ns_sb, KERN_WARNING,
+ "buffer sync write failed during post-cleaning of recovery.");
brelse(bh);
}
@@ -752,8 +748,8 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root);
if (unlikely(err)) {
- printk(KERN_ERR
- "NILFS: error loading the latest checkpoint.\n");
+ nilfs_msg(sb, KERN_ERR,
+ "error %d loading the latest checkpoint", err);
return err;
}
@@ -764,8 +760,9 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri);
if (unlikely(err)) {
- printk(KERN_ERR "NILFS: Error preparing segments for "
- "recovery.\n");
+ nilfs_msg(sb, KERN_ERR,
+ "error %d preparing segment for recovery",
+ err);
goto failed;
}
@@ -778,8 +775,9 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
nilfs_detach_log_writer(sb);
if (unlikely(err)) {
- printk(KERN_ERR "NILFS: Oops! recovery failed. "
- "(err=%d)\n", err);
+ nilfs_msg(sb, KERN_ERR,
+ "error %d writing segment for recovery",
+ err);
goto failed;
}
@@ -961,5 +959,5 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
failed:
brelse(bh_sum);
nilfs_dispose_segment_list(&segments);
- return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
+ return ret < 0 ? ret : nilfs_warn_segment_error(nilfs->ns_sb, ret);
}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index a962d7d83447..6f87b2ac1aeb 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -514,7 +514,11 @@ static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
} while (--segbuf->sb_nbio > 0);
if (unlikely(atomic_read(&segbuf->sb_err) > 0)) {
- printk(KERN_ERR "NILFS: IO error writing segment\n");
+ nilfs_msg(segbuf->sb_super, KERN_ERR,
+ "I/O error writing log (start-blocknr=%llu, block-count=%lu) in segment %llu",
+ (unsigned long long)segbuf->sb_pseg_start,
+ segbuf->sb_sum.nblocks,
+ (unsigned long long)segbuf->sb_segnum);
err = -EIO;
}
return err;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index e78b68a81aec..bedcae2c28e6 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -150,7 +150,8 @@ static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int);
#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
-static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
+static int nilfs_prepare_segment_lock(struct super_block *sb,
+ struct nilfs_transaction_info *ti)
{
struct nilfs_transaction_info *cur_ti = current->journal_info;
void *save = NULL;
@@ -164,8 +165,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
* it is saved and will be restored on
* nilfs_transaction_commit().
*/
- printk(KERN_WARNING
- "NILFS warning: journal info from a different FS\n");
+ nilfs_msg(sb, KERN_WARNING, "journal info from a different FS");
save = current->journal_info;
}
if (!ti) {
@@ -215,7 +215,7 @@ int nilfs_transaction_begin(struct super_block *sb,
int vacancy_check)
{
struct the_nilfs *nilfs;
- int ret = nilfs_prepare_segment_lock(ti);
+ int ret = nilfs_prepare_segment_lock(sb, ti);
struct nilfs_transaction_info *trace_ti;
if (unlikely(ret < 0))
@@ -373,7 +373,7 @@ static void nilfs_transaction_lock(struct super_block *sb,
nilfs_segctor_do_immediate_flush(sci);
up_write(&nilfs->ns_segctor_sem);
- yield();
+ cond_resched();
}
if (gcflag)
ti->ti_flags |= NILFS_TI_GC;
@@ -1858,11 +1858,11 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
*/
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- const unsigned long set_bits = (1 << BH_Uptodate);
+ const unsigned long set_bits = BIT(BH_Uptodate);
const unsigned long clear_bits =
- (1 << BH_Dirty | 1 << BH_Async_Write |
- 1 << BH_Delay | 1 << BH_NILFS_Volatile |
- 1 << BH_NILFS_Redirected);
+ (BIT(BH_Dirty) | BIT(BH_Async_Write) |
+ BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
+ BIT(BH_NILFS_Redirected));
set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
@@ -1951,8 +1951,9 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
err = nilfs_ifile_get_inode_block(
ifile, ii->vfs_inode.i_ino, &ibh);
if (unlikely(err)) {
- nilfs_warning(sci->sc_super, __func__,
- "failed to get inode block.");
+ nilfs_msg(sci->sc_super, KERN_WARNING,
+ "log writer: error %d getting inode block (ino=%lu)",
+ err, ii->vfs_inode.i_ino);
return err;
}
mark_buffer_dirty(ibh);
@@ -2131,10 +2132,10 @@ static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
{
spin_lock(&sci->sc_state_lock);
- if (!(sci->sc_flush_request & (1 << bn))) {
+ if (!(sci->sc_flush_request & BIT(bn))) {
unsigned long prev_req = sci->sc_flush_request;
- sci->sc_flush_request |= (1 << bn);
+ sci->sc_flush_request |= BIT(bn);
if (!prev_req)
wake_up(&sci->sc_wait_daemon);
}
@@ -2318,7 +2319,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
}
#define FLUSH_FILE_BIT (0x1) /* data file only */
-#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
+#define FLUSH_DAT_BIT BIT(NILFS_DAT_INO) /* DAT only */
/**
* nilfs_segctor_accept - record accepted sequence count of log-write requests
@@ -2458,8 +2459,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
if (likely(!err))
break;
- nilfs_warning(sb, __func__,
- "segment construction failed. (err=%d)", err);
+ nilfs_msg(sb, KERN_WARNING, "error %d cleaning segments", err);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(sci->sc_interval);
}
@@ -2467,9 +2467,9 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
sci->sc_nfreesegs);
if (ret) {
- printk(KERN_WARNING
- "NILFS warning: error %d on discard request, "
- "turning discards off for the device\n", ret);
+ nilfs_msg(sb, KERN_WARNING,
+ "error %d on discard request, turning discards off for the device",
+ ret);
nilfs_clear_opt(nilfs, DISCARD);
}
}
@@ -2551,10 +2551,9 @@ static int nilfs_segctor_thread(void *arg)
/* start sync. */
sci->sc_task = current;
wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
- printk(KERN_INFO
- "segctord starting. Construction interval = %lu seconds, "
- "CP frequency < %lu seconds\n",
- sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
+ nilfs_msg(sci->sc_super, KERN_INFO,
+ "segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds",
+ sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
spin_lock(&sci->sc_state_lock);
loop:
@@ -2628,8 +2627,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
if (IS_ERR(t)) {
int err = PTR_ERR(t);
- printk(KERN_ERR "NILFS: error %d creating segctord thread\n",
- err);
+ nilfs_msg(sci->sc_super, KERN_ERR,
+ "error %d creating segctord thread", err);
return err;
}
wait_event(sci->sc_wait_task, sci->sc_task != NULL);
@@ -2739,14 +2738,14 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
nilfs_segctor_write_out(sci);
if (!list_empty(&sci->sc_dirty_files)) {
- nilfs_warning(sci->sc_super, __func__,
- "dirty file(s) after the final construction");
+ nilfs_msg(sci->sc_super, KERN_WARNING,
+ "disposed unprocessed dirty file(s) when stopping log writer");
nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
}
if (!list_empty(&sci->sc_iput_queue)) {
- nilfs_warning(sci->sc_super, __func__,
- "iput queue is not empty");
+ nilfs_msg(sci->sc_super, KERN_WARNING,
+ "disposed unprocessed inode(s) in iput queue when stopping log writer");
nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
}
@@ -2822,8 +2821,8 @@ void nilfs_detach_log_writer(struct super_block *sb)
spin_lock(&nilfs->ns_inode_lock);
if (!list_empty(&nilfs->ns_dirty_files)) {
list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
- nilfs_warning(sb, __func__,
- "Hit dirty file after stopped log writer");
+ nilfs_msg(sb, KERN_WARNING,
+ "disposed unprocessed dirty file(s) when detaching log writer");
}
spin_unlock(&nilfs->ns_inode_lock);
up_write(&nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 6565c10b7b76..1060949d7dd2 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -23,7 +23,6 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
-#include <linux/nilfs2_fs.h>
#include "nilfs.h"
struct nilfs_root;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 1963595a1580..1541a1e9221a 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -22,7 +22,6 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/errno.h>
-#include <linux/nilfs2_fs.h>
#include "mdt.h"
#include "sufile.h"
@@ -181,9 +180,9 @@ int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs,
down_write(&NILFS_MDT(sufile)->mi_sem);
for (seg = segnumv; seg < segnumv + nsegs; seg++) {
if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) {
- printk(KERN_WARNING
- "%s: invalid segment number: %llu\n", __func__,
- (unsigned long long)*seg);
+ nilfs_msg(sufile->i_sb, KERN_WARNING,
+ "%s: invalid segment number: %llu",
+ __func__, (unsigned long long)*seg);
nerr++;
}
}
@@ -240,8 +239,9 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
int ret;
if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
- printk(KERN_WARNING "%s: invalid segment number: %llu\n",
- __func__, (unsigned long long)segnum);
+ nilfs_msg(sufile->i_sb, KERN_WARNING,
+ "%s: invalid segment number: %llu",
+ __func__, (unsigned long long)segnum);
return -EINVAL;
}
down_write(&NILFS_MDT(sufile)->mi_sem);
@@ -419,8 +419,9 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (unlikely(!nilfs_segment_usage_clean(su))) {
- printk(KERN_WARNING "%s: segment %llu must be clean\n",
- __func__, (unsigned long long)segnum);
+ nilfs_msg(sufile->i_sb, KERN_WARNING,
+ "%s: segment %llu must be clean", __func__,
+ (unsigned long long)segnum);
kunmap_atomic(kaddr);
return;
}
@@ -444,7 +445,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
- if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) &&
+ if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) &&
su->su_nblocks == cpu_to_le32(0)) {
kunmap_atomic(kaddr);
return;
@@ -455,7 +456,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
/* make the segment garbage */
su->su_lastmod = cpu_to_le64(0);
su->su_nblocks = cpu_to_le32(0);
- su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY);
+ su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY));
kunmap_atomic(kaddr);
nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
@@ -476,8 +477,9 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (nilfs_segment_usage_clean(su)) {
- printk(KERN_WARNING "%s: segment %llu is already clean\n",
- __func__, (unsigned long long)segnum);
+ nilfs_msg(sufile->i_sb, KERN_WARNING,
+ "%s: segment %llu is already clean",
+ __func__, (unsigned long long)segnum);
kunmap_atomic(kaddr);
return;
}
@@ -692,7 +694,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
su2 = su;
for (j = 0; j < n; j++, su = (void *)su + susz) {
if ((le32_to_cpu(su->su_flags) &
- ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) ||
+ ~BIT(NILFS_SEGMENT_USAGE_ERROR)) ||
nilfs_segment_is_active(nilfs, segnum + j)) {
ret = -EBUSY;
kunmap_atomic(kaddr);
@@ -859,10 +861,10 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
si->sui_lastmod = le64_to_cpu(su->su_lastmod);
si->sui_nblocks = le32_to_cpu(su->su_nblocks);
si->sui_flags = le32_to_cpu(su->su_flags) &
- ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+ ~BIT(NILFS_SEGMENT_USAGE_ACTIVE);
if (nilfs_segment_is_active(nilfs, segnum + j))
si->sui_flags |=
- (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+ BIT(NILFS_SEGMENT_USAGE_ACTIVE);
}
kunmap_atomic(kaddr);
brelse(su_bh);
@@ -950,7 +952,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
* disk.
*/
sup->sup_sui.sui_flags &=
- ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+ ~BIT(NILFS_SEGMENT_USAGE_ACTIVE);
cleansi = nilfs_suinfo_clean(&sup->sup_sui);
cleansu = nilfs_segment_usage_clean(su);
@@ -1175,14 +1177,12 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
int err;
if (susize > sb->s_blocksize) {
- printk(KERN_ERR
- "NILFS: too large segment usage size: %zu bytes.\n",
- susize);
+ nilfs_msg(sb, KERN_ERR,
+ "too large segment usage size: %zu bytes", susize);
return -EINVAL;
} else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) {
- printk(KERN_ERR
- "NILFS: too small segment usage size: %zu bytes.\n",
- susize);
+ nilfs_msg(sb, KERN_ERR,
+ "too small segment usage size: %zu bytes", susize);
return -EINVAL;
}
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 46e89872294c..158a9190c8ec 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -21,7 +21,6 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
#include "mdt.h"
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 666107a18a22..c95d369e90aa 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -71,6 +71,22 @@ struct kmem_cache *nilfs_btree_path_cache;
static int nilfs_setup_super(struct super_block *sb, int is_mount);
static int nilfs_remount(struct super_block *sb, int *flags, char *data);
+void __nilfs_msg(struct super_block *sb, const char *level, const char *fmt,
+ ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (sb)
+ printk("%sNILFS (%s): %pV\n", level, sb->s_id, &vaf);
+ else
+ printk("%sNILFS: %pV\n", level, &vaf);
+ va_end(args);
+}
+
static void nilfs_set_error(struct super_block *sb)
{
struct the_nilfs *nilfs = sb->s_fs_info;
@@ -91,19 +107,20 @@ static void nilfs_set_error(struct super_block *sb)
}
/**
- * nilfs_error() - report failure condition on a filesystem
+ * __nilfs_error() - report failure condition on a filesystem
+ *
+ * __nilfs_error() sets an ERROR_FS flag on the superblock as well as
+ * reporting an error message. This function should be called when
+ * NILFS detects incoherences or defects of meta data on disk.
*
- * nilfs_error() sets an ERROR_FS flag on the superblock as well as
- * reporting an error message. It should be called when NILFS detects
- * incoherences or defects of meta data on disk. As for sustainable
- * errors such as a single-shot I/O error, nilfs_warning() or the printk()
- * function should be used instead.
+ * This implements the body of nilfs_error() macro. Normally,
+ * nilfs_error() should be used. As for sustainable errors such as a
+ * single-shot I/O error, nilfs_msg() should be used instead.
*
- * The segment constructor must not call this function because it can
- * kill itself.
+ * Callers should not add a trailing newline since this will do it.
*/
-void nilfs_error(struct super_block *sb, const char *function,
- const char *fmt, ...)
+void __nilfs_error(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
struct the_nilfs *nilfs = sb->s_fs_info;
struct va_format vaf;
@@ -133,24 +150,6 @@ void nilfs_error(struct super_block *sb, const char *function,
sb->s_id);
}
-void nilfs_warning(struct super_block *sb, const char *function,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n",
- sb->s_id, function, &vaf);
-
- va_end(args);
-}
-
-
struct inode *nilfs_alloc_inode(struct super_block *sb)
{
struct nilfs_inode_info *ii;
@@ -196,8 +195,8 @@ static int nilfs_sync_super(struct super_block *sb, int flag)
}
if (unlikely(err)) {
- printk(KERN_ERR
- "NILFS: unable to write superblock (err=%d)\n", err);
+ nilfs_msg(sb, KERN_ERR, "unable to write superblock: err=%d",
+ err);
if (err == -EIO && nilfs->ns_sbh[1]) {
/*
* sbp[0] points to newer log than sbp[1],
@@ -267,8 +266,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) {
memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
} else {
- printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
- sb->s_id);
+ nilfs_msg(sb, KERN_CRIT, "superblock broke");
return NULL;
}
} else if (sbp[1] &&
@@ -378,9 +376,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
offset = sb2off & (nilfs->ns_blocksize - 1);
nsbh = sb_getblk(sb, newblocknr);
if (!nsbh) {
- printk(KERN_WARNING
- "NILFS warning: unable to move secondary superblock "
- "to block %llu\n", (unsigned long long)newblocknr);
+ nilfs_msg(sb, KERN_WARNING,
+ "unable to move secondary superblock to block %llu",
+ (unsigned long long)newblocknr);
ret = -EIO;
goto out;
}
@@ -543,10 +541,9 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
up_read(&nilfs->ns_segctor_sem);
if (unlikely(err)) {
if (err == -ENOENT || err == -EINVAL) {
- printk(KERN_ERR
- "NILFS: Invalid checkpoint "
- "(checkpoint number=%llu)\n",
- (unsigned long long)cno);
+ nilfs_msg(sb, KERN_ERR,
+ "Invalid checkpoint (checkpoint number=%llu)",
+ (unsigned long long)cno);
err = -EINVAL;
}
goto failed;
@@ -642,9 +639,8 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
err = nilfs_ifile_count_free_inodes(root->ifile,
&nmaxinodes, &nfreeinodes);
if (unlikely(err)) {
- printk(KERN_WARNING
- "NILFS warning: fail to count free inodes: err %d.\n",
- err);
+ nilfs_msg(sb, KERN_WARNING,
+ "failed to count free inodes: err=%d", err);
if (err == -ERANGE) {
/*
* If nilfs_palloc_count_max_entries() returns
@@ -776,9 +772,9 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
break;
case Opt_snapshot:
if (is_remount) {
- printk(KERN_ERR
- "NILFS: \"%s\" option is invalid "
- "for remount.\n", p);
+ nilfs_msg(sb, KERN_ERR,
+ "\"%s\" option is invalid for remount",
+ p);
return 0;
}
break;
@@ -792,8 +788,8 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
nilfs_clear_opt(nilfs, DISCARD);
break;
default:
- printk(KERN_ERR
- "NILFS: Unrecognized mount option \"%s\"\n", p);
+ nilfs_msg(sb, KERN_ERR,
+ "unrecognized mount option \"%s\"", p);
return 0;
}
}
@@ -829,12 +825,10 @@ static int nilfs_setup_super(struct super_block *sb, int is_mount)
mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
- printk(KERN_WARNING
- "NILFS warning: mounting fs with errors\n");
+ nilfs_msg(sb, KERN_WARNING, "mounting fs with errors");
#if 0
} else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
- printk(KERN_WARNING
- "NILFS warning: maximal mount count reached\n");
+ nilfs_msg(sb, KERN_WARNING, "maximal mount count reached");
#endif
}
if (!max_mnt_count)
@@ -897,17 +891,17 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
features = le64_to_cpu(sbp->s_feature_incompat) &
~NILFS_FEATURE_INCOMPAT_SUPP;
if (features) {
- printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
- "optional features (%llx)\n",
- (unsigned long long)features);
+ nilfs_msg(sb, KERN_ERR,
+ "couldn't mount because of unsupported optional features (%llx)",
+ (unsigned long long)features);
return -EINVAL;
}
features = le64_to_cpu(sbp->s_feature_compat_ro) &
~NILFS_FEATURE_COMPAT_RO_SUPP;
if (!(sb->s_flags & MS_RDONLY) && features) {
- printk(KERN_ERR "NILFS: couldn't mount RDWR because of "
- "unsupported optional features (%llx)\n",
- (unsigned long long)features);
+ nilfs_msg(sb, KERN_ERR,
+ "couldn't mount RDWR because of unsupported optional features (%llx)",
+ (unsigned long long)features);
return -EINVAL;
}
return 0;
@@ -923,13 +917,13 @@ static int nilfs_get_root_dentry(struct super_block *sb,
inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
if (IS_ERR(inode)) {
- printk(KERN_ERR "NILFS: get root inode failed\n");
ret = PTR_ERR(inode);
+ nilfs_msg(sb, KERN_ERR, "error %d getting root inode", ret);
goto out;
}
if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
iput(inode);
- printk(KERN_ERR "NILFS: corrupt root inode.\n");
+ nilfs_msg(sb, KERN_ERR, "corrupt root inode");
ret = -EINVAL;
goto out;
}
@@ -957,7 +951,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
return ret;
failed_dentry:
- printk(KERN_ERR "NILFS: get root dentry failed\n");
+ nilfs_msg(sb, KERN_ERR, "error %d getting root dentry", ret);
goto out;
}
@@ -977,18 +971,18 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
ret = (ret == -ENOENT) ? -EINVAL : ret;
goto out;
} else if (!ret) {
- printk(KERN_ERR "NILFS: The specified checkpoint is "
- "not a snapshot (checkpoint number=%llu).\n",
- (unsigned long long)cno);
+ nilfs_msg(s, KERN_ERR,
+ "The specified checkpoint is not a snapshot (checkpoint number=%llu)",
+ (unsigned long long)cno);
ret = -EINVAL;
goto out;
}
ret = nilfs_attach_checkpoint(s, cno, false, &root);
if (ret) {
- printk(KERN_ERR "NILFS: error loading snapshot "
- "(checkpoint number=%llu).\n",
- (unsigned long long)cno);
+ nilfs_msg(s, KERN_ERR,
+ "error %d while loading snapshot (checkpoint number=%llu)",
+ ret, (unsigned long long)cno);
goto out;
}
ret = nilfs_get_root_dentry(s, root, root_dentry);
@@ -1058,7 +1052,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
__u64 cno;
int err;
- nilfs = alloc_nilfs(sb->s_bdev);
+ nilfs = alloc_nilfs(sb);
if (!nilfs)
return -ENOMEM;
@@ -1083,8 +1077,9 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
cno = nilfs_last_cno(nilfs);
err = nilfs_attach_checkpoint(sb, cno, true, &fsroot);
if (err) {
- printk(KERN_ERR "NILFS: error loading last checkpoint "
- "(checkpoint number=%llu).\n", (unsigned long long)cno);
+ nilfs_msg(sb, KERN_ERR,
+ "error %d while loading last checkpoint (checkpoint number=%llu)",
+ err, (unsigned long long)cno);
goto failed_unload;
}
@@ -1144,9 +1139,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
if (!nilfs_valid_fs(nilfs)) {
- printk(KERN_WARNING "NILFS (device %s): couldn't "
- "remount because the filesystem is in an "
- "incomplete recovery state.\n", sb->s_id);
+ nilfs_msg(sb, KERN_WARNING,
+ "couldn't remount because the filesystem is in an incomplete recovery state");
goto restore_opts;
}
@@ -1178,10 +1172,9 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
~NILFS_FEATURE_COMPAT_RO_SUPP;
up_read(&nilfs->ns_sem);
if (features) {
- printk(KERN_WARNING "NILFS (device %s): couldn't "
- "remount RDWR because of unsupported optional "
- "features (%llx)\n",
- sb->s_id, (unsigned long long)features);
+ nilfs_msg(sb, KERN_WARNING,
+ "couldn't remount RDWR because of unsupported optional features (%llx)",
+ (unsigned long long)features);
err = -EROFS;
goto restore_opts;
}
@@ -1212,6 +1205,38 @@ struct nilfs_super_data {
int flags;
};
+static int nilfs_parse_snapshot_option(const char *option,
+ const substring_t *arg,
+ struct nilfs_super_data *sd)
+{
+ unsigned long long val;
+ const char *msg = NULL;
+ int err;
+
+ if (!(sd->flags & MS_RDONLY)) {
+ msg = "read-only option is not specified";
+ goto parse_error;
+ }
+
+ err = kstrtoull(arg->from, 0, &val);
+ if (err) {
+ if (err == -ERANGE)
+ msg = "too large checkpoint number";
+ else
+ msg = "malformed argument";
+ goto parse_error;
+ } else if (val == 0) {
+ msg = "invalid checkpoint number 0";
+ goto parse_error;
+ }
+ sd->cno = val;
+ return 0;
+
+parse_error:
+ nilfs_msg(NULL, KERN_ERR, "invalid option \"%s\": %s", option, msg);
+ return 1;
+}
+
/**
* nilfs_identify - pre-read mount options needed to identify mount instance
* @data: mount options
@@ -1228,24 +1253,9 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
p = strsep(&options, ",");
if (p != NULL && *p) {
token = match_token(p, tokens, args);
- if (token == Opt_snapshot) {
- if (!(sd->flags & MS_RDONLY)) {
- ret++;
- } else {
- sd->cno = simple_strtoull(args[0].from,
- NULL, 0);
- /*
- * No need to see the end pointer;
- * match_token() has done syntax
- * checking.
- */
- if (sd->cno == 0)
- ret++;
- }
- }
- if (ret)
- printk(KERN_ERR
- "NILFS: invalid mount option: %s\n", p);
+ if (token == Opt_snapshot)
+ ret = nilfs_parse_snapshot_option(p, &args[0],
+ sd);
}
if (!options)
break;
@@ -1326,10 +1336,10 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
} else if (!sd.cno) {
if (nilfs_tree_is_busy(s->s_root)) {
if ((flags ^ s->s_flags) & MS_RDONLY) {
- printk(KERN_ERR "NILFS: the device already "
- "has a %s mount.\n",
- (s->s_flags & MS_RDONLY) ?
- "read-only" : "read/write");
+ nilfs_msg(s, KERN_ERR,
+ "the device already has a %s mount.",
+ (s->s_flags & MS_RDONLY) ?
+ "read-only" : "read/write");
err = -EBUSY;
goto failed_super;
}
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 8ffa42b704d8..490303e3d517 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -272,8 +272,8 @@ nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr,
err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
- err);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unable to get checkpoint stat: err=%d", err);
return err;
}
@@ -295,8 +295,8 @@ nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr,
err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
- err);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unable to get checkpoint stat: err=%d", err);
return err;
}
@@ -326,9 +326,9 @@ nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr,
{
__u64 cno;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
cno = nilfs->ns_cno;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
}
@@ -414,8 +414,8 @@ nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr,
err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- printk(KERN_ERR "NILFS: unable to get segment stat: err=%d\n",
- err);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unable to get segment stat: err=%d", err);
return err;
}
@@ -511,9 +511,9 @@ nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr,
{
u64 seg_seq;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
seg_seq = nilfs->ns_seg_seq;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq);
}
@@ -525,9 +525,9 @@ nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr,
{
__u64 segnum;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
segnum = nilfs->ns_segnum;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", segnum);
}
@@ -539,9 +539,9 @@ nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr,
{
__u64 nextnum;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
nextnum = nilfs->ns_nextnum;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum);
}
@@ -553,9 +553,9 @@ nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr,
{
unsigned long pseg_offset;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
pseg_offset = nilfs->ns_pseg_offset;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset);
}
@@ -567,9 +567,9 @@ nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr,
{
__u64 cno;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
cno = nilfs->ns_cno;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
}
@@ -581,9 +581,9 @@ nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr,
{
time_t ctime;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
ctime = nilfs->ns_ctime;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return NILFS_SHOW_TIME(ctime, buf);
}
@@ -595,9 +595,9 @@ nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr,
{
time_t ctime;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
ctime = nilfs->ns_ctime;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime);
}
@@ -609,9 +609,9 @@ nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr,
{
time_t nongc_ctime;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
nongc_ctime = nilfs->ns_nongc_ctime;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return NILFS_SHOW_TIME(nongc_ctime, buf);
}
@@ -623,9 +623,9 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr,
{
time_t nongc_ctime;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
nongc_ctime = nilfs->ns_nongc_ctime;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long)nongc_ctime);
@@ -638,9 +638,9 @@ nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr,
{
u32 ndirtyblks;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks);
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks);
}
@@ -789,14 +789,15 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr,
err = kstrtouint(skip_spaces(buf), 0, &val);
if (err) {
- printk(KERN_ERR "NILFS: unable to convert string: err=%d\n",
- err);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unable to convert string: err=%d", err);
return err;
}
if (val < NILFS_SB_FREQ) {
val = NILFS_SB_FREQ;
- printk(KERN_WARNING "NILFS: superblock update frequency cannot be lesser than 10 seconds\n");
+ nilfs_msg(nilfs->ns_sb, KERN_WARNING,
+ "superblock update frequency cannot be lesser than 10 seconds");
}
down_write(&nilfs->ns_sem);
@@ -999,7 +1000,8 @@ int nilfs_sysfs_create_device_group(struct super_block *sb)
nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL);
if (unlikely(!nilfs->ns_dev_subgroups)) {
err = -ENOMEM;
- printk(KERN_ERR "NILFS: unable to allocate memory for device group\n");
+ nilfs_msg(sb, KERN_ERR,
+ "unable to allocate memory for device group");
goto failed_create_device_group;
}
@@ -1109,15 +1111,15 @@ int __init nilfs_sysfs_init(void)
nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj);
if (!nilfs_kset) {
err = -ENOMEM;
- printk(KERN_ERR "NILFS: unable to create sysfs entry: err %d\n",
- err);
+ nilfs_msg(NULL, KERN_ERR,
+ "unable to create sysfs entry: err=%d", err);
goto failed_sysfs_init;
}
err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
if (unlikely(err)) {
- printk(KERN_ERR "NILFS: unable to create feature group: err %d\n",
- err);
+ nilfs_msg(NULL, KERN_ERR,
+ "unable to create feature group: err=%d", err);
goto cleanup_sysfs_init;
}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index e9fd241b9a0a..2dd75bf619ad 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -56,12 +56,12 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
/**
* alloc_nilfs - allocate a nilfs object
- * @bdev: block device to which the_nilfs is related
+ * @sb: super block instance
*
* Return Value: On success, pointer to the_nilfs is returned.
* On error, NULL is returned.
*/
-struct the_nilfs *alloc_nilfs(struct block_device *bdev)
+struct the_nilfs *alloc_nilfs(struct super_block *sb)
{
struct the_nilfs *nilfs;
@@ -69,7 +69,8 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
if (!nilfs)
return NULL;
- nilfs->ns_bdev = bdev;
+ nilfs->ns_sb = sb;
+ nilfs->ns_bdev = sb->s_bdev;
atomic_set(&nilfs->ns_ndirtyblks, 0);
init_rwsem(&nilfs->ns_sem);
mutex_init(&nilfs->ns_snapshot_mount_mutex);
@@ -191,7 +192,10 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
nilfs->ns_cno = nilfs->ns_last_cno + 1;
if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
- printk(KERN_ERR "NILFS invalid last segment number.\n");
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "pointed segment number is out of range: segnum=%llu, nsegments=%lu",
+ (unsigned long long)nilfs->ns_segnum,
+ nilfs->ns_nsegments);
ret = -EINVAL;
}
return ret;
@@ -215,12 +219,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
int err;
if (!valid_fs) {
- printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
+ nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs");
if (s_flags & MS_RDONLY) {
- printk(KERN_INFO "NILFS: INFO: recovery "
- "required for readonly filesystem.\n");
- printk(KERN_INFO "NILFS: write access will "
- "be enabled during recovery.\n");
+ nilfs_msg(sb, KERN_INFO,
+ "recovery required for readonly filesystem");
+ nilfs_msg(sb, KERN_INFO,
+ "write access will be enabled during recovery");
}
}
@@ -235,13 +239,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
goto scan_error;
if (!nilfs_valid_sb(sbp[1])) {
- printk(KERN_WARNING
- "NILFS warning: unable to fall back to spare"
- "super block\n");
+ nilfs_msg(sb, KERN_WARNING,
+ "unable to fall back to spare super block");
goto scan_error;
}
- printk(KERN_INFO
- "NILFS: try rollback from an earlier position\n");
+ nilfs_msg(sb, KERN_INFO,
+ "trying rollback from an earlier position");
/*
* restore super block with its spare and reconfigure
@@ -254,10 +257,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
/* verify consistency between two super blocks */
blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
if (blocksize != nilfs->ns_blocksize) {
- printk(KERN_WARNING
- "NILFS warning: blocksize differs between "
- "two super blocks (%d != %d)\n",
- blocksize, nilfs->ns_blocksize);
+ nilfs_msg(sb, KERN_WARNING,
+ "blocksize differs between two super blocks (%d != %d)",
+ blocksize, nilfs->ns_blocksize);
goto scan_error;
}
@@ -276,7 +278,8 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root);
if (unlikely(err)) {
- printk(KERN_ERR "NILFS: error loading super root.\n");
+ nilfs_msg(sb, KERN_ERR, "error %d while loading super root",
+ err);
goto failed;
}
@@ -287,30 +290,29 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
__u64 features;
if (nilfs_test_opt(nilfs, NORECOVERY)) {
- printk(KERN_INFO "NILFS: norecovery option specified. "
- "skipping roll-forward recovery\n");
+ nilfs_msg(sb, KERN_INFO,
+ "norecovery option specified, skipping roll-forward recovery");
goto skip_recovery;
}
features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
~NILFS_FEATURE_COMPAT_RO_SUPP;
if (features) {
- printk(KERN_ERR "NILFS: couldn't proceed with "
- "recovery because of unsupported optional "
- "features (%llx)\n",
- (unsigned long long)features);
+ nilfs_msg(sb, KERN_ERR,
+ "couldn't proceed with recovery because of unsupported optional features (%llx)",
+ (unsigned long long)features);
err = -EROFS;
goto failed_unload;
}
if (really_read_only) {
- printk(KERN_ERR "NILFS: write access "
- "unavailable, cannot proceed.\n");
+ nilfs_msg(sb, KERN_ERR,
+ "write access unavailable, cannot proceed");
err = -EROFS;
goto failed_unload;
}
sb->s_flags &= ~MS_RDONLY;
} else if (nilfs_test_opt(nilfs, NORECOVERY)) {
- printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
- "option was specified for a read/write mount\n");
+ nilfs_msg(sb, KERN_ERR,
+ "recovery cancelled because norecovery option was specified for a read/write mount");
err = -EINVAL;
goto failed_unload;
}
@@ -325,11 +327,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
up_write(&nilfs->ns_sem);
if (err) {
- printk(KERN_ERR "NILFS: failed to update super block. "
- "recovery unfinished.\n");
+ nilfs_msg(sb, KERN_ERR,
+ "error %d updating super block. recovery unfinished.",
+ err);
goto failed_unload;
}
- printk(KERN_INFO "NILFS: recovery complete.\n");
+ nilfs_msg(sb, KERN_INFO, "recovery complete");
skip_recovery:
nilfs_clear_recovery_info(&ri);
@@ -337,7 +340,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
return 0;
scan_error:
- printk(KERN_ERR "NILFS: error searching super root.\n");
+ nilfs_msg(sb, KERN_ERR, "error %d while searching super root", err);
goto failed;
failed_unload:
@@ -384,12 +387,11 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
struct nilfs_super_block *sbp)
{
if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
- printk(KERN_ERR "NILFS: unsupported revision "
- "(superblock rev.=%d.%d, current rev.=%d.%d). "
- "Please check the version of mkfs.nilfs.\n",
- le32_to_cpu(sbp->s_rev_level),
- le16_to_cpu(sbp->s_minor_rev_level),
- NILFS_CURRENT_REV, NILFS_MINOR_REV);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). Please check the version of mkfs.nilfs(2).",
+ le32_to_cpu(sbp->s_rev_level),
+ le16_to_cpu(sbp->s_minor_rev_level),
+ NILFS_CURRENT_REV, NILFS_MINOR_REV);
return -EINVAL;
}
nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes);
@@ -398,12 +400,14 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
if (nilfs->ns_inode_size > nilfs->ns_blocksize) {
- printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n",
- nilfs->ns_inode_size);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "too large inode size: %d bytes",
+ nilfs->ns_inode_size);
return -EINVAL;
} else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) {
- printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n",
- nilfs->ns_inode_size);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "too small inode size: %d bytes",
+ nilfs->ns_inode_size);
return -EINVAL;
}
@@ -411,7 +415,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
- printk(KERN_ERR "NILFS: too short segment.\n");
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "too short segment: %lu blocks",
+ nilfs->ns_blocks_per_segment);
return -EINVAL;
}
@@ -420,7 +426,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
le32_to_cpu(sbp->s_r_segments_percentage);
if (nilfs->ns_r_segments_percentage < 1 ||
nilfs->ns_r_segments_percentage > 99) {
- printk(KERN_ERR "NILFS: invalid reserved segments percentage.\n");
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "invalid reserved segments percentage: %lu",
+ nilfs->ns_r_segments_percentage);
return -EINVAL;
}
@@ -504,16 +512,16 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
if (!sbp[0]) {
if (!sbp[1]) {
- printk(KERN_ERR "NILFS: unable to read superblock\n");
+ nilfs_msg(sb, KERN_ERR, "unable to read superblock");
return -EIO;
}
- printk(KERN_WARNING
- "NILFS warning: unable to read primary superblock "
- "(blocksize = %d)\n", blocksize);
+ nilfs_msg(sb, KERN_WARNING,
+ "unable to read primary superblock (blocksize = %d)",
+ blocksize);
} else if (!sbp[1]) {
- printk(KERN_WARNING
- "NILFS warning: unable to read secondary superblock "
- "(blocksize = %d)\n", blocksize);
+ nilfs_msg(sb, KERN_WARNING,
+ "unable to read secondary superblock (blocksize = %d)",
+ blocksize);
}
/*
@@ -535,14 +543,14 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
}
if (!valid[swp]) {
nilfs_release_super_block(nilfs);
- printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n",
- sb->s_id);
+ nilfs_msg(sb, KERN_ERR, "couldn't find nilfs on the device");
return -EINVAL;
}
if (!valid[!swp])
- printk(KERN_WARNING "NILFS warning: broken superblock. "
- "using spare superblock (blocksize = %d).\n", blocksize);
+ nilfs_msg(sb, KERN_WARNING,
+ "broken superblock, retrying with spare superblock (blocksize = %d)",
+ blocksize);
if (swp)
nilfs_swap_super_block(nilfs);
@@ -576,7 +584,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
if (!blocksize) {
- printk(KERN_ERR "NILFS: unable to set blocksize\n");
+ nilfs_msg(sb, KERN_ERR, "unable to set blocksize");
err = -EINVAL;
goto out;
}
@@ -595,8 +603,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
if (blocksize < NILFS_MIN_BLOCK_SIZE ||
blocksize > NILFS_MAX_BLOCK_SIZE) {
- printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
- "filesystem blocksize %d\n", blocksize);
+ nilfs_msg(sb, KERN_ERR,
+ "couldn't mount because of unsupported filesystem blocksize %d",
+ blocksize);
err = -EINVAL;
goto failed_sbh;
}
@@ -604,10 +613,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
if (blocksize < hw_blocksize) {
- printk(KERN_ERR
- "NILFS: blocksize %d too small for device "
- "(sector-size = %d).\n",
- blocksize, hw_blocksize);
+ nilfs_msg(sb, KERN_ERR,
+ "blocksize %d too small for device (sector-size = %d)",
+ blocksize, hw_blocksize);
err = -EINVAL;
goto failed_sbh;
}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 79369fd6b13b..b305c6f033e7 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -43,6 +43,7 @@ enum {
* struct the_nilfs - struct to supervise multiple nilfs mount points
* @ns_flags: flags
* @ns_flushed_device: flag indicating if all volatile data was flushed
+ * @ns_sb: back pointer to super block instance
* @ns_bdev: block device
* @ns_sem: semaphore for shared states
* @ns_snapshot_mount_mutex: mutex to protect snapshot mounts
@@ -102,6 +103,7 @@ struct the_nilfs {
unsigned long ns_flags;
int ns_flushed_device;
+ struct super_block *ns_sb;
struct block_device *ns_bdev;
struct rw_semaphore ns_sem;
struct mutex ns_snapshot_mount_mutex;
@@ -120,11 +122,8 @@ struct the_nilfs {
unsigned int ns_sb_update_freq;
/*
- * Following fields are dedicated to a writable FS-instance.
- * Except for the period seeking checkpoint, code outside the segment
- * constructor must lock a segment semaphore while accessing these
- * fields.
- * The writable FS-instance is sole during a lifetime of the_nilfs.
+ * The following fields are updated by a writable FS-instance.
+ * These fields are protected by ns_segctor_sem outside load_nilfs().
*/
u64 ns_seg_seq;
__u64 ns_segnum;
@@ -281,7 +280,7 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
}
void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
-struct the_nilfs *alloc_nilfs(struct block_device *bdev);
+struct the_nilfs *alloc_nilfs(struct super_block *sb);
void destroy_nilfs(struct the_nilfs *nilfs);
int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 460c0cedab3a..7dabbc31060e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6106,6 +6106,43 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
}
}
+/*
+ * Try to flush truncate logs if we can free enough clusters from it.
+ * As for return value, "< 0" means error, "0" no space and "1" means
+ * we have freed enough spaces and let the caller try to allocate again.
+ */
+int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+ unsigned int needed)
+{
+ tid_t target;
+ int ret = 0;
+ unsigned int truncated_clusters;
+
+ inode_lock(osb->osb_tl_inode);
+ truncated_clusters = osb->truncated_clusters;
+ inode_unlock(osb->osb_tl_inode);
+
+ /*
+ * Check whether we can succeed in allocating if we free
+ * the truncate log.
+ */
+ if (truncated_clusters < needed)
+ goto out;
+
+ ret = ocfs2_flush_truncate_log(osb);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
+ jbd2_log_wait_commit(osb->journal->j_journal, target);
+ ret = 1;
+ }
+out:
+ return ret;
+}
+
static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
int slot_num,
struct inode **tl_inode,
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index f3dc1b0dfffc..4a5152ec88a3 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -188,6 +188,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
u64 start_blk,
unsigned int num_clusters);
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+ unsigned int needed);
/*
* Process local structure which describes the block unlinks done
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index af2adfcb0f6f..98d36548153d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1645,43 +1645,6 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
return ret;
}
-/*
- * Try to flush truncate logs if we can free enough clusters from it.
- * As for return value, "< 0" means error, "0" no space and "1" means
- * we have freed enough spaces and let the caller try to allocate again.
- */
-static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
- unsigned int needed)
-{
- tid_t target;
- int ret = 0;
- unsigned int truncated_clusters;
-
- inode_lock(osb->osb_tl_inode);
- truncated_clusters = osb->truncated_clusters;
- inode_unlock(osb->osb_tl_inode);
-
- /*
- * Check whether we can succeed in allocating if we free
- * the truncate log.
- */
- if (truncated_clusters < needed)
- goto out;
-
- ret = ocfs2_flush_truncate_log(osb);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
- jbd2_log_wait_commit(osb->journal->j_journal, target);
- ret = 1;
- }
-out:
- return ret;
-}
-
int ocfs2_write_begin_nolock(struct address_space *mapping,
loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 8107d0d0c3f6..e9f3705c4c9f 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1004,6 +1004,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
u8 nodenum, u8 *real_master);
+void __dlm_do_purge_lockres(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 13719d3f35f8..6ea06f8a7d29 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2276,9 +2276,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
dlm->name, namelen, lockname, res->owner, r);
dlm_print_one_lock_resource(res);
- BUG();
- }
- return ret ? ret : r;
+ if (r == -ENOMEM)
+ BUG();
+ } else
+ ret = r;
+
+ return ret;
}
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -2416,48 +2419,26 @@ int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
}
spin_lock(&res->spinlock);
- BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF));
- if (!list_empty(&res->purge)) {
- mlog(0, "%s: Removing res %.*s from purgelist\n",
- dlm->name, res->lockname.len, res->lockname.name);
- list_del_init(&res->purge);
- dlm_lockres_put(res);
- dlm->purge_count--;
- }
-
- if (!__dlm_lockres_unused(res)) {
- mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
- dlm->name, res->lockname.len, res->lockname.name);
- __dlm_print_one_lock_resource(res);
- BUG();
- }
-
- __dlm_unhash_lockres(dlm, res);
-
- spin_lock(&dlm->track_lock);
- if (!list_empty(&res->tracking))
- list_del_init(&res->tracking);
- else {
- mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
- dlm->name, res->lockname.len, res->lockname.name);
- __dlm_print_one_lock_resource(res);
+ if (!(res->state & DLM_LOCK_RES_DROPPING_REF)) {
+ spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
+ mlog(ML_NOTICE, "%s:%.*s: node %u sends deref done "
+ "but it is already derefed!\n", dlm->name,
+ res->lockname.len, res->lockname.name, node);
+ ret = 0;
+ goto done;
}
- spin_unlock(&dlm->track_lock);
- /* lockres is not in the hash now. drop the flag and wake up
- * any processes waiting in dlm_get_lock_resource.
- */
- res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ __dlm_do_purge_lockres(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
- dlm_lockres_put(res);
-
spin_unlock(&dlm->spinlock);
ret = 0;
-
done:
+ if (res)
+ dlm_lockres_put(res);
dlm_put(dlm);
return ret;
}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f6b313898763..dd5cb8bcefd1 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2343,6 +2343,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
struct dlm_lock_resource *res;
int i;
struct hlist_head *bucket;
+ struct hlist_node *tmp;
struct dlm_lock *lock;
@@ -2365,7 +2366,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
*/
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, bucket, hash_node) {
+ hlist_for_each_entry_safe(res, tmp, bucket, hash_node) {
/* always prune any $RECOVERY entries for dead nodes,
* otherwise hangs can occur during later recovery */
if (dlm_is_recovery_lock(res->lockname.name,
@@ -2386,8 +2387,17 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
break;
}
}
- dlm_lockres_clear_refmap_bit(dlm, res,
- dead_node);
+
+ if ((res->owner == dead_node) &&
+ (res->state & DLM_LOCK_RES_DROPPING_REF)) {
+ dlm_lockres_get(res);
+ __dlm_do_purge_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ dlm_lockres_put(res);
+ continue;
+ } else if (res->owner == dlm->node_num)
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
spin_unlock(&res->spinlock);
continue;
}
@@ -2398,14 +2408,17 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
if (res->state & DLM_LOCK_RES_DROPPING_REF) {
mlog(0, "%s:%.*s: owned by "
"dead node %u, this node was "
- "dropping its ref when it died. "
- "continue, dropping the flag.\n",
+ "dropping its ref when master died. "
+ "continue, purging the lockres.\n",
dlm->name, res->lockname.len,
res->lockname.name, dead_node);
+ dlm_lockres_get(res);
+ __dlm_do_purge_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ dlm_lockres_put(res);
+ continue;
}
- res->state &= ~DLM_LOCK_RES_DROPPING_REF;
- dlm_move_lockres_to_recovery_list(dlm,
- res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 68d239ba0c63..838a06d4066a 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -160,6 +160,52 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
spin_unlock(&dlm->spinlock);
}
+/*
+ * Do the real purge work:
+ * unhash the lockres, and
+ * clear flag DLM_LOCK_RES_DROPPING_REF.
+ * It requires dlm and lockres spinlock to be taken.
+ */
+void __dlm_do_purge_lockres(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
+
+ if (!list_empty(&res->purge)) {
+ mlog(0, "%s: Removing res %.*s from purgelist\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ list_del_init(&res->purge);
+ dlm_lockres_put(res);
+ dlm->purge_count--;
+ }
+
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
+
+ __dlm_unhash_lockres(dlm, res);
+
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
+ /*
+ * lockres is not in the hash now. drop the flag and wake up
+ * any processes waiting in dlm_get_lock_resource.
+ */
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+}
+
static void dlm_purge_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
@@ -175,6 +221,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
res->lockname.len, res->lockname.name, master);
if (!master) {
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(ML_NOTICE, "%s: res %.*s already in DLM_LOCK_RES_DROPPING_REF state\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ spin_unlock(&res->spinlock);
+ return;
+ }
+
res->state |= DLM_LOCK_RES_DROPPING_REF;
/* drop spinlock... retake below */
spin_unlock(&res->spinlock);
@@ -203,8 +256,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
dlm->purge_count--;
}
- if (!master && ret != 0) {
- mlog(0, "%s: deref %.*s in progress or master goes down\n",
+ if (!master && ret == DLM_DEREF_RESPONSE_INPROG) {
+ mlog(0, "%s: deref %.*s in progress\n",
dlm->name, res->lockname.len, res->lockname.name);
spin_unlock(&res->spinlock);
return;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index ced70c8139f7..c9e828ec3c8e 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1007,10 +1007,17 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
lc->oc_type = NO_CONTROLD;
rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
- DLM_LSFL_FS, DLM_LVB_LEN,
+ DLM_LSFL_FS | DLM_LSFL_NEWEXCL, DLM_LVB_LEN,
&ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
- if (rc)
+ if (rc) {
+ if (rc == -EEXIST || rc == -EPROTO)
+ printk(KERN_ERR "ocfs2: Unable to create the "
+ "lockspace %s (%d), because a ocfs2-tools "
+ "program is running on this file system "
+ "with the same name lockspace\n",
+ conn->cc_name, rc);
goto out;
+ }
if (ops_rv == -EOPNOTSUPP) {
lc->oc_type = WITH_CONTROLD;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 2f19aeec5482..ea47120a85ff 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1164,7 +1164,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
int flags,
struct ocfs2_alloc_context **ac)
{
- int status;
+ int status, ret = 0;
+ int retried = 0;
*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
if (!(*ac)) {
@@ -1189,7 +1190,24 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
}
if (status == -ENOSPC) {
+retry:
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+ /* Retry if there is sufficient space cached in truncate log */
+ if (status == -ENOSPC && !retried) {
+ retried = 1;
+ ocfs2_inode_unlock((*ac)->ac_inode, 1);
+ inode_unlock((*ac)->ac_inode);
+
+ ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
+ if (ret == 1)
+ goto retry;
+
+ if (ret < 0)
+ mlog_errno(ret);
+
+ inode_lock((*ac)->ac_inode);
+ ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
+ }
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
index 5dfc4f3cfe68..00235bf644dc 100644
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -73,6 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
}
}
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
ret = 1;
out_release_op:
op_release(new_op);
@@ -94,6 +95,9 @@ static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
int ret;
+ if (time_before(jiffies, dentry->d_time))
+ return 1;
+
if (flags & LOOKUP_RCU)
return -ECHILD;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 2e63e6d0a68e..28a0557a69be 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -262,7 +262,7 @@ int orangefs_getattr(struct vfsmount *mnt,
"orangefs_getattr: called on %s\n",
dentry->d_name.name);
- ret = orangefs_inode_getattr(inode, 0, 1);
+ ret = orangefs_inode_getattr(inode, 0, 0);
if (ret == 0) {
generic_fillattr(inode, kstat);
@@ -384,7 +384,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref
if (!inode || !(inode->i_state & I_NEW))
return inode;
- error = orangefs_inode_getattr(inode, 1, 0);
+ error = orangefs_inode_getattr(inode, 1, 1);
if (error) {
iget_failed(inode);
return ERR_PTR(error);
@@ -429,7 +429,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
orangefs_set_inode(inode, ref);
inode->i_ino = hash; /* needed for stat etc */
- error = orangefs_inode_getattr(inode, 1, 0);
+ error = orangefs_inode_getattr(inode, 1, 1);
if (error)
goto out_iput;
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 7e8dfa97c44a..62c525936ee8 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -72,6 +72,8 @@ static int orangefs_create(struct inode *dir,
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
"%s: dentry instantiated for %s\n",
@@ -181,6 +183,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+
inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
if (IS_ERR(inode)) {
gossip_debug(GOSSIP_NAME_DEBUG,
@@ -189,6 +193,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
+ ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+
gossip_debug(GOSSIP_NAME_DEBUG,
"%s:%s:%d "
"Found good inode [%lu] with count [%d]\n",
@@ -316,6 +322,8 @@ static int orangefs_symlink(struct inode *dir,
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Symlink) %pU -> %s\n",
@@ -378,6 +386,8 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Directory) %pU -> %s\n",
@@ -408,6 +418,8 @@ static int orangefs_rename(struct inode *old_dir,
"orangefs_rename: called (%pd2 => %pd2) ct=%d\n",
old_dentry, new_dentry, d_count(new_dentry));
+ ORANGEFS_I(new_dentry->d_parent->d_inode)->getattr_time = jiffies - 1;
+
new_op = op_alloc(ORANGEFS_VFS_OP_RENAME);
if (!new_op)
return -EINVAL;
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 4b6e132d5a0f..633c07a6e3d8 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -246,6 +246,8 @@ struct orangefs_inode_s {
* with this object
*/
unsigned long pinode_flags;
+
+ unsigned long getattr_time;
};
#define P_ATIME_FLAG 0
@@ -527,7 +529,7 @@ int orangefs_inode_setxattr(struct inode *inode,
size_t size,
int flags);
-int orangefs_inode_getattr(struct inode *inode, int new, int size);
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass);
int orangefs_inode_check_changed(struct inode *inode);
@@ -546,6 +548,8 @@ extern struct mutex request_mutex;
extern int debug;
extern int op_timeout_secs;
extern int slot_timeout_secs;
+extern int dcache_timeout_msecs;
+extern int getattr_timeout_msecs;
extern struct list_head orangefs_superblocks;
extern spinlock_t orangefs_superblocks_lock;
extern struct list_head orangefs_request_list;
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index 6f072a8c0de1..e9fd5755c05f 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -47,6 +47,8 @@ struct client_debug_mask client_debug_mask = { NULL, 0, 0 };
unsigned int kernel_mask_set_mod_init; /* implicitly false */
int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
+int dcache_timeout_msecs = 50;
+int getattr_timeout_msecs = 50;
MODULE_LICENSE("GPL");
MODULE_AUTHOR("ORANGEFS Development Team");
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index 5c03113e3ad2..375708c2db87 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -61,10 +61,21 @@
* Slots are requested and waited for,
* the wait times out after slot_timeout_secs.
*
+ * What: /sys/fs/orangefs/dcache_timeout_msecs
+ * Date: Jul 2016
+ * Contact: Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ * Time lookup is valid in milliseconds.
+ *
+ * What: /sys/fs/orangefs/getattr_timeout_msecs
+ * Date: Jul 2016
+ * Contact: Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ * Time getattr is valid in milliseconds.
*
* What: /sys/fs/orangefs/acache/...
* Date: Jun 2015
- * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Contact: Martin Brandenburg <martin@omnibond.com>
* Description:
* Attribute cache configurable settings.
*
@@ -117,6 +128,8 @@ struct orangefs_obj {
int perf_history_size;
int perf_time_interval_secs;
int slot_timeout_secs;
+ int dcache_timeout_msecs;
+ int getattr_timeout_msecs;
};
struct acache_orangefs_obj {
@@ -658,6 +671,20 @@ static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr)
"%d\n",
slot_timeout_secs);
goto out;
+ } else if (!strcmp(orangefs_attr->attr.name,
+ "dcache_timeout_msecs")) {
+ rc = scnprintf(buf,
+ PAGE_SIZE,
+ "%d\n",
+ dcache_timeout_msecs);
+ goto out;
+ } else if (!strcmp(orangefs_attr->attr.name,
+ "getattr_timeout_msecs")) {
+ rc = scnprintf(buf,
+ PAGE_SIZE,
+ "%d\n",
+ getattr_timeout_msecs);
+ goto out;
} else {
goto out;
}
@@ -734,6 +761,12 @@ static ssize_t int_store(struct orangefs_obj *orangefs_obj,
} else if (!strcmp(attr->attr.name, "slot_timeout_secs")) {
rc = kstrtoint(buf, 0, &slot_timeout_secs);
goto out;
+ } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) {
+ rc = kstrtoint(buf, 0, &dcache_timeout_msecs);
+ goto out;
+ } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) {
+ rc = kstrtoint(buf, 0, &getattr_timeout_msecs);
+ goto out;
} else {
goto out;
}
@@ -1361,6 +1394,12 @@ static struct orangefs_attribute op_timeout_secs_attribute =
static struct orangefs_attribute slot_timeout_secs_attribute =
__ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store);
+static struct orangefs_attribute dcache_timeout_msecs_attribute =
+ __ATTR(dcache_timeout_msecs, 0664, int_orangefs_show, int_store);
+
+static struct orangefs_attribute getattr_timeout_msecs_attribute =
+ __ATTR(getattr_timeout_msecs, 0664, int_orangefs_show, int_store);
+
static struct orangefs_attribute perf_counter_reset_attribute =
__ATTR(perf_counter_reset,
0664,
@@ -1382,6 +1421,8 @@ static struct orangefs_attribute perf_time_interval_secs_attribute =
static struct attribute *orangefs_default_attrs[] = {
&op_timeout_secs_attribute.attr,
&slot_timeout_secs_attribute.attr,
+ &dcache_timeout_msecs_attribute.attr,
+ &getattr_timeout_msecs_attribute.attr,
&perf_counter_reset_attribute.attr,
&perf_history_size_attribute.attr,
&perf_time_interval_secs_attribute.attr,
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index c5fbc62357c6..d13c7291fd05 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -251,7 +251,7 @@ static int orangefs_inode_is_stale(struct inode *inode, int new,
return 0;
}
-int orangefs_inode_getattr(struct inode *inode, int new, int size)
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op;
@@ -261,12 +261,16 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
get_khandle_from_ino(inode));
+ if (!new && !bypass) {
+ if (time_before(jiffies, orangefs_inode->getattr_time))
+ return 0;
+ }
+
new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
if (!new_op)
return -ENOMEM;
new_op->upcall.req.getattr.refn = orangefs_inode->refn;
- new_op->upcall.req.getattr.mask = size ?
- ORANGEFS_ATTR_SYS_ALL_NOHINT : ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE;
+ new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
ret = service_operation(new_op, __func__,
get_interruptible_flag(inode));
@@ -287,20 +291,18 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
case S_IFREG:
inode->i_flags = orangefs_inode_flags(&new_op->
downcall.resp.getattr.attributes);
- if (size) {
- inode_size = (loff_t)new_op->
- downcall.resp.getattr.attributes.size;
- rounded_up_size =
- (inode_size + (4096 - (inode_size % 4096)));
- inode->i_size = inode_size;
- orangefs_inode->blksize =
- new_op->downcall.resp.getattr.attributes.blksize;
- spin_lock(&inode->i_lock);
- inode->i_bytes = inode_size;
- inode->i_blocks =
- (unsigned long)(rounded_up_size / 512);
- spin_unlock(&inode->i_lock);
- }
+ inode_size = (loff_t)new_op->
+ downcall.resp.getattr.attributes.size;
+ rounded_up_size =
+ (inode_size + (4096 - (inode_size % 4096)));
+ inode->i_size = inode_size;
+ orangefs_inode->blksize =
+ new_op->downcall.resp.getattr.attributes.blksize;
+ spin_lock(&inode->i_lock);
+ inode->i_bytes = inode_size;
+ inode->i_blocks =
+ (unsigned long)(rounded_up_size / 512);
+ spin_unlock(&inode->i_lock);
break;
case S_IFDIR:
inode->i_size = PAGE_SIZE;
@@ -345,6 +347,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes);
+ orangefs_inode->getattr_time = jiffies + getattr_timeout_msecs*HZ/1000;
ret = 0;
out:
op_release(new_op);
@@ -418,6 +421,7 @@ int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr)
ClearMtimeFlag(orangefs_inode);
ClearCtimeFlag(orangefs_inode);
ClearModeFlag(orangefs_inode);
+ orangefs_inode->getattr_time = jiffies - 1;
}
return ret;
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
index 1efc6f8a5224..3d7418c728f5 100644
--- a/fs/orangefs/protocol.h
+++ b/fs/orangefs/protocol.h
@@ -207,14 +207,6 @@ typedef __s64 ORANGEFS_offset;
ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
ORANGEFS_ATTR_SYS_BLKSIZE)
-#define ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE \
- (ORANGEFS_ATTR_SYS_COMMON_ALL | \
- ORANGEFS_ATTR_SYS_LNK_TARGET | \
- ORANGEFS_ATTR_SYS_DFILE_COUNT | \
- ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \
- ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
- ORANGEFS_ATTR_SYS_BLKSIZE)
-
#define ORANGEFS_XATTR_REPLACE 0x2
#define ORANGEFS_XATTR_CREATE 0x1
#define ORANGEFS_MAX_SERVER_ADDR_LEN 256
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 7151ea428041..a8c13605b434 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -4,6 +4,7 @@
obj-y += proc.o
+CFLAGS_task_mmu.o += -Wno-override-init
proc-y := nommu.o task_nommu.o
proc-$(CONFIG_MMU) := task_mmu.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 31370da2ee7c..54e270262979 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -579,11 +579,8 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
unsigned long totalpages = totalram_pages + total_swap_pages;
unsigned long points = 0;
- read_lock(&tasklist_lock);
- if (pid_alive(task))
- points = oom_badness(task, NULL, NULL, totalpages) *
- 1000 / totalpages;
- read_unlock(&tasklist_lock);
+ points = oom_badness(task, NULL, NULL, totalpages) *
+ 1000 / totalpages;
seq_printf(m, "%lu\n", points);
return 0;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 510413eb25b8..7907e456ac4f 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -80,19 +80,17 @@ static u64 get_iowait_time(int cpu)
static int show_stat(struct seq_file *p, void *v)
{
int i, j;
- unsigned long jif;
u64 user, nice, system, idle, iowait, irq, softirq, steal;
u64 guest, guest_nice;
u64 sum = 0;
u64 sum_softirq = 0;
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
- struct timespec boottime;
+ struct timespec64 boottime;
user = nice = system = idle = iowait =
irq = softirq = steal = 0;
guest = guest_nice = 0;
- getboottime(&boottime);
- jif = boottime.tv_sec;
+ getboottime64(&boottime);
for_each_possible_cpu(i) {
user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
@@ -163,12 +161,12 @@ static int show_stat(struct seq_file *p, void *v)
seq_printf(p,
"\nctxt %llu\n"
- "btime %lu\n"
+ "btime %llu\n"
"processes %lu\n"
"procs_running %lu\n"
"procs_blocked %lu\n",
nr_context_switches(),
- (unsigned long)jif,
+ (unsigned long long)boottime.tv_sec,
total_forks,
nr_running(),
nr_iowait());
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index b751eea32e20..5db6f45b3fed 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -1153,8 +1153,9 @@ int balance_internal(struct tree_balance *tb,
insert_ptr);
}
- memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE);
insert_ptr[0] = new_insert_ptr;
+ if (new_insert_ptr)
+ memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE);
return order;
}