From e6aabe0cac14a495d42f629a803c5e221089bae8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 15 Oct 2009 14:54:03 +0200 Subject: ocfs2: Always include ACL support To become consistent with filesystems such as XFS or BTRFS, make posix ACLs always available. This also reduces possibility of misconfiguration on admin's side. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/Kconfig | 10 +--------- fs/ocfs2/Makefile | 7 ++----- fs/ocfs2/acl.h | 22 ---------------------- fs/ocfs2/super.c | 9 --------- fs/ocfs2/xattr.c | 4 ---- fs/ocfs2/xattr.h | 2 -- 6 files changed, 3 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 701b7a3a872e..0d840669698e 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -6,6 +6,7 @@ config OCFS2_FS select CRC32 select QUOTA select QUOTA_TREE + select FS_POSIX_ACL help OCFS2 is a general purpose extent based shared disk cluster file system with many similarities to ext3. It supports 64 bit inode @@ -74,12 +75,3 @@ config OCFS2_DEBUG_FS This option will enable expensive consistency checks. Enable this option for debugging only as it is likely to decrease performance of the filesystem. - -config OCFS2_FS_POSIX_ACL - bool "OCFS2 POSIX Access Control Lists" - depends on OCFS2_FS - select FS_POSIX_ACL - default n - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 31f25ce32c97..600d2d2ade11 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -39,11 +39,8 @@ ocfs2-objs := \ ver.o \ quota_local.o \ quota_global.o \ - xattr.o - -ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y) -ocfs2-objs += acl.o -endif + xattr.o \ + acl.o ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 8f6389ed4da5..5c5d31f05853 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -26,8 +26,6 @@ struct ocfs2_acl_entry { __le32 e_id; }; -#ifdef CONFIG_OCFS2_FS_POSIX_ACL - extern int ocfs2_check_acl(struct inode *, int); extern int ocfs2_acl_chmod(struct inode *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, @@ -35,24 +33,4 @@ extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct ocfs2_alloc_context *, struct ocfs2_alloc_context *); -#else /* CONFIG_OCFS2_FS_POSIX_ACL*/ - -#define ocfs2_check_acl NULL -static inline int ocfs2_acl_chmod(struct inode *inode) -{ - return 0; -} -static inline int ocfs2_init_acl(handle_t *handle, - struct inode *inode, - struct inode *dir, - struct buffer_head *di_bh, - struct buffer_head *dir_bh, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_alloc_context *data_ac) -{ - return 0; -} - -#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/ - #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 960673004df1..da7d33a57cf6 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1413,19 +1413,12 @@ static int ocfs2_parse_options(struct super_block *sb, } mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; break; -#ifdef CONFIG_OCFS2_FS_POSIX_ACL case Opt_acl: mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; break; case Opt_noacl: mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; break; -#else - case Opt_acl: - case Opt_noacl: - printk(KERN_INFO "ocfs2 (no)acl options not supported\n"); - break; -#endif default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1502,12 +1495,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (opts & OCFS2_MOUNT_INODE64) seq_printf(s, ",inode64"); -#ifdef CONFIG_OCFS2_FS_POSIX_ACL if (opts & OCFS2_MOUNT_POSIX_ACL) seq_printf(s, ",acl"); else seq_printf(s, ",noacl"); -#endif return 0; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index fe3419068df2..923e950e5445 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -98,10 +98,8 @@ static struct ocfs2_xattr_def_value_root def_xv = { struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, -#ifdef CONFIG_OCFS2_FS_POSIX_ACL &ocfs2_xattr_acl_access_handler, &ocfs2_xattr_acl_default_handler, -#endif &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL @@ -109,12 +107,10 @@ struct xattr_handler *ocfs2_xattr_handlers[] = { static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, -#ifdef CONFIG_OCFS2_FS_POSIX_ACL [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ocfs2_xattr_acl_access_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ocfs2_xattr_acl_default_handler, -#endif [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 08e36389f56d..abd72a47f520 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -40,10 +40,8 @@ struct ocfs2_security_xattr_info { extern struct xattr_handler ocfs2_xattr_user_handler; extern struct xattr_handler ocfs2_xattr_trusted_handler; extern struct xattr_handler ocfs2_xattr_security_handler; -#ifdef CONFIG_OCFS2_FS_POSIX_ACL extern struct xattr_handler ocfs2_xattr_acl_access_handler; extern struct xattr_handler ocfs2_xattr_acl_default_handler; -#endif extern struct xattr_handler *ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); -- cgit v1.2.3 From 5297aad80cd3a3e62116e0a0b29116ccd8ae397d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 15 Oct 2009 14:54:04 +0200 Subject: ocfs2: Make acl use the default Change acl mount options handling to match the one of XFS and BTRFS and hopefully it is also easier to use now. When admin does not specify any acl mount option, acls are enabled if and only if the filesystem has xattr feature enabled. If admin specifies 'acl' mount option, we fail the mount if the filesystem does not have xattr feature and thus acls cannot be enabled. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/ocfs2.h | 8 +++--- fs/ocfs2/super.c | 82 ++++++++++++++++++++++++++++++-------------------------- 2 files changed, 49 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index eae404602424..35ad46cf89b3 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -250,9 +250,11 @@ enum ocfs2_mount_options OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ - OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */ - OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */ - OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ + OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */ + OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access + control lists */ + OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ + OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index da7d33a57cf6..a5116b9b7e70 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -100,6 +100,8 @@ struct mount_options static int ocfs2_parse_options(struct super_block *sb, char *options, struct mount_options *mopt, int is_remount); +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options); static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt); static void ocfs2_put_super(struct super_block *sb); static int ocfs2_mount_volume(struct super_block *sb); @@ -600,7 +602,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) lock_kernel(); - if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { + if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || + !ocfs2_check_set_options(sb, &parsed_options)) { ret = -EINVAL; goto out; } @@ -691,8 +694,6 @@ unlock_osb: if (!ret) { /* Only save off the new mount options in case of a successful * remount. */ - if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) - parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; @@ -1011,31 +1012,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); bh = NULL; - if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) - parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; - + if (!ocfs2_check_set_options(sb, &parsed_options)) { + status = -EINVAL; + goto read_super_error; + } osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; osb->osb_commit_interval = parsed_options.commit_interval; osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); osb->local_alloc_bits = osb->local_alloc_default_bits; - if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA && - !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { - status = -EINVAL; - mlog(ML_ERROR, "User quotas were requested, but this " - "filesystem does not have the feature enabled.\n"); - goto read_super_error; - } - if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA && - !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { - status = -EINVAL; - mlog(ML_ERROR, "Group quotas were requested, but this " - "filesystem does not have the feature enabled.\n"); - goto read_super_error; - } status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -1245,6 +1231,40 @@ static struct file_system_type ocfs2_fs_type = { .next = NULL }; +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options) +{ + if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + mlog(ML_ERROR, "User quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + mlog(ML_ERROR, "Group quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL && + !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) { + mlog(ML_ERROR, "ACL support requested but extended attributes " + "feature is not enabled\n"); + return 0; + } + /* No ACL setting specified? Use XATTR feature... */ + if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL | + OCFS2_MOUNT_NO_POSIX_ACL))) { + if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) + options->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + else + options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; + } + return 1; +} + static int ocfs2_parse_options(struct super_block *sb, char *options, struct mount_options *mopt, @@ -1392,31 +1412,17 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_INODE64; break; case Opt_usrquota: - /* We check only on remount, otherwise features - * aren't yet initialized. */ - if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { - mlog(ML_ERROR, "User quota requested but " - "filesystem feature is not set\n"); - status = 0; - goto bail; - } mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; break; case Opt_grpquota: - if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { - mlog(ML_ERROR, "Group quota requested but " - "filesystem feature is not set\n"); - status = 0; - goto bail; - } mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; break; case Opt_acl: mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; break; case Opt_noacl: + mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; break; default: -- cgit v1.2.3 From 57b09bb5e492c37c1e4273fe4e435ffd1d2ddbe0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 15 Oct 2009 14:54:05 +0200 Subject: ocfs2: Set MS_POSIXACL on remount We have to set MS_POSIXACL on remount as well. Otherwise VFS would not know we started supporting ACLs after remount and thus ACLs would not work. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a5116b9b7e70..45d654cb77ef 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -702,6 +702,10 @@ unlock_osb: if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? + MS_POSIXACL : 0); } out: unlock_kernel(); -- cgit v1.2.3 From 38a04e432768ec0b016f3c687b4de31ac111ae59 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 30 Nov 2009 14:32:19 +0800 Subject: ocfs2: Find proper end cpos for a leaf refcount block. ocfs2 refcount tree is stored as an extent tree while the leaf ocfs2_refcount_rec points to a refcount block. The following step can trip a kernel panic. mkfs.ocfs2 -b 512 -C 1M --fs-features=refcount $DEVICE mount -t ocfs2 $DEVICE $MNT_DIR FILE_NAME=$RANDOM FILE_NAME_1=$RANDOM FILE_REF="${FILE_NAME}_ref" FILE_REF_1="${FILE_NAME}_ref_1" for((i=0;i<305;i++)) do # /mnt/1048576 is a file with 1048576 sizes. cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done for((i=0;i<3;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME done for((i=0;i<2;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME for((i=0;i<11;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF # write_f is a program which will write some bytes to a file at offset. # write_f -f file_name -l offset -w write_bytes. ./write_f -f $MNT_DIR/$FILE_REF -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[306*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[311*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF_1 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 #kernel panic here. The reason is that if the ocfs2_extent_rec is the last record in a leaf extent block, the old solution fails to find the suitable end cpos. So this patch try to walk through the b-tree, find the next sub root and get the c_pos the next sub-tree starts from. btw, I have runned tristan's test case against the patched kernel for several days and this type of kernel panic never happens again. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/alloc.c | 10 ++--- fs/ocfs2/alloc.h | 5 +++ fs/ocfs2/refcounttree.c | 117 ++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 119 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 38a42f5d59ff..5661db139ca0 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1765,9 +1765,9 @@ set_and_inc: * * The array index of the subtree root is passed back. */ -static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, - struct ocfs2_path *left, - struct ocfs2_path *right) +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right) { int i = 0; @@ -2872,8 +2872,8 @@ out: * This looks similar, but is subtly different to * ocfs2_find_cpos_for_left_leaf(). */ -static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, - struct ocfs2_path *path, u32 *cpos) +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) { int i, j, ret = 0; u64 blkno; diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 9c122d574464..1db4359ccb90 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -317,4 +317,9 @@ int ocfs2_path_bh_journal_access(handle_t *handle, int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, handle_t *handle, struct ocfs2_path *path); +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos); +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right); #endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 3a0df7a1b810..9c3956f24e58 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -968,6 +968,103 @@ out: return 0; } +/* + * Find the end range for a leaf refcount block indicated by + * el->l_recs[index].e_blkno. + */ +static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct ocfs2_extent_block *eb, + struct ocfs2_extent_list *el, + int index, u32 *cpos_end) +{ + int ret, i, subtree_root; + u32 cpos; + u64 blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_path *left_path = NULL, *right_path = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_extent_list *tmp_el; + + if (index < le16_to_cpu(el->l_next_free_rec) - 1) { + /* + * We have a extent rec after index, so just use the e_cpos + * of the next extent rec. + */ + *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos); + return 0; + } + + if (!eb || (eb && !eb->h_next_leaf_blk)) { + /* + * We are the last extent rec, so any high cpos should + * be stored in this leaf refcount block. + */ + *cpos_end = UINT_MAX; + return 0; + } + + /* + * If the extent block isn't the last one, we have to find + * the subtree root between this extent block and the next + * leaf extent block and get the corresponding e_cpos from + * the subroot. Otherwise we may corrupt the b-tree. + */ + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + + left_path = ocfs2_new_path_from_et(&et); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos); + ret = ocfs2_find_path(ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + right_path = ocfs2_new_path_from_path(left_path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(ci, right_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_root = ocfs2_find_subtree_root(&et, left_path, + right_path); + + tmp_el = left_path->p_node[subtree_root].el; + blkno = left_path->p_node[subtree_root+1].bh->b_blocknr; + for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) { + if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) { + *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos); + break; + } + } + + BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec)); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + return ret; +} + /* * Given a cpos and len, try to find the refcount record which contains cpos. * 1. If cpos can be found in one refcount record, return the record. @@ -983,10 +1080,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, struct buffer_head **ret_bh) { int ret = 0, i, found; - u32 low_cpos; + u32 low_cpos, uninitialized_var(cpos_end); struct ocfs2_extent_list *el; - struct ocfs2_extent_rec *tmp, *rec = NULL; - struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_extent_block *eb = NULL; struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct ocfs2_refcount_block *rb = @@ -1034,12 +1131,16 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, } } - /* adjust len when we have ocfs2_extent_rec after it. */ - if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) { - tmp = &el->l_recs[i+1]; + if (found) { + ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh, + eb, el, i, &cpos_end); + if (ret) { + mlog_errno(ret); + goto out; + } - if (le32_to_cpu(tmp->e_cpos) < cpos + len) - len = le32_to_cpu(tmp->e_cpos) - cpos; + if (cpos_end < low_cpos + len) + len = cpos_end - low_cpos; } ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), -- cgit v1.2.3 From 12d4cec988a8a20927a9795dbf2c6d9ef2f85baa Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 30 Nov 2009 15:08:40 +0800 Subject: ocfs2: refcounttree.c cleanup. sparse check finds some endian problem and some other minor issues. There is an obsolete function which should be removed. So this patch resolve all these. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/refcounttree.c | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9c3956f24e58..c7ffba86e9c6 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -276,7 +276,7 @@ static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } -void ocfs2_kref_remove_refcount_tree(struct kref *kref) +static void ocfs2_kref_remove_refcount_tree(struct kref *kref) { struct ocfs2_refcount_tree *tree = container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); @@ -524,23 +524,6 @@ out: return ret; } -int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw, - struct ocfs2_refcount_tree **ret_tree, - struct buffer_head **ref_bh) -{ - int ret; - u64 ref_blkno; - - ret = ocfs2_get_refcount_block(inode, &ref_blkno); - if (ret) { - mlog_errno(ret); - return ret; - } - - return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, - rw, ret_tree, ref_bh); -} - void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, struct ocfs2_refcount_tree *tree, int rw) { @@ -1519,7 +1502,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, /* change old and new rl_used accordingly. */ le16_add_cpu(&rl->rl_used, -num_moved); - new_rl->rl_used = cpu_to_le32(num_moved); + new_rl->rl_used = cpu_to_le16(num_moved); sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), sizeof(struct ocfs2_refcount_rec), @@ -1898,7 +1881,8 @@ static int ocfs2_split_refcount_rec(handle_t *handle, recs_need++; /* If the leaf block don't have enough record, expand it. */ - if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) { + if (le16_to_cpu(rf_list->rl_used) + recs_need > + le16_to_cpu(rf_list->rl_count)) { struct ocfs2_refcount_rec tmp_rec; u64 cpos = le64_to_cpu(orig_rec->r_cpos); len = le32_to_cpu(orig_rec->r_clusters); @@ -1960,7 +1944,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle, memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); le64_add_cpu(&tail_rec->r_cpos, le32_to_cpu(tail_rec->r_clusters) - len); - tail_rec->r_clusters = le32_to_cpu(len); + tail_rec->r_clusters = cpu_to_le32(len); } /* @@ -3941,8 +3925,7 @@ static int ocfs2_add_refcounted_extent(struct inode *inode, } ret = ocfs2_insert_extent(handle, et, cpos, - cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, - p_cluster)), + ocfs2_clusters_to_blocks(inode->i_sb, p_cluster), num_clusters, ext_flags, meta_ac); if (ret) { mlog_errno(ret); @@ -4354,8 +4337,8 @@ static int ocfs2_user_path_parent(const char __user *path, * @new_dentry: target dentry * @preserve: if true, preserve all file attributes */ -int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry, bool preserve) +static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) { struct inode *inode = old_dentry->d_inode; int error; -- cgit v1.2.3 From f6656d26d17b2598f43cd41be088853fa2a03397 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 17 Nov 2009 16:29:19 -0800 Subject: ocfs2/cluster: Make fence method configurable - v2 By default, o2cb fences the box by calling emergency_restart(). While this scheme works well in production, it comes in the way during testing as it does not let the tester take stack/core dumps for analysis. This patch allows user to dynamically change the fence method to panic() by: # echo "panic" > /sys/kernel/config/cluster//fence_method Signed-off-by: Sunil Mushran Signed-off-by: Joel Becker --- fs/ocfs2/cluster/nodemanager.c | 51 ++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/cluster/nodemanager.h | 7 ++++++ fs/ocfs2/cluster/quorum.c | 16 +++++++++++-- 3 files changed, 72 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 7ee6188bc79a..c81142e3ef84 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -35,6 +35,10 @@ * cluster references throughout where nodes are looked up */ struct o2nm_cluster *o2nm_single_cluster = NULL; +char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { + "reset", /* O2NM_FENCE_RESET */ + "panic", /* O2NM_FENCE_PANIC */ +}; struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { @@ -579,6 +583,43 @@ static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( return o2nm_cluster_attr_write(page, count, &cluster->cl_reconnect_delay_ms); } + +static ssize_t o2nm_cluster_attr_fence_method_read( + struct o2nm_cluster *cluster, char *page) +{ + ssize_t ret = 0; + + if (cluster) + ret = sprintf(page, "%s\n", + o2nm_fence_method_desc[cluster->cl_fence_method]); + return ret; +} + +static ssize_t o2nm_cluster_attr_fence_method_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + unsigned int i; + + if (page[count - 1] != '\n') + goto bail; + + for (i = 0; i < O2NM_FENCE_METHODS; ++i) { + if (count != strlen(o2nm_fence_method_desc[i]) + 1) + continue; + if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1)) + continue; + if (cluster->cl_fence_method != i) { + printk(KERN_INFO "ocfs2: Changing fence method to %s\n", + o2nm_fence_method_desc[i]); + cluster->cl_fence_method = i; + } + return count; + } + +bail: + return -EINVAL; +} + static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { .attr = { .ca_owner = THIS_MODULE, .ca_name = "idle_timeout_ms", @@ -603,10 +644,19 @@ static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { .store = o2nm_cluster_attr_reconnect_delay_ms_write, }; +static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "fence_method", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_fence_method_read, + .store = o2nm_cluster_attr_fence_method_write, +}; + static struct configfs_attribute *o2nm_cluster_attrs[] = { &o2nm_cluster_attr_idle_timeout_ms.attr, &o2nm_cluster_attr_keepalive_delay_ms.attr, &o2nm_cluster_attr_reconnect_delay_ms.attr, + &o2nm_cluster_attr_fence_method.attr, NULL, }; static ssize_t o2nm_cluster_show(struct config_item *item, @@ -778,6 +828,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; + cluster->cl_fence_method = O2NM_FENCE_RESET; ret = &cluster->cl_group; o2nm_single_cluster = cluster; diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index c992ea0da4ad..09ea2d388bbb 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -33,6 +33,12 @@ #include #include +enum o2nm_fence_method { + O2NM_FENCE_RESET = 0, + O2NM_FENCE_PANIC, + O2NM_FENCE_METHODS, /* Number of fence methods */ +}; + struct o2nm_node { spinlock_t nd_lock; struct config_item nd_item; @@ -58,6 +64,7 @@ struct o2nm_cluster { unsigned int cl_idle_timeout_ms; unsigned int cl_keepalive_delay_ms; unsigned int cl_reconnect_delay_ms; + enum o2nm_fence_method cl_fence_method; /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */ unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index bbacf7da48a4..639024033fce 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -74,8 +74,20 @@ static void o2quo_fence_self(void) * threads can still schedule, etc, etc */ o2hb_stop_all_regions(); - printk("ocfs2 is very sorry to be fencing this system by restarting\n"); - emergency_restart(); + switch (o2nm_single_cluster->cl_fence_method) { + case O2NM_FENCE_PANIC: + panic("*** ocfs2 is very sorry to be fencing this system by " + "panicing ***\n"); + break; + default: + WARN_ON(o2nm_single_cluster->cl_fence_method >= + O2NM_FENCE_METHODS); + case O2NM_FENCE_RESET: + printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this " + "system by restarting ***\n"); + emergency_restart(); + break; + }; } /* Indicate that a timeout occured on a hearbeat region write. The -- cgit v1.2.3 From aad1b15310b9bcd59fa81ab8f2b1513b59553ea8 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Thu, 19 Nov 2009 10:17:46 +0800 Subject: ocfs2: return -EAGAIN instead of EAGAIN in dlm We used to return positive EAGAIN to indicate a retry action is needed in dlm_begin_reco_handler(). Now we return negative -EAGAIN to erase the confusion caused by this error code. Signed-off-by: Tiger Yang Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmrecovery.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index d9fa3d22e17c..2f9e4e19a4f2 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2589,6 +2589,14 @@ retry: "begin reco msg (%d)\n", dlm->name, nodenum, ret); ret = 0; } + if (ret == -EAGAIN) { + mlog(0, "%s: trying to start recovery of node " + "%u, but node %u is waiting for last recovery " + "to complete, backoff for a bit\n", dlm->name, + dead_node, nodenum); + msleep(100); + goto retry; + } if (ret < 0) { struct dlm_lock_resource *res; /* this is now a serious problem, possibly ENOMEM @@ -2608,14 +2616,6 @@ retry: * another ENOMEM */ msleep(100); goto retry; - } else if (ret == EAGAIN) { - mlog(0, "%s: trying to start recovery of node " - "%u, but node %u is waiting for last recovery " - "to complete, backoff for a bit\n", dlm->name, - dead_node, nodenum); - /* TODO Look into replacing msleep with cond_resched() */ - msleep(100); - goto retry; } } @@ -2639,7 +2639,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, dlm->name, br->node_idx, br->dead_node, dlm->reco.dead_node, dlm->reco.new_master); spin_unlock(&dlm->spinlock); - return EAGAIN; + return -EAGAIN; } spin_unlock(&dlm->spinlock); -- cgit v1.2.3 From f5befbbbf9b7af033c94a1df6acaf79e77a3ceb6 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Tue, 15 Dec 2009 14:08:28 +0800 Subject: ocfs2-devel: remove redundant OCFS2_MOUNT_POSIX_ACL check in ocfs2_get_acl_nolock() osb->s_mount_opt has already been checked against OCFS2_MOUNT_POSIX_ACL_CHECK before calling ocfs2_get_acl_nolock() in ocfs2_init_acl() && ocfs2_get_acl(), so remove it. Signed-off-by: Jeff Liu Signed-off-by: Joel Becker --- fs/ocfs2/acl.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index fbeaec762103..200b88109b29 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -98,15 +98,11 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, int type, struct buffer_head *di_bh) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int name_index; char *value = NULL; struct posix_acl *acl; int retval; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return NULL; - switch (type) { case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; -- cgit v1.2.3 From 3a05d7961e6b5fb77660849b56a22feca5d1e0c5 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 4 Dec 2009 02:02:35 +0800 Subject: ocfs2: explicit declare uninitialized var in user_cluster_connect() This patch explicitly declares an uninitialized local variable in user_cluster_connect(), to remove a compiling warning. Signed-off-by: Coly Li Signed-off-by: Joel Becker --- fs/ocfs2/stack_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index ff4c798a5635..da78a2a334fd 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -814,7 +814,7 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; - struct ocfs2_live_connection *control; + struct ocfs2_live_connection *uninitialized_var(control); int rc = 0; BUG_ON(conn == NULL); -- cgit v1.2.3 From 936545401624217955df4dd44bb71615900e5397 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sun, 6 Dec 2009 22:38:53 +0800 Subject: ocfs2: replace u8 by __u8 in ocfs2_fs.h This patch replaces date type 'u8' with '__u8', which follows the coding style of ocfs2_fs.h, and portable to user space for ocfs2-tools. Signed-off-by: Coly Li Signed-off-by: Joel Becker --- fs/ocfs2/ocfs2_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index e9431e4a5e7c..1a1a679e51b5 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -1202,7 +1202,7 @@ struct ocfs2_local_disk_dqinfo { /* Header of one chunk of a quota file */ struct ocfs2_local_disk_chunk { __le32 dqc_free; /* Number of free entries in the bitmap */ - u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding + __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding * chunk of quota file */ }; -- cgit v1.2.3 From faf8b70f79edf56fedd531dfcd41f3e73c3e9696 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 3 Dec 2009 12:46:52 -0800 Subject: ocfs2: Use FIEMAP_EXTENT_SHARED Adds FIEMAP_EXTENT_SHARED flag to refcounted extents. Signed-off-by: Sunil Mushran Acked-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/extent_map.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 843db64e9d4a..cdce5f8c1cfa 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -786,6 +786,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, fe_flags = 0; if (rec.e_flags & OCFS2_EXT_UNWRITTEN) fe_flags |= FIEMAP_EXTENT_UNWRITTEN; + if (rec.e_flags & OCFS2_EXT_REFCOUNTED) + fe_flags |= FIEMAP_EXTENT_SHARED; if (is_last) fe_flags |= FIEMAP_EXTENT_LAST; len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; -- cgit v1.2.3 From 55f4946ed23cbf78efe9f818c4b55e890fe2beff Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Thu, 17 Dec 2009 18:42:16 +0800 Subject: Ocfs2: Should ocfs2 support fiemap for S_IFDIR inode? Let userspace have a chance to get the extent info of a directory just like extN did. Signed-off-by: Tristan Ye Signed-off-by: Joel Becker --- fs/ocfs2/namei.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f010b22b1c44..0b9f35e09bf8 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2326,4 +2326,5 @@ const struct inode_operations ocfs2_dir_iops = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, }; -- cgit v1.2.3 From c7d260afcbc0f9f4074e74796d4e3646875c08e1 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 18 Dec 2009 10:24:54 +0800 Subject: ocfs2: Add reflinked file's inode to inode hash eariler. We used to add reflinked file's inode to inode hash when we add it to the dest dir. But actually there is a race. Consider the following sequence. 1. reflink happens and create the inode in orphan dir. 2. reflink thread is scheduled out because of some io. 3. recovery begins to work and calls ocfs2_recover_orphans. It calls ocfs2_iget and get a new inode and i_count = 1. It calls iput then and delete inode. the buffer's uptodate state is cleared. This patch move insert_inode_hash to the create function so that it can be found by step 3 and prevented from deleting because i_count > 1. This resolves the bug http://oss.oracle.com/bugzilla/show_bug.cgi?id=1183. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f010b22b1c44..5ac4d52e090f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2136,6 +2136,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, if (status < 0) mlog_errno(status); + insert_inode_hash(inode); leave: if (status < 0 && did_quota_inode) vfs_dq_free_inode(inode); @@ -2284,7 +2285,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto out_commit; } - insert_inode_hash(inode); dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); status = 0; -- cgit v1.2.3 From 10cf1a02f444fdcd50be47cce3fa8bf08251dd9c Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 18 Dec 2009 10:24:55 +0800 Subject: ocfs2: Set i_nlink properly during reflink. We create a file in orphan dir for reflink so that if there is any error, we don't create any wrong dentry in the dir. But actually the file in orphan dir should be i_nlink = 0 so that it can be replayed and freed successfully. This patch first set i_nlink to 0 when creating the file in orphan dir and then set it to 1(reflink now only works for regular file) when we move it to the dest dir. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/namei.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 5ac4d52e090f..3e9b46002f22 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2108,6 +2108,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, } did_quota_inode = 1; + inode->i_nlink = 0; /* do the real work now. */ status = ocfs2_mknod_locked(osb, dir, inode, 0, &new_di_bh, parent_di_bh, handle, @@ -2268,6 +2269,8 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, di = (struct ocfs2_dinode *)di_bh->b_data; le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); di->i_orphaned_slot = 0; + inode->i_nlink = 1; + ocfs2_set_links_count(di, inode->i_nlink); ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, -- cgit v1.2.3 From 3d354cbc43db36e7e8b27ed78901064b87864ffc Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 20 Dec 2009 10:43:35 -0500 Subject: nfsd: fix "insecure" export option A typo in 12045a6ee9908b "nfsd: let "insecure" flag vary by pseudoflavor" reversed the sense of the "insecure" flag. Reported-by: Michael Guntsche Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- fs/nfsd/nfsfh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 1c12177b908c..55c8e63af0be 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -89,7 +89,7 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, int flags = nfsexp_flags(rqstp, exp); /* Check if the request originated from a secure port. */ - if (!rqstp->rq_secure && (flags & NFSEXP_INSECURE_PORT)) { + if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk(KERN_WARNING "nfsd: request from insecure port %s!\n", -- cgit v1.2.3 From 385e3ed4f0b81dd0b0b214050a30d352a71b95f7 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 16 Dec 2009 12:48:44 -0800 Subject: alloc_file(): simplify handling of mnt_clone_write() errors When alloc_file() and init_file() were combined, the error handling of mnt_clone_write() was taken into alloc_file() in a somewhat obfuscated way. Since we don't use the error code for anything except warning, we might as well warn directly without an extra variable. Signed-off-by: Roland Dreier Signed-off-by: Al Viro --- fs/file_table.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index 0afacf654398..69652c5bd5f0 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -186,10 +186,8 @@ struct file *alloc_file(struct path *path, fmode_t mode, * that we can do debugging checks at __fput() */ if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { - int error = 0; file_take_write(file); - error = mnt_clone_write(path->mnt); - WARN_ON(error); + WARN_ON(mnt_clone_write(path->mnt)); } ima_counts_get(file); return file; -- cgit v1.2.3 From ed2617585f39dd12fae38c657bba68b9779ea10d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 21 Dec 2009 17:53:53 +0100 Subject: fs/compat_ioctl.c: fix build error when !BLOCK No driver uses SG_SET_TRANSFORM any more in Linux, since the ide-scsi driver was removed in 2.6.29. The compat-ioctl cleanup series moved the handling for this around, which broke building without CONFIG_BLOCK. Just remove the code handling it for compat mode. Signed-off-by: Al Viro --- fs/compat_ioctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 14cbc831422a..332dd00f0894 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1600,8 +1600,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd, case KDSKBMETA: case KDSKBLED: case KDSETLED: - /* SG stuff */ - case SG_SET_TRANSFORM: /* AUTOFS */ case AUTOFS_IOC_READY: case AUTOFS_IOC_FAIL: -- cgit v1.2.3 From 628ff7c1d8d8466a5ad8078bd0206a130f8b8a51 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 18 Dec 2009 09:41:24 -0800 Subject: anonfd: Allow making anon files read-only It seems a couple places such as arch/ia64/kernel/perfmon.c and drivers/infiniband/core/uverbs_main.c could use anon_inode_getfile() instead of a private pseudo-fs + alloc_file(), if only there were a way to get a read-only file. So provide this by having anon_inode_getfile() create a read-only file if we pass O_RDONLY in flags. Signed-off-by: Roland Dreier Signed-off-by: Al Viro --- fs/anon_inodes.c | 12 ++++++++++-- fs/eventfd.c | 2 +- fs/eventpoll.c | 2 +- fs/signalfd.c | 2 +- fs/timerfd.c | 2 +- 5 files changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 2c994591f4d7..598237e97221 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -89,11 +89,19 @@ struct file *anon_inode_getfile(const char *name, struct qstr this; struct path path; struct file *file; + fmode_t mode; int error; if (IS_ERR(anon_inode_inode)) return ERR_PTR(-ENODEV); + switch (flags & O_ACCMODE) { + case O_RDONLY: mode = FMODE_READ; break; + case O_WRONLY: mode = FMODE_WRITE; break; + case O_RDWR: mode = FMODE_READ | FMODE_WRITE; break; + default: return ERR_PTR(-EINVAL); + } + if (fops->owner && !try_module_get(fops->owner)) return ERR_PTR(-ENOENT); @@ -121,13 +129,13 @@ struct file *anon_inode_getfile(const char *name, d_instantiate(path.dentry, anon_inode_inode); error = -ENFILE; - file = alloc_file(&path, FMODE_READ | FMODE_WRITE, fops); + file = alloc_file(&path, mode, fops); if (!file) goto err_dput; file->f_mapping = anon_inode_inode->i_mapping; file->f_pos = 0; - file->f_flags = O_RDWR | (flags & O_NONBLOCK); + file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); file->f_version = 0; file->private_data = priv; diff --git a/fs/eventfd.c b/fs/eventfd.c index 8b47e4200e65..d26402ff06ea 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -339,7 +339,7 @@ struct file *eventfd_file_create(unsigned int count, int flags) ctx->flags = flags; file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, - flags & EFD_SHARED_FCNTL_FLAGS); + O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); if (IS_ERR(file)) eventfd_free_ctx(ctx); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 366c503f9657..bd056a5b4efc 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1206,7 +1206,7 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) * a file structure and a free file descriptor. */ error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, - flags & O_CLOEXEC); + O_RDWR | (flags & O_CLOEXEC)); if (error < 0) ep_free(ep); diff --git a/fs/signalfd.c b/fs/signalfd.c index b07565c94386..1dabe4ee02fe 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -236,7 +236,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, * anon_inode_getfd() will install the fd. */ ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx, - flags & (O_CLOEXEC | O_NONBLOCK)); + O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK))); if (ufd < 0) kfree(ctx); } else { diff --git a/fs/timerfd.c b/fs/timerfd.c index b042bd7034b1..1bfc95ad5f71 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -200,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, - flags & TFD_SHARED_FCNTL_FLAGS); + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); if (ufd < 0) kfree(ctx); -- cgit v1.2.3 From 482928d59db668b8d82a48717f78986d8cea72e9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Dec 2009 10:10:39 -0500 Subject: Fix f_flags/f_mode in case of lookup_instantiate_filp() from open(pathname, 3) Just set f_flags when shoving struct file into nameidata; don't postpone that until __dentry_open(). do_filp_open() has correct value; lookup_instantiate_filp() doesn't - we lose the difference between O_RDWR and 3 by that point. We still set .intent.open.flags, so no fs code needs to be changed. Signed-off-by: Al Viro --- fs/internal.h | 7 +++++++ fs/namei.c | 6 ++++-- fs/open.c | 13 ++++++------- 3 files changed, 17 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index f67cd141d9a8..e96a1667d749 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -85,3 +85,10 @@ extern struct file *get_empty_filp(void); * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); + +/* + * open.c + */ +struct nameidata; +extern struct file *nameidata_to_filp(struct nameidata *); +extern void release_open_intent(struct nameidata *); diff --git a/fs/namei.c b/fs/namei.c index dad4b80257db..d517f73aa36b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1640,6 +1640,7 @@ struct file *do_filp_open(int dfd, const char *pathname, if (filp == NULL) return ERR_PTR(-ENFILE); nd.intent.open.file = filp; + filp->f_flags = open_flag; nd.intent.open.flags = flag; nd.intent.open.create_mode = 0; error = do_path_lookup(dfd, pathname, @@ -1685,6 +1686,7 @@ struct file *do_filp_open(int dfd, const char *pathname, if (filp == NULL) goto exit_parent; nd.intent.open.file = filp; + filp->f_flags = open_flag; nd.intent.open.flags = flag; nd.intent.open.create_mode = mode; dir = nd.path.dentry; @@ -1725,7 +1727,7 @@ do_last: mnt_drop_write(nd.path.mnt); goto exit; } - filp = nameidata_to_filp(&nd, open_flag); + filp = nameidata_to_filp(&nd); mnt_drop_write(nd.path.mnt); if (nd.root.mnt) path_put(&nd.root); @@ -1789,7 +1791,7 @@ ok: mnt_drop_write(nd.path.mnt); goto exit; } - filp = nameidata_to_filp(&nd, open_flag); + filp = nameidata_to_filp(&nd); if (!IS_ERR(filp)) { error = ima_path_check(&filp->f_path, filp->f_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); diff --git a/fs/open.c b/fs/open.c index ca69241796bd..6daee28f6e8f 100644 --- a/fs/open.c +++ b/fs/open.c @@ -821,15 +821,14 @@ static inline int __get_file_write_access(struct inode *inode, } static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, - int flags, struct file *f, + struct file *f, int (*open)(struct inode *, struct file *), const struct cred *cred) { struct inode *inode; int error; - f->f_flags = flags; - f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK | + f->f_mode = (__force fmode_t)((f->f_flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { @@ -930,7 +929,6 @@ struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry if (IS_ERR(dentry)) goto out_err; nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt), - nd->intent.open.flags - 1, nd->intent.open.file, open, cred); out: @@ -949,7 +947,7 @@ EXPORT_SYMBOL_GPL(lookup_instantiate_filp); * * Note that this function destroys the original nameidata */ -struct file *nameidata_to_filp(struct nameidata *nd, int flags) +struct file *nameidata_to_filp(struct nameidata *nd) { const struct cred *cred = current_cred(); struct file *filp; @@ -958,7 +956,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags) filp = nd->intent.open.file; /* Has the filesystem initialised the file for us? */ if (filp->f_path.dentry == NULL) - filp = __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp, + filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, NULL, cred); else path_put(&nd->path); @@ -997,7 +995,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, return ERR_PTR(error); } - return __dentry_open(dentry, mnt, flags, f, NULL, cred); + f->f_flags = flags; + return __dentry_open(dentry, mnt, f, NULL, cred); } EXPORT_SYMBOL(dentry_open); -- cgit v1.2.3 From 5300990c0370e804e49d9a59d928c5d53fb73487 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Dec 2009 10:15:07 -0500 Subject: Sanitize f_flags helpers * pull ACC_MODE to fs.h; we have several copies all over the place * nightmarish expression calculating f_mode by f_flags deserves a helper too (OPEN_FMODE(flags)) Signed-off-by: Al Viro --- fs/anon_inodes.c | 10 +--------- fs/namei.c | 2 -- fs/open.c | 2 +- 3 files changed, 2 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 598237e97221..9f0bf13291e5 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -89,19 +89,11 @@ struct file *anon_inode_getfile(const char *name, struct qstr this; struct path path; struct file *file; - fmode_t mode; int error; if (IS_ERR(anon_inode_inode)) return ERR_PTR(-ENODEV); - switch (flags & O_ACCMODE) { - case O_RDONLY: mode = FMODE_READ; break; - case O_WRONLY: mode = FMODE_WRITE; break; - case O_RDWR: mode = FMODE_READ | FMODE_WRITE; break; - default: return ERR_PTR(-EINVAL); - } - if (fops->owner && !try_module_get(fops->owner)) return ERR_PTR(-ENOENT); @@ -129,7 +121,7 @@ struct file *anon_inode_getfile(const char *name, d_instantiate(path.dentry, anon_inode_inode); error = -ENFILE; - file = alloc_file(&path, mode, fops); + file = alloc_file(&path, OPEN_FMODE(flags), fops); if (!file) goto err_dput; file->f_mapping = anon_inode_inode->i_mapping; diff --git a/fs/namei.c b/fs/namei.c index d517f73aa36b..68921d9b5302 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -37,8 +37,6 @@ #include "internal.h" -#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) - /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) * were necessary because of omirr. The reason is that omirr needs diff --git a/fs/open.c b/fs/open.c index 6daee28f6e8f..040cef72bc00 100644 --- a/fs/open.c +++ b/fs/open.c @@ -828,7 +828,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, struct inode *inode; int error; - f->f_mode = (__force fmode_t)((f->f_flags+1) & O_ACCMODE) | FMODE_LSEEK | + f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { -- cgit v1.2.3 From 28ba0ec64ca0f6ad2b0338ccec0b00a4e64e7a69 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 6 Nov 2009 11:31:06 +0000 Subject: jfs: Fix 32bit build warning loff_t is a type that isn't entirely dependant upon 32 v 64bit choice Signed-off-by: Alan Cox Signed-off-by: Al Viro --- fs/jfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2234c73fc577..d929a822a74e 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -524,7 +524,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) * Page cache is indexed by long. * I would use MAX_LFS_FILESIZE, but it's only half as big */ - sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, sb->s_maxbytes); + sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes); #endif sb->s_time_gran = 1; return 0; -- cgit v1.2.3 From c459001fa4f71deafb62e00fa70d35f695498965 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 9 Dec 2009 03:05:30 +0300 Subject: ext3: quota macros cleanup [V2] Currently all quota block reservation macros contains hardcoded "2" aka MAXQUOTAS value. This is no good because in some places it is not obvious to understand what does this digit represent. Let's introduce new macro with self descriptive name. Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/ext3/inode.c | 8 ++++---- fs/ext3/namei.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index ad14227f509e..455e6e6e5cb9 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -970,7 +970,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, if (max_blocks > DIO_MAX_BLOCKS) max_blocks = DIO_MAX_BLOCKS; handle = ext3_journal_start(inode, DIO_CREDITS + - 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb)); + EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -3146,8 +3146,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+ - EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; @@ -3239,7 +3239,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode) #ifdef CONFIG_QUOTA /* We know that structure was already allocated during vfs_dq_init so * we will be updating only the data blocks + inodes */ - ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); + ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); #endif return ret; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index aad6400c9b77..81f7348b2de3 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1699,7 +1699,7 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1733,7 +1733,7 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1769,7 +1769,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2175,7 +2175,7 @@ static int ext3_symlink (struct inode * dir, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); -- cgit v1.2.3 From b462707e7ccad058ae151e5c5b06eb5cadcb737f Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 14 Dec 2009 15:21:12 +0300 Subject: Add unlocked version of inode_add_bytes() function Quota code requires unlocked version of this function. Off course we can just copy-paste the code, but copy-pasting is always an evil. Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/stat.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/stat.c b/fs/stat.c index 075694e31d8b..c4ecd52c5737 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -401,9 +401,9 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename, } #endif /* __ARCH_WANT_STAT64 */ -void inode_add_bytes(struct inode *inode, loff_t bytes) +/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ +void __inode_add_bytes(struct inode *inode, loff_t bytes) { - spin_lock(&inode->i_lock); inode->i_blocks += bytes >> 9; bytes &= 511; inode->i_bytes += bytes; @@ -411,6 +411,12 @@ void inode_add_bytes(struct inode *inode, loff_t bytes) inode->i_blocks++; inode->i_bytes -= 512; } +} + +void inode_add_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, bytes); spin_unlock(&inode->i_lock); } -- cgit v1.2.3 From fd8fbfc1709822bd94247c5b2ab15a5f5041e103 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 14 Dec 2009 15:21:13 +0300 Subject: quota: decouple fs reserved space from quota reservation Currently inode_reservation is managed by fs itself and this reservation is transfered on dquot_transfer(). This means what inode_reservation must always be in sync with dquot->dq_dqb.dqb_rsvspace. Otherwise dquot_transfer() will result in incorrect quota(WARN_ON in dquot_claim_reserved_space() will be triggered) This is not easy because of complex locking order issues for example http://bugzilla.kernel.org/show_bug.cgi?id=14739 The patch introduce quota reservation field for each fs-inode (fs specific inode is used in order to prevent bloating generic vfs inode). This reservation is managed by quota code internally similar to i_blocks/i_bytes and may not be always in sync with internal fs reservation. Also perform some code rearrangement: - Unify dquot_reserve_space() and dquot_reserve_space() - Unify dquot_release_reserved_space() and dquot_free_space() - Also this patch add missing warning update to release_rsv() dquot_release_reserved_space() must call flush_warnings() as dquot_free_space() does. Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/quota/dquot.c | 213 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 119 insertions(+), 94 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index cd6bb9a33c13..1cb8fa84300f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1318,6 +1318,67 @@ void vfs_dq_drop(struct inode *inode) } EXPORT_SYMBOL(vfs_dq_drop); +/* + * inode_reserved_space is managed internally by quota, and protected by + * i_lock similar to i_blocks+i_bytes. + */ +static qsize_t *inode_reserved_space(struct inode * inode) +{ + /* Filesystem must explicitly define it's own method in order to use + * quota reservation interface */ + BUG_ON(!inode->i_sb->dq_op->get_reserved_space); + return inode->i_sb->dq_op->get_reserved_space(inode); +} + +static void inode_add_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + spin_unlock(&inode->i_lock); +} + + +static void inode_claim_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); +} + +static void inode_sub_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + spin_unlock(&inode->i_lock); +} + +static qsize_t inode_get_rsv_space(struct inode *inode) +{ + qsize_t ret; + spin_lock(&inode->i_lock); + ret = *inode_reserved_space(inode); + spin_unlock(&inode->i_lock); + return ret; +} + +static void inode_incr_space(struct inode *inode, qsize_t number, + int reserve) +{ + if (reserve) + inode_add_rsv_space(inode, number); + else + inode_add_bytes(inode, number); +} + +static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) +{ + if (reserve) + inode_sub_rsv_space(inode, number); + else + inode_sub_bytes(inode, number); +} + /* * Following four functions update i_blocks+i_bytes fields and * quota information (together with appropriate checks) @@ -1336,6 +1397,21 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int cnt, ret = QUOTA_OK; char warntype[MAXQUOTAS]; + /* + * First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex + */ + if (IS_NOQUOTA(inode)) { + inode_incr_space(inode, number, reserve); + goto out; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) { + inode_incr_space(inode, number, reserve); + goto out_unlock; + } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; @@ -1346,7 +1422,8 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) == NO_QUOTA) { ret = NO_QUOTA; - goto out_unlock; + spin_unlock(&dq_data_lock); + goto out_flush_warn; } } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1357,64 +1434,32 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, else dquot_incr_space(inode->i_dquot[cnt], number); } - if (!reserve) - inode_add_bytes(inode, number); -out_unlock: + inode_incr_space(inode, number, reserve); spin_unlock(&dq_data_lock); - flush_warnings(inode->i_dquot, warntype); - return ret; -} - -int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) -{ - int cnt, ret = QUOTA_OK; - - /* - * First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex - */ - if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); - goto out; - } - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); - goto out_unlock; - } - - ret = __dquot_alloc_space(inode, number, warn, 0); - if (ret == NO_QUOTA) - goto out_unlock; + if (reserve) + goto out_flush_warn; /* Dirtify all the dquots - this can block when journalling */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (inode->i_dquot[cnt]) mark_dquot_dirty(inode->i_dquot[cnt]); +out_flush_warn: + flush_warnings(inode->i_dquot, warntype); out_unlock: up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); out: return ret; } + +int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) +{ + return __dquot_alloc_space(inode, number, warn, 0); +} EXPORT_SYMBOL(dquot_alloc_space); int dquot_reserve_space(struct inode *inode, qsize_t number, int warn) { - int ret = QUOTA_OK; - - if (IS_NOQUOTA(inode)) - goto out; - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) - goto out_unlock; - - ret = __dquot_alloc_space(inode, number, warn, 1); -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return __dquot_alloc_space(inode, number, warn, 1); } EXPORT_SYMBOL(dquot_reserve_space); @@ -1471,14 +1516,14 @@ int dquot_claim_space(struct inode *inode, qsize_t number) int ret = QUOTA_OK; if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); goto out; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); if (IS_NOQUOTA(inode)) { up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); goto out; } @@ -1490,7 +1535,7 @@ int dquot_claim_space(struct inode *inode, qsize_t number) number); } /* Update inode bytes */ - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); spin_unlock(&dq_data_lock); /* Dirtify all the dquots - this can block when journalling */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) @@ -1502,39 +1547,10 @@ out: } EXPORT_SYMBOL(dquot_claim_space); -/* - * Release reserved quota space - */ -void dquot_release_reserved_space(struct inode *inode, qsize_t number) -{ - int cnt; - - if (IS_NOQUOTA(inode)) - goto out; - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) - goto out_unlock; - - spin_lock(&dq_data_lock); - /* Release reserved dquots */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt]) - dquot_free_reserved_space(inode->i_dquot[cnt], number); - } - spin_unlock(&dq_data_lock); - -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return; -} -EXPORT_SYMBOL(dquot_release_reserved_space); - /* * This operation can block, but only after everything is updated */ -int dquot_free_space(struct inode *inode, qsize_t number) +int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) { unsigned int cnt; char warntype[MAXQUOTAS]; @@ -1543,7 +1559,7 @@ int dquot_free_space(struct inode *inode, qsize_t number) * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) { out_sub: - inode_sub_bytes(inode, number); + inode_decr_space(inode, number, reserve); return QUOTA_OK; } @@ -1558,20 +1574,42 @@ out_sub: if (!inode->i_dquot[cnt]) continue; warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); - dquot_decr_space(inode->i_dquot[cnt], number); + if (reserve) + dquot_free_reserved_space(inode->i_dquot[cnt], number); + else + dquot_decr_space(inode->i_dquot[cnt], number); } - inode_sub_bytes(inode, number); + inode_decr_space(inode, number, reserve); spin_unlock(&dq_data_lock); + + if (reserve) + goto out_unlock; /* Dirtify all the dquots - this can block when journalling */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (inode->i_dquot[cnt]) mark_dquot_dirty(inode->i_dquot[cnt]); +out_unlock: flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; } + +int dquot_free_space(struct inode *inode, qsize_t number) +{ + return __dquot_free_space(inode, number, 0); +} EXPORT_SYMBOL(dquot_free_space); +/* + * Release reserved quota space + */ +void dquot_release_reserved_space(struct inode *inode, qsize_t number) +{ + __dquot_free_space(inode, number, 1); + +} +EXPORT_SYMBOL(dquot_release_reserved_space); + /* * This operation can block, but only after everything is updated */ @@ -1609,19 +1647,6 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) } EXPORT_SYMBOL(dquot_free_inode); -/* - * call back function, get reserved quota space from underlying fs - */ -qsize_t dquot_get_reserved_space(struct inode *inode) -{ - qsize_t reserved_space = 0; - - if (sb_any_quota_active(inode->i_sb) && - inode->i_sb->dq_op->get_reserved_space) - reserved_space = inode->i_sb->dq_op->get_reserved_space(inode); - return reserved_space; -} - /* * Transfer the number of inode and blocks from one diskquota to an other. * @@ -1665,7 +1690,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } spin_lock(&dq_data_lock); cur_space = inode_get_bytes(inode); - rsv_space = dquot_get_reserved_space(inode); + rsv_space = inode_get_rsv_space(inode); space = cur_space + rsv_space; /* Build the transfer_from list and check the limits */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -- cgit v1.2.3 From a9e7f4472075fb6937c545af3f6329e9946bbe66 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 14 Dec 2009 15:21:14 +0300 Subject: ext4: Convert to generic reserved quota's space management. This patch also fixes write vs chown race condition. Acked-by: "Theodore Ts'o" Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/ext4/ext4.h | 6 +++++- fs/ext4/inode.c | 16 +++++++--------- fs/ext4/super.c | 5 +++++ 3 files changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ab31e65d46d0..56f9271ee8cc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -704,6 +704,10 @@ struct ext4_inode_info { __u16 i_extra_isize; spinlock_t i_block_reservation_lock; +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif /* completed async DIOs that might need unwritten extents handling */ struct list_head i_aio_dio_complete_list; @@ -1435,7 +1439,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); -extern qsize_t ext4_get_reserved_space(struct inode *inode); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int flush_aio_dio_completed_IO(struct inode *inode); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5352db1a3086..a31980dd9b86 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1003,17 +1003,12 @@ out: return err; } -qsize_t ext4_get_reserved_space(struct inode *inode) +#ifdef CONFIG_QUOTA +qsize_t *ext4_get_reserved_space(struct inode *inode) { - unsigned long long total; - - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - total = EXT4_I(inode)->i_reserved_data_blocks + - EXT4_I(inode)->i_reserved_meta_blocks; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - - return (total << inode->i_blkbits); + return &EXT4_I(inode)->i_reserved_quota; } +#endif /* * Calculate the number of metadata blocks need to reserve * to allocate @blocks for non extent file based file @@ -4794,6 +4789,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); ei->i_disksize = inode->i_size; +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 827bde1f2594..6ed9aa91f27d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -704,6 +704,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_allocated_meta_blocks = 0; ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; @@ -1014,7 +1017,9 @@ static const struct dquot_operations ext4_quota_operations = { .reserve_space = dquot_reserve_space, .claim_space = dquot_claim_space, .release_rsv = dquot_release_reserved_space, +#ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, +#endif .alloc_inode = dquot_alloc_inode, .free_space = dquot_free_space, .free_inode = dquot_free_inode, -- cgit v1.2.3 From dc52dd3a3a800e70b3440ea4424f8c87ab043e42 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 14 Dec 2009 15:21:15 +0300 Subject: quota: Move duplicated code to separate functions - for(..) { mark_dquot_dirty(); } -> mark_all_dquot_dirty() - for(..) { dput(); } -> dqput_all() Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/quota/dquot.c | 77 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 38 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 1cb8fa84300f..dea86abdf2e7 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -323,6 +323,30 @@ int dquot_mark_dquot_dirty(struct dquot *dquot) } EXPORT_SYMBOL(dquot_mark_dquot_dirty); +/* Dirtify all the dquots - this can block when journalling */ +static inline int mark_all_dquot_dirty(struct dquot * const *dquot) +{ + int ret, err, cnt; + + ret = err = 0; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (dquot[cnt]) + /* Even in case of error we have to continue */ + ret = mark_dquot_dirty(dquot[cnt]); + if (!err) + err = ret; + } + return err; +} + +static inline void dqput_all(struct dquot **dquot) +{ + unsigned int cnt; + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + dqput(dquot[cnt]); +} + /* This function needs dq_list_lock */ static inline int clear_dquot_dirty(struct dquot *dquot) { @@ -1268,8 +1292,7 @@ int dquot_initialize(struct inode *inode, int type) out_err: up_write(&sb_dqopt(sb)->dqptr_sem); /* Drop unused references */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - dqput(got[cnt]); + dqput_all(got); return ret; } EXPORT_SYMBOL(dquot_initialize); @@ -1288,9 +1311,7 @@ int dquot_drop(struct inode *inode) inode->i_dquot[cnt] = NULL; } up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - dqput(put[cnt]); + dqput_all(put); return 0; } EXPORT_SYMBOL(dquot_drop); @@ -1439,10 +1460,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, if (reserve) goto out_flush_warn; - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); out_flush_warn: flush_warnings(inode->i_dquot, warntype); out_unlock: @@ -1500,10 +1518,7 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number) warn_put_all: spin_unlock(&dq_data_lock); if (ret == QUOTA_OK) - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return ret; @@ -1537,10 +1552,7 @@ int dquot_claim_space(struct inode *inode, qsize_t number) /* Update inode bytes */ inode_claim_rsv_space(inode, number); spin_unlock(&dq_data_lock); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); out: return ret; @@ -1584,10 +1596,7 @@ out_sub: if (reserve) goto out_unlock; - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); out_unlock: flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1637,10 +1646,7 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) dquot_decr_inodes(inode->i_dquot[cnt], number); } spin_unlock(&dq_data_lock); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; @@ -1734,25 +1740,18 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) spin_unlock(&dq_data_lock); up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (transfer_from[cnt]) - mark_dquot_dirty(transfer_from[cnt]); - if (transfer_to[cnt]) { - mark_dquot_dirty(transfer_to[cnt]); - /* The reference we got is transferred to the inode */ - transfer_to[cnt] = NULL; - } - } + mark_all_dquot_dirty(transfer_from); + mark_all_dquot_dirty(transfer_to); + /* The reference we got is transferred to the inode */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + transfer_to[cnt] = NULL; warn_put_all: flush_warnings(transfer_to, warntype_to); flush_warnings(transfer_from, warntype_from_inodes); flush_warnings(transfer_from, warntype_from_space); put_all: - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - dqput(transfer_from[cnt]); - dqput(transfer_to[cnt]); - } + dqput_all(transfer_from); + dqput_all(transfer_to); return ret; over_quota: spin_unlock(&dq_data_lock); -- cgit v1.2.3 From ed505ee454dbf133b1a4067d3ac0325d4261eda1 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 14 Dec 2009 12:59:18 -0600 Subject: ext3: Remove outdated comment about lock_super() ext3_fill_super() is no longer called by read_super(), and it is no longer called with the superblock locked. The unlock_super()/lock_super() is no longer present, so this comment is entirely superfluous. Port of ext4 commit 32ed5058ce90024efcd811254b4b1de0468099df by Theodore Ts'o . CC: Theodore Ts'o Signed-off-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/ext3/super.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7ad1e8c30bd0..8ad566b845ae 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2014,14 +2014,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) } ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock - * in numerous places. Here we just pop the lock - it's relatively - * harmless, because we are now ready to accept write_super() requests, - * and aviro says that's the only reason for hanging onto the - * superblock lock. - */ + EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; ext3_orphan_cleanup(sb, es); EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; -- cgit v1.2.3 From 4854a5f0cbb1967fc7db3ea861d97afeea78b88b Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 14 Dec 2009 12:59:59 -0600 Subject: ext3: ext3_mark_recovery_complete() doesn't need to use lock_super The function ext3_mark_recovery_complete() is called from two call paths: either (a) while mounting the filesystem, in which case there's no danger of any other CPU calling write_super() until the mount is completed, and (b) while remounting the filesystem read-write, in which case the fs core has already locked the superblock. This also allows us to take out a very vile unlock_super()/lock_super() pair in ext3_remount(). Port of ext4 commit a63c9eb2ce6f5028da90f282798232c4f398ceb8 by Theodore Ts'o . CC: Theodore Ts'o Signed-off-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/ext3/super.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 8ad566b845ae..806b8b780add 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2396,13 +2396,11 @@ static void ext3_mark_recovery_complete(struct super_block * sb, if (journal_flush(journal) < 0) goto out; - lock_super(sb); if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ext3_commit_super(sb, es, 1); } - unlock_super(sb); out: journal_unlock_updates(journal); @@ -2594,13 +2592,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) (sbi->s_mount_state & EXT3_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); - /* - * We have to unlock super so that we can wait for - * transactions. - */ - unlock_super(sb); ext3_mark_recovery_complete(sb, es); - lock_super(sb); } else { __le32 ret; if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, -- cgit v1.2.3 From b8a052d01669977f224255b0f9f2737018171ddb Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 14 Dec 2009 13:00:30 -0600 Subject: ext3: Replace lock/unlock_super() with an explicit lock for the orphan list Use a separate lock to protect the orphan list, so we can stop overloading the use of lock_super(). Port of ext4 commit 3b9d4ed26680771295d904a6b83e88e620780893 by Theodore Ts'o . CC: Theodore Ts'o Signed-off-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/ext3/namei.c | 20 +++++++++++--------- fs/ext3/super.c | 1 + 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 81f7348b2de3..7b0e44f7d66f 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1920,7 +1920,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) struct ext3_iloc iloc; int err = 0, rc; - lock_super(sb); + mutex_lock(&EXT3_SB(sb)->s_orphan_lock); if (!list_empty(&EXT3_I(inode)->i_orphan)) goto out_unlock; @@ -1929,9 +1929,13 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) /* @@@ FIXME: Observation from aviro: * I think I can trigger J_ASSERT in ext3_orphan_add(). We block - * here (on lock_super()), so race with ext3_link() which might bump + * here (on s_orphan_lock), so race with ext3_link() which might bump * ->i_nlink. For, say it, character device. Not a regular file, * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? */ J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); @@ -1968,7 +1972,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out_unlock: - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_orphan_lock); ext3_std_error(inode->i_sb, err); return err; } @@ -1986,11 +1990,9 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode) struct ext3_iloc iloc; int err = 0; - lock_super(inode->i_sb); - if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } + mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; ino_next = NEXT_ORPHAN(inode); prev = ei->i_orphan.prev; @@ -2040,7 +2042,7 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode) out_err: ext3_std_error(inode->i_sb, err); out: - unlock_super(inode->i_sb); + mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock); return err; out_brelse: diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 806b8b780add..97dd3828384c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1928,6 +1928,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->dq_op = &ext3_quota_operations; #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); sb->s_root = NULL; -- cgit v1.2.3 From 96d2a495c25d525873529b736cdb63ad502b101c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 14 Dec 2009 13:01:05 -0600 Subject: ext3: Replace lock/unlock_super() with an explicit lock for resizing Use a separate lock to protect s_groups_count and the other block group descriptors which get changed via an on-line resize operation, so we can stop overloading the use of lock_super(). Port of ext4 commit 32ed5058ce90024efcd811254b4b1de0468099df by Theodore Ts'o . CC: Theodore Ts'o Signed-off-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/ext3/resize.c | 35 ++++++++++++++++++----------------- fs/ext3/super.c | 1 + 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 5f83b6179178..54351ac7cef9 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -209,7 +209,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { err = -EBUSY; goto exit_journal; @@ -324,7 +324,7 @@ exit_bh: brelse(bh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; @@ -662,11 +662,12 @@ exit_free: * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * - * We do not need lock_super() for this, because these blocks are not - * otherwise touched by the filesystem code when it is mounted. We don't - * need to worry about last changing from sbi->s_groups_count, because the - * worst that can happen is that we do not copy the full number of backups - * at this time. The resize which changed s_groups_count will backup again. + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. */ static void update_backups(struct super_block *sb, int blk_off, char *data, int size) @@ -825,7 +826,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) goto exit_put; } - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); @@ -856,7 +857,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) /* * OK, now we've set up the new group. Time to make it active. * - * Current kernels don't lock all allocations via lock_super(), + * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -900,12 +901,12 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold lock_super + * * Writers of s_groups_count *must* hold s_resize_lock * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold lock_super() over the access + * * Readers must hold s_resize_lock over the access * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. @@ -936,7 +937,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) ext3_journal_dirty_metadata(handle, sbi->s_sbh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; if (!err) { @@ -973,7 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after - * taking lock_super() below. */ + * taking the s_resize_lock below. */ o_blocks_count = le32_to_cpu(es->s_blocks_count); o_groups_count = EXT3_SB(sb)->s_groups_count; @@ -1045,11 +1046,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, goto exit_put; } - lock_super(sb); + mutex_lock(&EXT3_SB(sb)->s_resize_lock); if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_journal_stop(handle); err = -EBUSY; goto exit_put; @@ -1059,13 +1060,13 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, EXT3_SB(sb)->s_sbh))) { ext3_warning(sb, __func__, "error %d on journal write access", err); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_journal_stop(handle); goto exit_put; } es->s_blocks_count = cpu_to_le32(o_blocks_count + add); ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 97dd3828384c..afa2b569da10 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1929,6 +1929,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); sb->s_root = NULL; -- cgit v1.2.3 From 82fdfa928cfa19d9627526b2ce164a27f8e9d34b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 15 Dec 2009 22:24:36 +0100 Subject: quota: Fix 64-bit limits setting on 32-bit archs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix warnings: fs/quota/quota_v2.c: In function ‘v2_read_file_info’: fs/quota/quota_v2.c:123: warning: integer constant is too large for ‘long’ type fs/quota/quota_v2.c:124: warning: integer constant is too large for ‘long’ type Reported-by: Jerry Leo Signed-off-by: Jan Kara --- fs/quota/quota_v2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 3dfc23e02135..669855cf9b61 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -120,8 +120,8 @@ static int v2_read_file_info(struct super_block *sb, int type) info->dqi_maxilimit = 0xffffffff; } else { /* used space is stored as unsigned 64-bit value */ - info->dqi_maxblimit = 0xffffffffffffffff; /* 2^64-1 */ - info->dqi_maxilimit = 0xffffffffffffffff; + info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */ + info->dqi_maxilimit = 0xffffffffffffffffULL; } info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); -- cgit v1.2.3 From d21cd8f163ac44b15c465aab7306db931c606908 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Thu, 10 Dec 2009 03:31:45 +0000 Subject: ext4: Fix potential quota deadlock We have to delay vfs_dq_claim_space() until allocation context destruction. Currently we have following call-trace: ext4_mb_new_blocks() /* task is already holding ac->alloc_semp */ ->ext4_mb_mark_diskspace_used ->vfs_dq_claim_space() /* acquire dqptr_sem here. Possible deadlock */ ->ext4_mb_release_context() /* drop ac->alloc_semp here */ Let's move quota claiming to ext4_da_update_reserve_space() ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.32-rc7 #18 ------------------------------------------------------- write-truncate-/3465 is trying to acquire lock: (&s->s_dquot.dqptr_sem){++++..}, at: [] dquot_claim_space+0x3b/0x1b0 but task is already holding lock: (&meta_group_info[i]->alloc_sem){++++..}, at: [] ext4_mb_load_buddy+0xb2/0x370 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&meta_group_info[i]->alloc_sem){++++..}: [] __lock_acquire+0xd7b/0x1260 [] lock_acquire+0xba/0xd0 [] down_read+0x51/0x90 [] ext4_mb_load_buddy+0xb2/0x370 [] ext4_mb_free_blocks+0x46c/0x870 [] ext4_free_blocks+0x73/0x130 [] ext4_ext_truncate+0x76c/0x8d0 [] ext4_truncate+0x187/0x5e0 [] vmtruncate+0x6b/0x70 [] inode_setattr+0x62/0x190 [] ext4_setattr+0x25a/0x370 [] notify_change+0x151/0x340 [] do_truncate+0x6d/0xa0 [] may_open+0x1d4/0x200 [] do_filp_open+0x1eb/0x910 [] do_sys_open+0x6d/0x140 [] sys_open+0x2e/0x40 [] sysenter_do_call+0x12/0x32 -> #2 (&ei->i_data_sem){++++..}: [] __lock_acquire+0xd7b/0x1260 [] lock_acquire+0xba/0xd0 [] down_read+0x51/0x90 [] ext4_get_blocks+0x47/0x450 [] ext4_getblk+0x61/0x1d0 [] ext4_bread+0x1f/0xa0 [] ext4_quota_write+0x12c/0x310 [] qtree_write_dquot+0x93/0x120 [] v2_write_dquot+0x28/0x30 [] dquot_commit+0xab/0xf0 [] ext4_write_dquot+0x77/0x90 [] ext4_mark_dquot_dirty+0x2f/0x50 [] dquot_alloc_inode+0x101/0x180 [] ext4_new_inode+0x602/0xf00 [] ext4_create+0x89/0x150 [] vfs_create+0xa2/0xc0 [] do_filp_open+0x7a7/0x910 [] do_sys_open+0x6d/0x140 [] sys_open+0x2e/0x40 [] sysenter_do_call+0x12/0x32 -> #1 (&sb->s_type->i_mutex_key#7/4){+.+...}: [] __lock_acquire+0xd7b/0x1260 [] lock_acquire+0xba/0xd0 [] mutex_lock_nested+0x65/0x2d0 [] vfs_load_quota_inode+0x4bd/0x5a0 [] vfs_quota_on_path+0x5f/0x70 [] ext4_quota_on+0x112/0x190 [] sys_quotactl+0x44a/0x8a0 [] sysenter_do_call+0x12/0x32 -> #0 (&s->s_dquot.dqptr_sem){++++..}: [] __lock_acquire+0x1091/0x1260 [] lock_acquire+0xba/0xd0 [] down_read+0x51/0x90 [] dquot_claim_space+0x3b/0x1b0 [] ext4_mb_mark_diskspace_used+0x36f/0x380 [] ext4_mb_new_blocks+0x34a/0x530 [] ext4_ext_get_blocks+0x122b/0x13c0 [] ext4_get_blocks+0x226/0x450 [] mpage_da_map_blocks+0xc3/0xaa0 [] ext4_da_writepages+0x506/0x790 [] do_writepages+0x22/0x50 [] __filemap_fdatawrite_range+0x6d/0x80 [] filemap_flush+0x2b/0x30 [] ext4_alloc_da_blocks+0x5c/0x60 [] ext4_release_file+0x75/0xb0 [] __fput+0xf9/0x210 [] fput+0x27/0x30 [] filp_close+0x4c/0x80 [] put_files_struct+0x6e/0xd0 [] exit_files+0x47/0x60 [] do_exit+0x144/0x710 [] do_group_exit+0x38/0xa0 [] get_signal_to_deliver+0x2ac/0x410 [] do_notify_resume+0xb9/0x890 [] work_notifysig+0x13/0x21 other info that might help us debug this: 3 locks held by write-truncate-/3465: #0: (jbd2_handle){+.+...}, at: [] start_this_handle+0x38f/0x5c0 #1: (&ei->i_data_sem){++++..}, at: [] ext4_get_blocks+0xb6/0x450 #2: (&meta_group_info[i]->alloc_sem){++++..}, at: [] ext4_mb_load_buddy+0xb2/0x370 stack backtrace: Pid: 3465, comm: write-truncate- Not tainted 2.6.32-rc7 #18 Call Trace: [] ? printk+0x1d/0x22 [] print_circular_bug+0xca/0xd0 [] __lock_acquire+0x1091/0x1260 [] ? sched_clock_local+0xd2/0x170 [] ? trace_hardirqs_off_caller+0x20/0xd0 [] lock_acquire+0xba/0xd0 [] ? dquot_claim_space+0x3b/0x1b0 [] down_read+0x51/0x90 [] ? dquot_claim_space+0x3b/0x1b0 [] dquot_claim_space+0x3b/0x1b0 [] ext4_mb_mark_diskspace_used+0x36f/0x380 [] ext4_mb_new_blocks+0x34a/0x530 [] ? ext4_ext_find_extent+0x25d/0x280 [] ext4_ext_get_blocks+0x122b/0x13c0 [] ? sched_clock_local+0xd2/0x170 [] ? sched_clock_cpu+0x120/0x160 [] ? cpu_clock+0x4f/0x60 [] ? trace_hardirqs_off_caller+0x20/0xd0 [] ? down_write+0x8c/0xa0 [] ext4_get_blocks+0x226/0x450 [] ? sched_clock_cpu+0x120/0x160 [] ? cpu_clock+0x4f/0x60 [] ? trace_hardirqs_off+0xb/0x10 [] mpage_da_map_blocks+0xc3/0xaa0 [] ? find_get_pages_tag+0x16c/0x180 [] ? find_get_pages_tag+0x0/0x180 [] ? __mpage_da_writepage+0x16d/0x1a0 [] ? pagevec_lookup_tag+0x2e/0x40 [] ? write_cache_pages+0xdb/0x3d0 [] ? __mpage_da_writepage+0x0/0x1a0 [] ext4_da_writepages+0x506/0x790 [] ? cpu_clock+0x4f/0x60 [] ? sched_clock_local+0xd2/0x170 [] ? sched_clock_cpu+0x120/0x160 [] ? sched_clock_cpu+0x120/0x160 [] ? ext4_da_writepages+0x0/0x790 [] do_writepages+0x22/0x50 [] __filemap_fdatawrite_range+0x6d/0x80 [] filemap_flush+0x2b/0x30 [] ext4_alloc_da_blocks+0x5c/0x60 [] ext4_release_file+0x75/0xb0 [] __fput+0xf9/0x210 [] fput+0x27/0x30 [] filp_close+0x4c/0x80 [] put_files_struct+0x6e/0xd0 [] exit_files+0x47/0x60 [] do_exit+0x144/0x710 [] ? lock_release_holdtime+0x33/0x210 [] ? _spin_unlock_irq+0x27/0x30 [] do_group_exit+0x38/0xa0 [] ? trace_hardirqs_on+0xb/0x10 [] get_signal_to_deliver+0x2ac/0x410 [] do_notify_resume+0xb9/0x890 [] ? trace_hardirqs_off_caller+0x20/0xd0 [] ? lock_release_holdtime+0x33/0x210 [] ? autoremove_wake_function+0x0/0x50 [] ? trace_hardirqs_on_caller+0x134/0x190 [] ? trace_hardirqs_on+0xb/0x10 [] ? security_file_permission+0x14/0x20 [] ? vfs_write+0x131/0x190 [] ? do_sync_write+0x0/0x120 [] ? sysenter_do_call+0x27/0x32 [] work_notifysig+0x13/0x21 CC: Theodore Ts'o Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/ext4/inode.c | 9 +++++++-- fs/ext4/mballoc.c | 6 ------ 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a31980dd9b86..ddd0a9c87d9e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1046,7 +1046,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, int blocks) static void ext4_da_update_reserve_space(struct inode *inode, int used) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int total, mdb, mdb_free; + int total, mdb, mdb_free, mdb_claim = 0; spin_lock(&EXT4_I(inode)->i_block_reservation_lock); /* recalculate the number of metablocks still need to be reserved */ @@ -1059,7 +1059,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) if (mdb_free) { /* Account for allocated meta_blocks */ - mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; + mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks; + BUG_ON(mdb_free < mdb_claim); + mdb_free -= mdb_claim; /* update fs dirty blocks counter */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); @@ -1070,8 +1072,11 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) /* update per-inode reservations */ BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); EXT4_I(inode)->i_reserved_data_blocks -= used; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + vfs_dq_claim_block(inode, used + mdb_claim); + /* * free those over-booking quota for metadata blocks */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b1fd3daadc9c..d34afad3e137 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2755,12 +2755,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); - else { - percpu_counter_sub(&sbi->s_dirtyblocks_counter, - ac->ac_b_ex.fe_len); - /* convert reserved quota blocks to real quota blocks */ - vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len); - } if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, -- cgit v1.2.3 From 39bc680a8160bb9d6743f7873b535d553ff61058 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Thu, 10 Dec 2009 16:36:27 +0000 Subject: ext4: fix sleep inside spinlock issue with quota and dealloc (#14739) Unlock i_block_reservation_lock before vfs_dq_reserve_block(). This patch fixes http://bugzilla.kernel.org/show_bug.cgi?id=14739 CC: Theodore Ts'o Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/ext4/inode.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ddd0a9c87d9e..ab807963a614 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1816,19 +1816,17 @@ repeat: md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; total = md_needed + nrblocks; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); /* * Make quota reservation here to prevent quota overflow * later. Real quota accounting is done at pages writeout * time. */ - if (vfs_dq_reserve_block(inode, total)) { - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + if (vfs_dq_reserve_block(inode, total)) return -EDQUOT; - } if (ext4_claim_free_blocks(sbi, total)) { - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); vfs_dq_release_reservation_block(inode, total); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); @@ -1836,10 +1834,11 @@ repeat: } return -ENOSPC; } + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); EXT4_I(inode)->i_reserved_data_blocks += nrblocks; - EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; - + EXT4_I(inode)->i_reserved_meta_blocks += md_needed; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return 0; /* success */ } -- cgit v1.2.3 From 765f8361902d015c864d5e62019b2f139452d7ef Mon Sep 17 00:00:00 2001 From: Yin Kangkai Date: Tue, 15 Dec 2009 14:48:25 -0800 Subject: jbd: jbd-debug and jbd2-debug should be writable jbd-debug and jbd2-debug is currently read-only (S_IRUGO), which is not correct. Make it writable so that we can start debuging. Signed-off-by: Yin Kangkai Reviewed-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Jan Kara --- fs/jbd/journal.c | 2 +- fs/jbd2/journal.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 4160afad6d00..bd224eec9b07 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1913,7 +1913,7 @@ static void __init jbd_create_debugfs_entry(void) { jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); if (jbd_debugfs_dir) - jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO, + jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR, jbd_debugfs_dir, &journal_enable_debug); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b7ca3a92a4db..17af879e6e9e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2115,7 +2115,8 @@ static void __init jbd2_create_debugfs_entry(void) { jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL); if (jbd2_debugfs_dir) - jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, S_IRUGO, + jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, + S_IRUGO | S_IWUSR, jbd2_debugfs_dir, &jbd2_journal_enable_debug); } -- cgit v1.2.3 From 869835dfad3eb6f7d90c3255a24b084fea82f30d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 21 Dec 2009 21:57:04 +0100 Subject: quota: Improve checking of quota file header When we are asked for vfsv0 quota format and the file is in vfsv1 format (or vice versa), refuse to use the quota file. Also return with error when we don't like the header of quota file. Signed-off-by: Jan Kara --- fs/quota/quota_v2.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 669855cf9b61..e3da02f4986f 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -97,8 +97,11 @@ static int v2_read_file_info(struct super_block *sb, int type) unsigned int version; if (!v2_read_header(sb, type, &dqhead)) - return 0; + return -1; version = le32_to_cpu(dqhead.dqh_version); + if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) || + (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1)) + return -1; size = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); -- cgit v1.2.3 From 9329d1beaeed1a94f030c784dcec5ff973f402c4 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 18 Dec 2009 21:18:15 +0100 Subject: vfs: get_sb_single() - do not pass options twice Filesystem code usually destroys the option buffer while parsing it. This leads to errors when the same buffer is passed twice. In case we fill a new superblock do not call remount. This is needed to quite a warning that the debugfs code causes every boot. Cc: Miklos Szeredi Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 19eb70b374bc..aff046b0fe78 100644 --- a/fs/super.c +++ b/fs/super.c @@ -901,8 +901,9 @@ int get_sb_single(struct file_system_type *fs_type, return error; } s->s_flags |= MS_ACTIVE; + } else { + do_remount_sb(s, flags, data, 0); } - do_remount_sb(s, flags, data, 0); simple_set_mnt(mnt, s); return 0; } -- cgit v1.2.3 From 66ecb92be9eb579df93add22d19843e7869f168e Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Fri, 18 Dec 2009 15:34:20 +0200 Subject: Driver core: bin_attribute parameters can often be const* Many struct bin_attribute descriptors are purely read-only structures, and there's no need to change them. Therefore make the promise not to, which will let those descriptors be put in a ro section. Signed-off-by: Phil Carmody Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/bin.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 60c702bc10ae..a0a500af24a1 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -483,7 +483,8 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd) * @attr: attribute descriptor. */ -int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) +int sysfs_create_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) { BUG_ON(!kobj || !kobj->sd || !attr); @@ -497,7 +498,8 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) * @attr: attribute descriptor. */ -void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) +void sysfs_remove_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) { sysfs_hash_and_remove(kobj->sd, attr->attr.name); } -- cgit v1.2.3 From 86239d59e268a35a49a00fce9c8512a506267f5c Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Tue, 22 Dec 2009 09:11:58 +0800 Subject: Ocfs2: Let ocfs2 support fiemap for symlink and fast symlink. For fast symlink, it can be treated the same as inlined files since the data extent we want to return of both case all were stored in metadata block. For symlink, it can be simply treated the same as we did for regular files. Signed-off-by: Tristan Ye Acked-by: Sunil Mushran Signed-off-by: Joel Becker --- fs/ocfs2/extent_map.c | 23 +++++++++++++++++++---- fs/ocfs2/symlink.c | 2 ++ 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index cdce5f8c1cfa..d35a27f4523e 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -37,6 +37,7 @@ #include "extent_map.h" #include "inode.h" #include "super.h" +#include "symlink.h" #include "buffer_head_io.h" @@ -703,6 +704,12 @@ out: return ret; } +/* + * The ocfs2_fiemap_inline() may be a little bit misleading, since + * it not only handles the fiemap for inlined files, but also deals + * with the fast symlink, cause they have no difference for extent + * mapping per se. + */ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct fiemap_extent_info *fieinfo, u64 map_start) @@ -715,11 +722,18 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct ocfs2_inode_info *oi = OCFS2_I(inode); di = (struct ocfs2_dinode *)di_bh->b_data; - id_count = le16_to_cpu(di->id2.i_data.id_count); + if (ocfs2_inode_is_fast_symlink(inode)) + id_count = ocfs2_fast_symlink_chars(inode->i_sb); + else + id_count = le16_to_cpu(di->id2.i_data.id_count); if (map_start < id_count) { phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; - phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + if (ocfs2_inode_is_fast_symlink(inode)) + phys += offsetof(struct ocfs2_dinode, id2.i_symlink); + else + phys += offsetof(struct ocfs2_dinode, + id2.i_data.id_data); ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, flags); @@ -756,9 +770,10 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, down_read(&OCFS2_I(inode)->ip_alloc_sem); /* - * Handle inline-data separately. + * Handle inline-data and fast symlink separately. */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || + ocfs2_inode_is_fast_symlink(inode)) { ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); goto out_unlock; } diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index e3421030a69f..49b133ccbf11 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -163,6 +163,7 @@ const struct inode_operations ocfs2_symlink_inode_operations = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, }; const struct inode_operations ocfs2_fast_symlink_inode_operations = { .readlink = ocfs2_readlink, @@ -174,4 +175,5 @@ const struct inode_operations ocfs2_fast_symlink_inode_operations = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, }; -- cgit v1.2.3 From b31d308ddcfb73349582a0196b5f65dad3cf5d17 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 22 Dec 2009 10:32:15 +0800 Subject: ocfs2/trivial: Use proper mask for 2 places in hearbeat.c I just noticed today that there are 2 places of "mlog(0,...)" in fs/ocfs2/cluster/heartbeat.c, but actually have no default mask prefix in that file. So change them to mlog(ML_HEARTBEAT,...). Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/cluster/heartbeat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index c452d116b892..eda5b8bcddd5 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -176,7 +176,8 @@ static void o2hb_write_timeout(struct work_struct *work) static void o2hb_arm_write_timeout(struct o2hb_region *reg) { - mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); + mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", + O2HB_MAX_WRITE_TIMEOUT_MS); cancel_delayed_work(®->hr_write_timeout_work); reg->hr_last_timeout_start = jiffies; @@ -874,7 +875,8 @@ static int o2hb_thread(void *data) do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); - mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", + mlog(ML_HEARTBEAT, + "start = %lu.%lu, end = %lu.%lu, msec = %u\n", before_hb.tv_sec, (unsigned long) before_hb.tv_usec, after_hb.tv_sec, (unsigned long) after_hb.tv_usec, elapsed_msec); -- cgit v1.2.3 From 8ff6af881deccca4f88e03f2fdadb1aac42e9489 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 23 Dec 2009 14:31:15 +0800 Subject: ocfs2/trivial: Use le16_to_cpu for a disk value in xattr.c In ocfs2_value_metas_in_xattr_header, we should Use le16_to_cpu for ocfs2_extent_list.l_next_free_rec. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 923e950e5445..932c07731608 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -6062,7 +6062,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb, * to the extent block, so just calculate a maximum record num. */ if (!xv->xr_list.l_tree_depth) - *num_recs += xv->xr_list.l_next_free_rec; + *num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec); else *num_recs += ocfs2_clusters_for_bytes(sb, XATTR_SIZE_MAX); -- cgit v1.2.3 From 2f48912d1443a796f10c42c89cb386f0e3eca04d Mon Sep 17 00:00:00 2001 From: Daisuke HATAYAMA Date: Mon, 4 Jan 2010 15:42:14 +0900 Subject: binfmt_elf_fdpic: Fix build breakage introduced by coredump changes. Commit f6151dfea21496d43dbaba32cfcd9c9f404769bc introduces build breakage, so this patch fixes it together with some printk formatting cleanup. Signed-off-by: Daisuke HATAYAMA Signed-off-by: Paul Mundt --- fs/binfmt_elf_fdpic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c25256a5c5b0..7dc85997e96c 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1798,11 +1798,11 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) ELF_CORE_WRITE_EXTRA_DATA; #endif - if (file->f_pos != offset) { + if (cprm->file->f_pos != offset) { /* Sanity check */ printk(KERN_WARNING "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", - file->f_pos, offset); + cprm->file->f_pos, offset); } end_coredump: -- cgit v1.2.3