aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c33
-rw-r--r--fs/9p/vfs_dir.c14
-rw-r--r--fs/9p/vfs_inode.c6
-rw-r--r--fs/9p/vfs_inode_dotl.c4
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/inode.c2
-rw-r--r--fs/affs/amigaffs.c4
-rw-r--r--fs/affs/inode.c8
-rw-r--r--fs/affs/namei.c6
-rw-r--r--fs/afs/fsclient.c2
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/aio.c140
-rw-r--r--fs/attr.c5
-rw-r--r--fs/autofs4/inode.c2
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_elf.c11
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/bio-integrity.c1
-rw-r--r--fs/bio.c8
-rw-r--r--fs/block_dev.c15
-rw-r--r--fs/btrfs/delayed-inode.c2
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent-tree.c75
-rw-r--r--fs/btrfs/file.c4
-rw-r--r--fs/btrfs/free-space-cache.c112
-rw-r--r--fs/btrfs/inode.c4
-rw-r--r--fs/btrfs/super.c38
-rw-r--r--fs/btrfs/transaction.c9
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/btrfs/volumes.c6
-rw-r--r--fs/btrfs/xattr.c50
-rw-r--r--fs/buffer.c11
-rw-r--r--fs/ceph/addr.c201
-rw-r--r--fs/ceph/caps.c199
-rw-r--r--fs/ceph/dir.c88
-rw-r--r--fs/ceph/file.c23
-rw-r--r--fs/ceph/inode.c118
-rw-r--r--fs/ceph/ioctl.c38
-rw-r--r--fs/ceph/ioctl.h55
-rw-r--r--fs/ceph/mds_client.c54
-rw-r--r--fs/ceph/mds_client.h2
-rw-r--r--fs/ceph/snap.c16
-rw-r--r--fs/ceph/super.c71
-rw-r--r--fs/ceph/super.h71
-rw-r--r--fs/ceph/xattr.c42
-rw-r--r--fs/cifs/README14
-rw-r--r--fs/cifs/cifs_debug.c9
-rw-r--r--fs/cifs/cifs_fs_sb.h4
-rw-r--r--fs/cifs/cifsacl.c347
-rw-r--r--fs/cifs/cifsencrypt.c113
-rw-r--r--fs/cifs/cifsfs.c26
-rw-r--r--fs/cifs/cifsfs.h6
-rw-r--r--fs/cifs/cifsglob.h55
-rw-r--r--fs/cifs/cifspdu.h48
-rw-r--r--fs/cifs/cifsproto.h56
-rw-r--r--fs/cifs/cifssmb.c454
-rw-r--r--fs/cifs/connect.c708
-rw-r--r--fs/cifs/dir.c22
-rw-r--r--fs/cifs/export.c4
-rw-r--r--fs/cifs/file.c1158
-rw-r--r--fs/cifs/inode.c60
-rw-r--r--fs/cifs/link.c19
-rw-r--r--fs/cifs/misc.c66
-rw-r--r--fs/cifs/readdir.c10
-rw-r--r--fs/cifs/sess.c6
-rw-r--r--fs/cifs/smbencrypt.c186
-rw-r--r--fs/cifs/transport.c19
-rw-r--r--fs/cifs/xattr.c42
-rw-r--r--fs/coda/coda_linux.c2
-rw-r--r--fs/coda/coda_linux.h5
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/compat.c15
-rw-r--r--fs/configfs/inode.c5
-rw-r--r--fs/configfs/item.c2
-rw-r--r--fs/configfs/mount.c36
-rw-r--r--fs/dcache.c128
-rw-r--r--fs/debugfs/inode.c2
-rw-r--r--fs/devpts/inode.c4
-rw-r--r--fs/direct-io.c646
-rw-r--r--fs/ecryptfs/crypto.c26
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h7
-rw-r--r--fs/ecryptfs/file.c23
-rw-r--r--fs/ecryptfs/inode.c64
-rw-r--r--fs/efs/inode.c2
-rw-r--r--fs/eventpoll.c27
-rw-r--r--fs/exec.c4
-rw-r--r--fs/exofs/Kbuild3
-rw-r--r--fs/exofs/Kconfig9
-rw-r--r--fs/exofs/exofs.h26
-rw-r--r--fs/exofs/inode.c235
-rw-r--r--fs/exofs/ore.c657
-rw-r--r--fs/exofs/ore_raid.c660
-rw-r--r--fs/exofs/ore_raid.h79
-rw-r--r--fs/exofs/super.c206
-rw-r--r--fs/ext2/balloc.c2
-rw-r--r--fs/ext2/ext2.h8
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/super.c8
-rw-r--r--fs/ext2/xattr_security.c34
-rw-r--r--fs/ext3/balloc.c17
-rw-r--r--fs/ext3/fsync.c10
-rw-r--r--fs/ext3/ialloc.c47
-rw-r--r--fs/ext3/inode.c2
-rw-r--r--fs/ext3/ioctl.c24
-rw-r--r--fs/ext3/namei.c6
-rw-r--r--fs/ext3/super.c12
-rw-r--r--fs/ext3/xattr_security.c36
-rw-r--r--fs/ext4/balloc.c345
-rw-r--r--fs/ext4/ext4.h185
-rw-r--r--fs/ext4/ext4_extents.h2
-rw-r--r--fs/ext4/ext4_jbd2.c8
-rw-r--r--fs/ext4/extents.c1167
-rw-r--r--fs/ext4/file.c51
-rw-r--r--fs/ext4/fsync.c10
-rw-r--r--fs/ext4/ialloc.c206
-rw-r--r--fs/ext4/indirect.c20
-rw-r--r--fs/ext4/inode.c499
-rw-r--r--fs/ext4/ioctl.c65
-rw-r--r--fs/ext4/mballoc.c331
-rw-r--r--fs/ext4/mballoc.h11
-rw-r--r--fs/ext4/migrate.c111
-rw-r--r--fs/ext4/mmp.c10
-rw-r--r--fs/ext4/move_extent.c1
-rw-r--r--fs/ext4/namei.c29
-rw-r--r--fs/ext4/page-io.c78
-rw-r--r--fs/ext4/resize.c10
-rw-r--r--fs/ext4/super.c286
-rw-r--r--fs/ext4/xattr.c12
-rw-r--r--fs/ext4/xattr_security.c36
-rw-r--r--fs/fat/dir.c4
-rw-r--r--fs/fat/fat.h9
-rw-r--r--fs/fat/inode.c4
-rw-r--r--fs/fat/namei_msdos.c2
-rw-r--r--fs/fat/namei_vfat.c2
-rw-r--r--fs/freevxfs/vxfs_inode.c2
-rw-r--r--fs/fs-writeback.c78
-rw-r--r--fs/fuse/control.c2
-rw-r--r--fs/fuse/cuse.c1
-rw-r--r--fs/fuse/dev.c3
-rw-r--r--fs/fuse/file.c6
-rw-r--r--fs/fuse/inode.c26
-rw-r--r--fs/gfs2/acl.c5
-rw-r--r--fs/gfs2/aops.c8
-rw-r--r--fs/gfs2/bmap.c199
-rw-r--r--fs/gfs2/dir.c50
-rw-r--r--fs/gfs2/file.c299
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c91
-rw-r--r--fs/gfs2/glops.h2
-rw-r--r--fs/gfs2/incore.h23
-rw-r--r--fs/gfs2/inode.c150
-rw-r--r--fs/gfs2/inode.h2
-rw-r--r--fs/gfs2/lops.c66
-rw-r--r--fs/gfs2/ops_fstype.c7
-rw-r--r--fs/gfs2/quota.c28
-rw-r--r--fs/gfs2/rgrp.c573
-rw-r--r--fs/gfs2/rgrp.h31
-rw-r--r--fs/gfs2/super.c134
-rw-r--r--fs/gfs2/trans.c5
-rw-r--r--fs/gfs2/trans.h22
-rw-r--r--fs/gfs2/xattr.c28
-rw-r--r--fs/hfs/btree.c20
-rw-r--r--fs/hfs/dir.c4
-rw-r--r--fs/hfs/inode.c4
-rw-r--r--fs/hfs/trans.c2
-rw-r--r--fs/hfsplus/dir.c4
-rw-r--r--fs/hfsplus/inode.c10
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hostfs/hostfs_user.c1
-rw-r--r--fs/hpfs/dir.c2
-rw-r--r--fs/hpfs/hpfs_fn.h4
-rw-r--r--fs/hpfs/inode.c10
-rw-r--r--fs/hpfs/namei.c8
-rw-r--r--fs/hppfs/hppfs.c2
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/inode.c4
-rw-r--r--fs/ioprio.c1
-rw-r--r--fs/isofs/inode.c14
-rw-r--r--fs/isofs/rock.c4
-rw-r--r--fs/jbd/journal.c8
-rw-r--r--fs/jbd2/commit.c26
-rw-r--r--fs/jbd2/journal.c44
-rw-r--r--fs/jbd2/recovery.c28
-rw-r--r--fs/jbd2/transaction.c68
-rw-r--r--fs/jffs2/compr.c128
-rw-r--r--fs/jffs2/compr.h2
-rw-r--r--fs/jffs2/dir.c6
-rw-r--r--fs/jffs2/fs.c8
-rw-r--r--fs/jffs2/jffs2_fs_sb.h6
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jffs2/scan.c4
-rw-r--r--fs/jffs2/security.c35
-rw-r--r--fs/jffs2/super.c119
-rw-r--r--fs/jffs2/wbuf.c9
-rw-r--r--fs/jfs/jfs_imap.c6
-rw-r--r--fs/jfs/jfs_inode.c2
-rw-r--r--fs/jfs/jfs_logmgr.c1
-rw-r--r--fs/jfs/namei.c12
-rw-r--r--fs/jfs/super.c1
-rw-r--r--fs/jfs/xattr.c57
-rw-r--r--fs/libfs.c6
-rw-r--r--fs/lockd/host.c25
-rw-r--r--fs/lockd/svc.c2
-rw-r--r--fs/locks.c236
-rw-r--r--fs/logfs/dir.c8
-rw-r--r--fs/logfs/inode.c3
-rw-r--r--fs/logfs/logfs.h1
-rw-r--r--fs/logfs/readwrite.c2
-rw-r--r--fs/logfs/super.c23
-rw-r--r--fs/minix/bitmap.c55
-rw-r--r--fs/minix/inode.c31
-rw-r--r--fs/minix/minix.h11
-rw-r--r--fs/namei.c51
-rw-r--r--fs/namespace.c53
-rw-r--r--fs/ncpfs/inode.c10
-rw-r--r--fs/nfs/blocklayout/blocklayout.c58
-rw-r--r--fs/nfs/blocklayout/blocklayout.h4
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c35
-rw-r--r--fs/nfs/callback.c4
-rw-r--r--fs/nfs/callback_xdr.c12
-rw-r--r--fs/nfs/client.c11
-rw-r--r--fs/nfs/delegation.c2
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/nfs/file.c112
-rw-r--r--fs/nfs/fscache-index.c4
-rw-r--r--fs/nfs/idmap.c25
-rw-r--r--fs/nfs/inode.c24
-rw-r--r--fs/nfs/internal.h12
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs4_fs.h24
-rw-r--r--fs/nfs/nfs4filelayout.c41
-rw-r--r--fs/nfs/nfs4proc.c123
-rw-r--r--fs/nfs/nfs4state.c33
-rw-r--r--fs/nfs/nfs4xdr.c2
-rw-r--r--fs/nfs/objlayout/objio_osd.c872
-rw-r--r--fs/nfs/objlayout/objlayout.c209
-rw-r--r--fs/nfs/objlayout/objlayout.h48
-rw-r--r--fs/nfs/pagelist.c3
-rw-r--r--fs/nfs/pnfs.c96
-rw-r--r--fs/nfs/pnfs.h5
-rw-r--r--fs/nfs/pnfs_dev.c1
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/read.c30
-rw-r--r--fs/nfs/super.c54
-rw-r--r--fs/nfs/unlink.c4
-rw-r--r--fs/nfs/write.c77
-rw-r--r--fs/nfsd/export.c16
-rw-r--r--fs/nfsd/nfs4acl.c1
-rw-r--r--fs/nfsd/nfs4callback.c20
-rw-r--r--fs/nfsd/nfs4proc.c374
-rw-r--r--fs/nfsd/nfs4recover.c53
-rw-r--r--fs/nfsd/nfs4state.c1794
-rw-r--r--fs/nfsd/nfs4xdr.c380
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nfsd/nfsd.h33
-rw-r--r--fs/nfsd/nfsfh.c39
-rw-r--r--fs/nfsd/nfssvc.c3
-rw-r--r--fs/nfsd/state.h174
-rw-r--r--fs/nfsd/vfs.c31
-rw-r--r--fs/nfsd/vfs.h29
-rw-r--r--fs/nfsd/xdr4.h28
-rw-r--r--fs/nilfs2/inode.c4
-rw-r--r--fs/nilfs2/ioctl.c16
-rw-r--r--fs/nilfs2/namei.c2
-rw-r--r--fs/nilfs2/nilfs.h8
-rw-r--r--fs/ntfs/debug.h15
-rw-r--r--fs/ntfs/inode.c8
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c69
-rw-r--r--fs/ocfs2/aops.h14
-rw-r--r--fs/ocfs2/cluster/heartbeat.c194
-rw-r--r--fs/ocfs2/cluster/netdebug.c102
-rw-r--r--fs/ocfs2/cluster/tcp.c139
-rw-r--r--fs/ocfs2/cluster/tcp.h2
-rw-r--r--fs/ocfs2/dir.c7
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h56
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c1
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c44
-rw-r--r--fs/ocfs2/dlm/dlmlock.c54
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c175
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c164
-rw-r--r--fs/ocfs2/dlm/dlmthread.c16
-rw-r--r--fs/ocfs2/dlmglue.c23
-rw-r--r--fs/ocfs2/extent_map.c96
-rw-r--r--fs/ocfs2/extent_map.h2
-rw-r--r--fs/ocfs2/file.c96
-rw-r--r--fs/ocfs2/inode.c6
-rw-r--r--fs/ocfs2/inode.h3
-rw-r--r--fs/ocfs2/ioctl.c11
-rw-r--r--fs/ocfs2/journal.c23
-rw-r--r--fs/ocfs2/journal.h5
-rw-r--r--fs/ocfs2/mmap.c53
-rw-r--r--fs/ocfs2/move_extents.c2
-rw-r--r--fs/ocfs2/namei.c18
-rw-r--r--fs/ocfs2/ocfs2.h51
-rw-r--r--fs/ocfs2/quota_local.c23
-rw-r--r--fs/ocfs2/slot_map.c4
-rw-r--r--fs/ocfs2/stack_o2cb.c71
-rw-r--r--fs/ocfs2/super.c25
-rw-r--r--fs/ocfs2/super.h14
-rw-r--r--fs/ocfs2/xattr.c48
-rw-r--r--fs/open.c4
-rw-r--r--fs/openpromfs/inode.c4
-rw-r--r--fs/partitions/ldm.c16
-rw-r--r--fs/pipe.c1
-rw-r--r--fs/posix_acl.c2
-rw-r--r--fs/proc/base.c25
-rw-r--r--fs/proc/generic.c2
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/meminfo.c7
-rw-r--r--fs/proc/proc_sysctl.c48
-rw-r--r--fs/proc/root.c8
-rw-r--r--fs/proc/stat.c41
-rw-r--r--fs/proc/task_mmu.c5
-rw-r--r--fs/proc/vmcore.c1
-rw-r--r--fs/pstore/inode.c40
-rw-r--r--fs/pstore/internal.h2
-rw-r--r--fs/pstore/platform.c98
-rw-r--r--fs/qnx4/inode.c2
-rw-r--r--fs/quota/quota.c9
-rw-r--r--fs/ramfs/inode.c10
-rw-r--r--fs/read_write.c82
-rw-r--r--fs/reiserfs/inode.c10
-rw-r--r--fs/reiserfs/journal.c9
-rw-r--r--fs/reiserfs/namei.c16
-rw-r--r--fs/reiserfs/resize.c4
-rw-r--r--fs/reiserfs/xattr_security.c4
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/seq_file.c6
-rw-r--r--fs/squashfs/Kconfig28
-rw-r--r--fs/squashfs/inode.c18
-rw-r--r--fs/squashfs/squashfs_fs.h7
-rw-r--r--fs/squashfs/super.c2
-rw-r--r--fs/stack.c2
-rw-r--r--fs/stat.c5
-rw-r--r--fs/statfs.c2
-rw-r--r--fs/super.c11
-rw-r--r--fs/sync.c4
-rw-r--r--fs/sysfs/dir.c196
-rw-r--r--fs/sysfs/file.c56
-rw-r--r--fs/sysfs/inode.c16
-rw-r--r--fs/sysfs/sysfs.h17
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/ubifs/budget.c2
-rw-r--r--fs/ubifs/debug.c16
-rw-r--r--fs/ubifs/debug.h5
-rw-r--r--fs/ubifs/recovery.c2
-rw-r--r--fs/ubifs/sb.c2
-rw-r--r--fs/ubifs/super.c20
-rw-r--r--fs/ubifs/xattr.c4
-rw-r--r--fs/udf/balloc.c14
-rw-r--r--fs/udf/directory.c8
-rw-r--r--fs/udf/inode.c56
-rw-r--r--fs/udf/lowlevel.c2
-rw-r--r--fs/udf/misc.c19
-rw-r--r--fs/udf/namei.c20
-rw-r--r--fs/udf/partition.c19
-rw-r--r--fs/udf/super.c280
-rw-r--r--fs/udf/truncate.c22
-rw-r--r--fs/udf/udf_sb.h5
-rw-r--r--fs/udf/udfdecl.h35
-rw-r--r--fs/udf/udftime.c3
-rw-r--r--fs/udf/unicode.c6
-rw-r--r--fs/ufs/ialloc.c2
-rw-r--r--fs/ufs/inode.c4
-rw-r--r--fs/ufs/ufs.h9
-rw-r--r--fs/xattr.c63
-rw-r--r--fs/xfs/kmem.h7
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_alloc.c4
-rw-r--r--fs/xfs/xfs_aops.c127
-rw-r--r--fs/xfs/xfs_aops.h4
-rw-r--r--fs/xfs/xfs_attr.c89
-rw-r--r--fs/xfs/xfs_attr_leaf.c71
-rw-r--r--fs/xfs/xfs_bmap.c2543
-rw-r--r--fs/xfs/xfs_bmap.h318
-rw-r--r--fs/xfs/xfs_btree.c11
-rw-r--r--fs/xfs/xfs_buf.c244
-rw-r--r--fs/xfs/xfs_buf.h51
-rw-r--r--fs/xfs/xfs_buf_item.c14
-rw-r--r--fs/xfs/xfs_da_btree.c54
-rw-r--r--fs/xfs/xfs_dfrag.c6
-rw-r--r--fs/xfs/xfs_dir2_leaf.c6
-rw-r--r--fs/xfs/xfs_discard.c20
-rw-r--r--fs/xfs/xfs_dquot.c32
-rw-r--r--fs/xfs/xfs_dquot_item.c6
-rw-r--r--fs/xfs/xfs_export.c20
-rw-r--r--fs/xfs/xfs_extfree_item.c4
-rw-r--r--fs/xfs/xfs_file.c168
-rw-r--r--fs/xfs/xfs_filestream.c4
-rw-r--r--fs/xfs/xfs_fsops.c60
-rw-r--r--fs/xfs/xfs_ialloc.c15
-rw-r--r--fs/xfs/xfs_iget.c2
-rw-r--r--fs/xfs/xfs_inode.c64
-rw-r--r--fs/xfs/xfs_inode.h2
-rw-r--r--fs/xfs/xfs_inode_item.c6
-rw-r--r--fs/xfs/xfs_ioctl.c2
-rw-r--r--fs/xfs/xfs_iomap.c39
-rw-r--r--fs/xfs/xfs_iops.c55
-rw-r--r--fs/xfs/xfs_log.c370
-rw-r--r--fs/xfs/xfs_log.h2
-rw-r--r--fs/xfs/xfs_log_recover.c43
-rw-r--r--fs/xfs/xfs_message.h42
-rw-r--r--fs/xfs/xfs_mount.c36
-rw-r--r--fs/xfs/xfs_qm.c15
-rw-r--r--fs/xfs/xfs_qm_syscalls.c2
-rw-r--r--fs/xfs/xfs_rename.c8
-rw-r--r--fs/xfs/xfs_rtalloc.c48
-rw-r--r--fs/xfs/xfs_rw.c23
-rw-r--r--fs/xfs/xfs_rw.h2
-rw-r--r--fs/xfs/xfs_super.c41
-rw-r--r--fs/xfs/xfs_sync.c61
-rw-r--r--fs/xfs/xfs_sync.h2
-rw-r--r--fs/xfs/xfs_trace.h51
-rw-r--r--fs/xfs/xfs_trans.c13
-rw-r--r--fs/xfs/xfs_trans.h14
-rw-r--r--fs/xfs/xfs_trans_ail.c43
-rw-r--r--fs/xfs/xfs_trans_buf.c24
-rw-r--r--fs/xfs/xfs_trans_inode.c25
-rw-r--r--fs/xfs/xfs_trans_priv.h1
-rw-r--r--fs/xfs/xfs_vnodeops.c123
425 files changed, 15894 insertions, 11772 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index ef966188611..2b78014a124 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -132,21 +132,19 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
options = tmp_options;
while ((p = strsep(&options, ",")) != NULL) {
- int token;
+ int token, r;
if (!*p)
continue;
token = match_token(p, tokens, args);
- if (token < Opt_uname) {
- int r = match_int(&args[0], &option);
+ switch (token) {
+ case Opt_debug:
+ r = match_int(&args[0], &option);
if (r < 0) {
P9_DPRINTK(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
+ "integer field, but no integer?\n");
ret = r;
continue;
}
- }
- switch (token) {
- case Opt_debug:
v9ses->debug = option;
#ifdef CONFIG_NET_9P_DEBUG
p9_debug_level = option;
@@ -154,12 +152,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
break;
case Opt_dfltuid:
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
v9ses->dfltuid = option;
break;
case Opt_dfltgid:
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
v9ses->dfltgid = option;
break;
case Opt_afid:
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
v9ses->afid = option;
break;
case Opt_uname:
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 9c2bdda5cd9..598fff1a54e 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -165,9 +165,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
while (rdir->head < rdir->tail) {
p9stat_init(&st);
- err = p9stat_read(rdir->buf + rdir->head,
- rdir->tail - rdir->head, &st,
- fid->clnt->proto_version);
+ err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
+ rdir->tail - rdir->head, &st);
if (err) {
P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
err = -EIO;
@@ -231,7 +230,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
while (err == 0) {
if (rdir->tail == rdir->head) {
err = p9_client_readdir(fid, rdir->buf, buflen,
- filp->f_pos);
+ filp->f_pos);
if (err <= 0)
goto unlock_and_exit;
@@ -241,10 +240,9 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
while (rdir->head < rdir->tail) {
- err = p9dirent_read(rdir->buf + rdir->head,
- rdir->tail - rdir->head,
- &curdirent,
- fid->clnt->proto_version);
+ err = p9dirent_read(fid->clnt, rdir->buf + rdir->head,
+ rdir->tail - rdir->head,
+ &curdirent);
if (err < 0) {
P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
err = -EIO;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index e3c03db3c78..879ed885173 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -278,10 +278,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
case S_IFSOCK:
if (v9fs_proto_dotl(v9ses)) {
inode->i_op = &v9fs_file_inode_operations_dotl;
- inode->i_fop = &v9fs_file_operations_dotl;
} else if (v9fs_proto_dotu(v9ses)) {
inode->i_op = &v9fs_file_inode_operations;
- inode->i_fop = &v9fs_file_operations;
} else {
P9_DPRINTK(P9_DEBUG_ERROR,
"special files without extended mode\n");
@@ -1140,7 +1138,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
struct v9fs_session_info *v9ses = sb->s_fs_info;
struct v9fs_inode *v9inode = V9FS_I(inode);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_atime.tv_sec = stat->atime;
inode->i_mtime.tv_sec = stat->mtime;
@@ -1166,7 +1164,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
/* HARDLINKCOUNT %u */
sscanf(ext, "%13s %u", tag_name, &i_nlink);
if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
- inode->i_nlink = i_nlink;
+ set_nlink(inode, i_nlink);
}
}
mode = stat->mode & S_IALLUGO;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index aded79fcd5c..0b5745e2194 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -606,7 +606,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
inode->i_uid = stat->st_uid;
inode->i_gid = stat->st_gid;
- inode->i_nlink = stat->st_nlink;
+ set_nlink(inode, stat->st_nlink);
mode = stat->st_mode & S_IALLUGO;
mode |= inode->i_mode & ~S_IALLUGO;
@@ -632,7 +632,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
if (stat->st_result_mask & P9_STATS_GID)
inode->i_gid = stat->st_gid;
if (stat->st_result_mask & P9_STATS_NLINK)
- inode->i_nlink = stat->st_nlink;
+ set_nlink(inode, stat->st_nlink);
if (stat->st_result_mask & P9_STATS_MODE) {
inode->i_mode = stat->st_mode;
if ((S_ISBLK(inode->i_mode)) ||
diff --git a/fs/Kconfig b/fs/Kconfig
index 9fe0b349f4c..5f4c45d4aa1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -109,7 +109,7 @@ source "fs/proc/Kconfig"
source "fs/sysfs/Kconfig"
config TMPFS
- bool "Virtual memory file system support (former shm fs)"
+ bool "Tmpfs virtual memory file system support (former shm fs)"
depends on SHMEM
help
Tmpfs is a file system which keeps all files in virtual memory.
diff --git a/fs/Makefile b/fs/Makefile
index afc109691a9..d2c3353d547 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
-obj-$(CONFIG_EXOFS_FS) += exofs/
+obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index d5250c5aae2..1dab6a174d6 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -247,7 +247,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
inode->i_gid = ADFS_SB(sb)->s_gid;
inode->i_ino = obj->file_id;
inode->i_size = obj->size;
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >>
sb->s_blocksize_bits;
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 3a4557e8325..de37ec84234 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -215,7 +215,7 @@ affs_remove_link(struct dentry *dentry)
break;
default:
if (!AFFS_TAIL(sb, bh)->link_chain)
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
affs_free_block(sb, link_ino);
goto done;
@@ -316,7 +316,7 @@ affs_remove_header(struct dentry *dentry)
if (inode->i_nlink > 1)
retval = affs_remove_link(dentry);
else
- inode->i_nlink = 0;
+ clear_nlink(inode);
affs_unlock_link(inode);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 5d828903ac6..88a4b0b5005 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -54,7 +54,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
prot = be32_to_cpu(tail->protect);
inode->i_size = 0;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_mode = 0;
AFFS_I(inode)->i_extcnt = 1;
AFFS_I(inode)->i_ext_last = ~1;
@@ -137,7 +137,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
sbi->s_hashsize + 1;
}
if (tail->link_chain)
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops;
inode->i_op = &affs_file_inode_operations;
inode->i_fop = &affs_file_operations;
@@ -304,7 +304,7 @@ affs_new_inode(struct inode *dir)
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_ino = block;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
atomic_set(&AFFS_I(inode)->i_opencnt, 0);
AFFS_I(inode)->i_blkcnt = 0;
@@ -387,7 +387,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block);
affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
mark_buffer_dirty_inode(inode_bh, inode);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
ihold(inode);
}
affs_fix_checksum(sb, bh);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index e3e9efc1fdd..780a11dc631 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -277,7 +277,7 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata
inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops;
error = affs_add_entry(dir, inode, dentry, ST_FILE);
if (error) {
- inode->i_nlink = 0;
+ clear_nlink(inode);
iput(inode);
return error;
}
@@ -305,7 +305,7 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
error = affs_add_entry(dir, inode, dentry, ST_USERDIR);
if (error) {
- inode->i_nlink = 0;
+ clear_nlink(inode);
mark_inode_dirty(inode);
iput(inode);
return error;
@@ -392,7 +392,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
return 0;
err:
- inode->i_nlink = 0;
+ clear_nlink(inode);
mark_inode_dirty(inode);
iput(inode);
return error;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 346e3289abd..2f213d109c2 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -90,7 +90,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
vnode->vfs_inode.i_uid = status->owner;
vnode->vfs_inode.i_gid = status->group;
vnode->vfs_inode.i_generation = vnode->fid.unique;
- vnode->vfs_inode.i_nlink = status->nlink;
+ set_nlink(&vnode->vfs_inode, status->nlink);
mode = vnode->vfs_inode.i_mode;
mode &= ~S_IALLUGO;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0fdab6e03d8..d890ae3b2ce 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -67,7 +67,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
fscache_attr_changed(vnode->cache);
#endif
- inode->i_nlink = vnode->status.nlink;
+ set_nlink(inode, vnode->status.nlink);
inode->i_uid = vnode->status.owner;
inode->i_gid = 0;
inode->i_size = vnode->status.size;
@@ -174,7 +174,7 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
inode->i_size = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
inode->i_op = &afs_autocell_inode_operations;
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_uid = 0;
inode->i_gid = 0;
inode->i_ctime.tv_sec = get_seconds();
diff --git a/fs/aio.c b/fs/aio.c
index e29ec485af2..78c514cfd21 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -440,8 +440,6 @@ void exit_aio(struct mm_struct *mm)
static struct kiocb *__aio_get_req(struct kioctx *ctx)
{
struct kiocb *req = NULL;
- struct aio_ring *ring;
- int okay = 0;
req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
if (unlikely(!req))
@@ -459,39 +457,114 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx)
INIT_LIST_HEAD(&req->ki_run_list);
req->ki_eventfd = NULL;
- /* Check if the completion queue has enough free space to
- * accept an event from this io.
- */
+ return req;
+}
+
+/*
+ * struct kiocb's are allocated in batches to reduce the number of
+ * times the ctx lock is acquired and released.
+ */
+#define KIOCB_BATCH_SIZE 32L
+struct kiocb_batch {
+ struct list_head head;
+ long count; /* number of requests left to allocate */
+};
+
+static void kiocb_batch_init(struct kiocb_batch *batch, long total)
+{
+ INIT_LIST_HEAD(&batch->head);
+ batch->count = total;
+}
+
+static void kiocb_batch_free(struct kiocb_batch *batch)
+{
+ struct kiocb *req, *n;
+
+ list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
+ list_del(&req->ki_batch);
+ kmem_cache_free(kiocb_cachep, req);
+ }
+}
+
+/*
+ * Allocate a batch of kiocbs. This avoids taking and dropping the
+ * context lock a lot during setup.
+ */
+static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
+{
+ unsigned short allocated, to_alloc;
+ long avail;
+ bool called_fput = false;
+ struct kiocb *req, *n;
+ struct aio_ring *ring;
+
+ to_alloc = min(batch->count, KIOCB_BATCH_SIZE);
+ for (allocated = 0; allocated < to_alloc; allocated++) {
+ req = __aio_get_req(ctx);
+ if (!req)
+ /* allocation failed, go with what we've got */
+ break;
+ list_add(&req->ki_batch, &batch->head);
+ }
+
+ if (allocated == 0)
+ goto out;
+
+retry:
spin_lock_irq(&ctx->ctx_lock);
- ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0);
- if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) {
+ ring = kmap_atomic(ctx->ring_info.ring_pages[0]);
+
+ avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active;
+ BUG_ON(avail < 0);
+ if (avail == 0 && !called_fput) {
+ /*
+ * Handle a potential starvation case. It is possible that
+ * we hold the last reference on a struct file, causing us
+ * to delay the final fput to non-irq context. In this case,
+ * ctx->reqs_active is artificially high. Calling the fput
+ * routine here may free up a slot in the event completion
+ * ring, allowing this allocation to succeed.
+ */
+ kunmap_atomic(ring);
+ spin_unlock_irq(&ctx->ctx_lock);
+ aio_fput_routine(NULL);
+ called_fput = true;
+ goto retry;
+ }
+
+ if (avail < allocated) {
+ /* Trim back the number of requests. */
+ list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
+ list_del(&req->ki_batch);
+ kmem_cache_free(kiocb_cachep, req);
+ if (--allocated <= avail)
+ break;
+ }
+ }
+
+ batch->count -= allocated;
+ list_for_each_entry(req, &batch->head, ki_batch) {
list_add(&req->ki_list, &ctx->active_reqs);
ctx->reqs_active++;
- okay = 1;
}
- kunmap_atomic(ring, KM_USER0);
- spin_unlock_irq(&ctx->ctx_lock);
- if (!okay) {
- kmem_cache_free(kiocb_cachep, req);
- req = NULL;
- }
+ kunmap_atomic(ring);
+ spin_unlock_irq(&ctx->ctx_lock);
- return req;
+out:
+ return allocated;
}
-static inline struct kiocb *aio_get_req(struct kioctx *ctx)
+static inline struct kiocb *aio_get_req(struct kioctx *ctx,
+ struct kiocb_batch *batch)
{
struct kiocb *req;
- /* Handle a potential starvation case -- should be exceedingly rare as
- * requests will be stuck on fput_head only if the aio_fput_routine is
- * delayed and the requests were the last user of the struct file.
- */
- req = __aio_get_req(ctx);
- if (unlikely(NULL == req)) {
- aio_fput_routine(NULL);
- req = __aio_get_req(ctx);
- }
+
+ if (list_empty(&batch->head))
+ if (kiocb_batch_refill(ctx, batch) == 0)
+ return NULL;
+ req = list_first_entry(&batch->head, struct kiocb, ki_batch);
+ list_del(&req->ki_batch);
return req;
}
@@ -1387,13 +1460,13 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
ret = compat_rw_copy_check_uvector(type,
(struct compat_iovec __user *)kiocb->ki_buf,
kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
- &kiocb->ki_iovec);
+ &kiocb->ki_iovec, 1);
else
#endif
ret = rw_copy_check_uvector(type,
(struct iovec __user *)kiocb->ki_buf,
kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
- &kiocb->ki_iovec);
+ &kiocb->ki_iovec, 1);
if (ret < 0)
goto out;
@@ -1515,7 +1588,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
}
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- struct iocb *iocb, bool compat)
+ struct iocb *iocb, struct kiocb_batch *batch,
+ bool compat)
{
struct kiocb *req;
struct file *file;
@@ -1541,7 +1615,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
if (unlikely(!file))
return -EBADF;
- req = aio_get_req(ctx); /* returns with 2 references to req */
+ req = aio_get_req(ctx, batch); /* returns with 2 references to req */
if (unlikely(!req)) {
fput(file);
return -EAGAIN;
@@ -1621,8 +1695,9 @@ long do_io_submit(aio_context_t ctx_id, long nr,
{
struct kioctx *ctx;
long ret = 0;
- int i;
+ int i = 0;
struct blk_plug plug;
+ struct kiocb_batch batch;
if (unlikely(nr < 0))
return -EINVAL;
@@ -1639,6 +1714,8 @@ long do_io_submit(aio_context_t ctx_id, long nr,
return -EINVAL;
}
+ kiocb_batch_init(&batch, nr);
+
blk_start_plug(&plug);
/*
@@ -1659,12 +1736,13 @@ long do_io_submit(aio_context_t ctx_id, long nr,
break;
}
- ret = io_submit_one(ctx, user_iocb, &tmp, compat);
+ ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat);
if (ret)
break;
}
blk_finish_plug(&plug);
+ kiocb_batch_free(&batch);
put_ioctx(ctx);
return i ? i : ret;
}
diff --git a/fs/attr.c b/fs/attr.c
index 538e27959d3..7ee7ba48831 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -13,6 +13,7 @@
#include <linux/fsnotify.h>
#include <linux/fcntl.h>
#include <linux/security.h>
+#include <linux/evm.h>
/**
* inode_change_ok - check if attribute changes to an inode are allowed
@@ -237,8 +238,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
else
error = simple_setattr(dentry, attr);
- if (!error)
+ if (!error) {
fsnotify_change(dentry, ia_valid);
+ evm_inode_post_setattr(dentry, ia_valid);
+ }
return error;
}
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 180fa2425e4..8179f1ab817 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -342,7 +342,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
inode->i_ino = get_next_ino();
if (S_ISDIR(mode)) {
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_op = &autofs4_dir_inode_operations;
inode->i_fop = &autofs4_dir_operations;
} else if (S_ISLNK(mode)) {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 720d885e8dc..8342ca67abc 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -357,7 +357,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
inode->i_gid = befs_sb->mount_opts.use_gid ?
befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
/*
* BEFS's time is 64 bits, but current VFS is 32 bits...
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index b14cebfd904..9cc07401947 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -199,7 +199,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
printf("unlinking non-existent file %s:%lu (nlink=%d)\n",
inode->i_sb->s_id, inode->i_ino,
inode->i_nlink);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
de->ino = 0;
mark_buffer_dirty_inode(bh, dir);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index a8e37f81d09..697af5bf70b 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -78,7 +78,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino);
inode->i_uid = le32_to_cpu(di->i_uid);
inode->i_gid = le32_to_cpu(di->i_gid);
- inode->i_nlink = le32_to_cpu(di->i_nlink);
+ set_nlink(inode, le32_to_cpu(di->i_nlink));
inode->i_size = BFS_FILESIZE(di);
inode->i_blocks = BFS_FILEBLOCKS(di);
inode->i_atime.tv_sec = le32_to_cpu(di->i_atime);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index dd0fdfc56d3..21ac5ee4b43 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -795,7 +795,16 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
* might try to exec. This is because the brk will
* follow the loader, and is not movable. */
#if defined(CONFIG_X86) || defined(CONFIG_ARM)
- load_bias = 0;
+ /* Memory randomization might have been switched off
+ * in runtime via sysctl.
+ * If that is the case, retain the original non-zero
+ * load_bias value in order to establish proper
+ * non-randomized mappings.
+ */
+ if (current->flags & PF_RANDOMIZE)
+ load_bias = 0;
+ else
+ load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
#else
load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
#endif
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index ba1a1ae4a18..1e9edbdeda7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -521,7 +521,7 @@ static void kill_node(Node *e)
write_unlock(&entries_lock);
if (dentry) {
- dentry->d_inode->i_nlink--;
+ drop_nlink(dentry->d_inode);
d_drop(dentry);
dput(dentry);
simple_release_fs(&bm_mnt, &entry_count);
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 9c5e6b2cd11..c2183f3917c 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -22,6 +22,7 @@
#include <linux/blkdev.h>
#include <linux/mempool.h>
+#include <linux/export.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
diff --git a/fs/bio.c b/fs/bio.c
index 9bfade8a609..b1fe82cf88c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -255,7 +255,6 @@ void bio_init(struct bio *bio)
{
memset(bio, 0, sizeof(*bio));
bio->bi_flags = 1 << BIO_UPTODATE;
- bio->bi_comp_cpu = -1;
atomic_set(&bio->bi_cnt, 1);
}
EXPORT_SYMBOL(bio_init);
@@ -338,7 +337,7 @@ static void bio_fs_destructor(struct bio *bio)
* RETURNS:
* Pointer to new bio on success, NULL on failure.
*/
-struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
{
struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
@@ -366,7 +365,7 @@ static void bio_kmalloc_destructor(struct bio *bio)
* %__GFP_WAIT, the allocation is guaranteed to succeed.
*
**/
-struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
{
struct bio *bio;
@@ -697,7 +696,8 @@ static void bio_free_map_data(struct bio_map_data *bmd)
kfree(bmd);
}
-static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
+static struct bio_map_data *bio_alloc_map_data(int nr_segs,
+ unsigned int iov_count,
gfp_t gfp_mask)
{
struct bio_map_data *bmd;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 95f786ec7f0..b07f1da1de4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -971,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
if (!bdev->bd_disk)
return;
- if (disk_partitionable(bdev->bd_disk))
+ if (disk_part_scan_enabled(bdev->bd_disk))
bdev->bd_invalidated = 1;
}
@@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
{
struct gendisk *disk;
+ struct module *owner;
int ret;
int partno;
int perm = 0;
@@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
disk = get_gendisk(bdev->bd_dev, &partno);
if (!disk)
goto out;
+ owner = disk->fops->owner;
disk_block_events(disk);
mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = NULL;
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
- module_put(disk->fops->owner);
put_disk(disk);
+ module_put(owner);
goto restart;
}
}
@@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_unlock_bdev;
}
/* only one opener holds refs to the module and disk */
- module_put(disk->fops->owner);
put_disk(disk);
+ module_put(owner);
}
bdev->bd_openers++;
if (for_part)
@@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
out_unlock_bdev:
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
- module_put(disk->fops->owner);
put_disk(disk);
+ module_put(owner);
out:
bdput(bdev);
@@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
if (!bdev->bd_openers) {
struct module *owner = disk->fops->owner;
- put_disk(disk);
- module_put(owner);
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
bdev->bd_disk = NULL;
if (bdev != bdev->bd_contains)
victim = bdev->bd_contains;
bdev->bd_contains = NULL;
+
+ put_disk(disk);
+ module_put(owner);
}
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index c7ddf8a01c5..9c1eccc2c50 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1719,7 +1719,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
inode->i_gid = btrfs_stack_inode_gid(inode_item);
btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
inode->i_mode = btrfs_stack_inode_mode(inode_item);
- inode->i_nlink = btrfs_stack_inode_nlink(inode_item);
+ set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c23b82d8ec0..9c1a744e595 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2015,7 +2015,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
sb->s_bdi = &fs_info->bdi;
fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
- fs_info->btree_inode->i_nlink = 1;
+ set_nlink(fs_info->btree_inode, 1);
/*
* we set the i_size on the btree inode to the max possible int.
* the real end of the address space is determined by all of
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e6a832e3e64..352083ad233 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2266,9 +2266,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
BUG_ON(ret);
kfree(extent_op);
- cond_resched();
- spin_lock(&delayed_refs->lock);
- continue;
+ goto next;
}
list_del_init(&locked_ref->cluster);
@@ -2288,7 +2286,11 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
btrfs_put_delayed_ref(ref);
kfree(extent_op);
count++;
-
+next:
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ 2 * 1024 * 1024,
+ btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
cond_resched();
spin_lock(&delayed_refs->lock);
}
@@ -2316,6 +2318,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
+
delayed_refs = &trans->transaction->delayed_refs;
INIT_LIST_HEAD(&cluster);
again:
@@ -3292,27 +3298,12 @@ static int should_alloc_chunk(struct btrfs_root *root,
if (num_bytes - num_allocated < thresh)
return 1;
}
-
- /*
- * we have two similar checks here, one based on percentage
- * and once based on a hard number of 256MB. The idea
- * is that if we have a good amount of free
- * room, don't allocate a chunk. A good mount is
- * less than 80% utilized of the chunks we have allocated,
- * or more than 256MB free
- */
- if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
- return 0;
-
- if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
- return 0;
-
thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
- /* 256MB or 5% of the FS */
- thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+ /* 256MB or 2% of the FS */
+ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
- if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
+ if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
return 0;
return 1;
}
@@ -3451,7 +3442,8 @@ static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
smp_mb();
nr_pages = min_t(unsigned long, nr_pages,
root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
- writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
+ writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
+ WB_REASON_FS_FREE_SPACE);
spin_lock(&space_info->lock);
if (reserved > space_info->bytes_may_use)
@@ -5329,15 +5321,6 @@ alloc:
if (unlikely(block_group->ro))
goto loop;
- spin_lock(&block_group->free_space_ctl->tree_lock);
- if (cached &&
- block_group->free_space_ctl->free_space <
- num_bytes + empty_cluster + empty_size) {
- spin_unlock(&block_group->free_space_ctl->tree_lock);
- goto loop;
- }
- spin_unlock(&block_group->free_space_ctl->tree_lock);
-
/*
* Ok we want to try and use the cluster allocator, so
* lets look there
@@ -5383,8 +5366,15 @@ refill_cluster:
* plenty of times and not have found
* anything, so we are likely way too
* fragmented for the clustering stuff to find
- * anything. */
- if (loop >= LOOP_NO_EMPTY_SIZE) {
+ * anything.
+ *
+ * However, if the cluster is taken from the
+ * current block group, release the cluster
+ * first, so that we stand a better chance of
+ * succeeding in the unclustered
+ * allocation. */
+ if (loop >= LOOP_NO_EMPTY_SIZE &&
+ last_ptr->block_group != block_group) {
spin_unlock(&last_ptr->refill_lock);
goto unclustered_alloc;
}
@@ -5395,6 +5385,11 @@ refill_cluster:
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ if (loop >= LOOP_NO_EMPTY_SIZE) {
+ spin_unlock(&last_ptr->refill_lock);
+ goto unclustered_alloc;
+ }
+
/* allocate a cluster in this block group */
ret = btrfs_find_space_cluster(trans, root,
block_group, last_ptr,
@@ -5435,6 +5430,15 @@ refill_cluster:
}
unclustered_alloc:
+ spin_lock(&block_group->free_space_ctl->tree_lock);
+ if (cached &&
+ block_group->free_space_ctl->free_space <
+ num_bytes + empty_cluster + empty_size) {
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+ goto loop;
+ }
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
/*
@@ -5472,9 +5476,6 @@ checks:
goto loop;
}
- ins->objectid = search_start;
- ins->offset = num_bytes;
-
if (offset < search_start)
btrfs_add_free_space(used_block_group, offset,
search_start - offset);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index cc7492c823f..97fbe939c05 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1167,6 +1167,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
(sizeof(struct page *)));
+ nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
+ nrptrs = max(nrptrs, 8);
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
if (!pages)
return -ENOMEM;
@@ -1836,7 +1838,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
switch (origin) {
case SEEK_END:
case SEEK_CUR:
- offset = generic_file_llseek_unlocked(file, offset, origin);
+ offset = generic_file_llseek(file, offset, origin);
goto out;
case SEEK_DATA:
case SEEK_HOLE:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ec23d43d0c3..ce40db59c70 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2283,23 +2283,23 @@ out:
static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
struct btrfs_free_space *entry,
struct btrfs_free_cluster *cluster,
- u64 offset, u64 bytes, u64 min_bytes)
+ u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
unsigned long next_zero;
unsigned long i;
- unsigned long search_bits;
- unsigned long total_bits;
+ unsigned long want_bits;
+ unsigned long min_bits;
unsigned long found_bits;
unsigned long start = 0;
unsigned long total_found = 0;
int ret;
- bool found = false;
i = offset_to_bit(entry->offset, block_group->sectorsize,
max_t(u64, offset, entry->offset));
- search_bits = bytes_to_bits(bytes, block_group->sectorsize);
- total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+ want_bits = bytes_to_bits(bytes, block_group->sectorsize);
+ min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
again:
found_bits = 0;
@@ -2308,7 +2308,7 @@ again:
i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
next_zero = find_next_zero_bit(entry->bitmap,
BITS_PER_BITMAP, i);
- if (next_zero - i >= search_bits) {
+ if (next_zero - i >= min_bits) {
found_bits = next_zero - i;
break;
}
@@ -2318,10 +2318,9 @@ again:
if (!found_bits)
return -ENOSPC;
- if (!found) {
+ if (!total_found) {
start = i;
cluster->max_size = 0;
- found = true;
}
total_found += found_bits;
@@ -2329,13 +2328,8 @@ again:
if (cluster->max_size < found_bits * block_group->sectorsize)
cluster->max_size = found_bits * block_group->sectorsize;
- if (total_found < total_bits) {
- i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
- if (i - start > total_bits * 2) {
- total_found = 0;
- cluster->max_size = 0;
- found = false;
- }
+ if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+ i = next_zero + 1;
goto again;
}
@@ -2351,23 +2345,23 @@ again:
/*
* This searches the block group for just extents to fill the cluster with.
+ * Try to find a cluster with at least bytes total bytes, at least one
+ * extent of cont1_bytes, and other clusters of at least min_bytes.
*/
static noinline int
setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
- u64 min_bytes)
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *first = NULL;
struct btrfs_free_space *entry = NULL;
- struct btrfs_free_space *prev = NULL;
struct btrfs_free_space *last;
struct rb_node *node;
u64 window_start;
u64 window_free;
u64 max_extent;
- u64 max_gap = 128 * 1024;
entry = tree_search_offset(ctl, offset, 0, 1);
if (!entry)
@@ -2377,8 +2371,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
* We don't want bitmaps, so just move along until we find a normal
* extent entry.
*/
- while (entry->bitmap) {
- if (list_empty(&entry->list))
+ while (entry->bitmap || entry->bytes < min_bytes) {
+ if (entry->bitmap && list_empty(&entry->list))
list_add_tail(&entry->list, bitmaps);
node = rb_next(&entry->offset_index);
if (!node)
@@ -2391,12 +2385,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
max_extent = entry->bytes;
first = entry;
last = entry;
- prev = entry;
- while (window_free <= min_bytes) {
- node = rb_next(&entry->offset_index);
- if (!node)
- return -ENOSPC;
+ for (node = rb_next(&entry->offset_index); node;
+ node = rb_next(&entry->offset_index)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bitmap) {
@@ -2405,26 +2396,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
continue;
}
- /*
- * we haven't filled the empty size and the window is
- * very large. reset and try again
- */
- if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
- entry->offset - window_start > (min_bytes * 2)) {
- first = entry;
- window_start = entry->offset;
- window_free = entry->bytes;
- last = entry;
+ if (entry->bytes < min_bytes)
+ continue;
+
+ last = entry;
+ window_free += entry->bytes;
+ if (entry->bytes > max_extent)
max_extent = entry->bytes;
- } else {
- last = entry;
- window_free += entry->bytes;
- if (entry->bytes > max_extent)
- max_extent = entry->bytes;
- }
- prev = entry;
}
+ if (window_free < bytes || max_extent < cont1_bytes)
+ return -ENOSPC;
+
cluster->window_start = first->offset;
node = &first->offset_index;
@@ -2438,7 +2421,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
- if (entry->bitmap)
+ if (entry->bitmap || entry->bytes < min_bytes)
continue;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
@@ -2460,7 +2443,7 @@ static noinline int
setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
- u64 min_bytes)
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
@@ -2485,7 +2468,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
if (entry->bytes < min_bytes)
continue;
ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
- bytes, min_bytes);
+ bytes, cont1_bytes, min_bytes);
if (!ret)
return 0;
}
@@ -2499,7 +2482,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
/*
* here we try to find a cluster of blocks in a block group. The goal
- * is to find at least bytes free and up to empty_size + bytes free.
+ * is to find at least bytes+empty_size.
* We might not find them all in one contiguous area.
*
* returns zero and sets up cluster if things worked out, otherwise
@@ -2515,23 +2498,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
struct btrfs_free_space *entry, *tmp;
LIST_HEAD(bitmaps);
u64 min_bytes;
+ u64 cont1_bytes;
int ret;
- /* for metadata, allow allocates with more holes */
+ /*
+ * Choose the minimum extent size we'll require for this
+ * cluster. For SSD_SPREAD, don't allow any fragmentation.
+ * For metadata, allow allocates with smaller extents. For
+ * data, keep it dense.
+ */
if (btrfs_test_opt(root, SSD_SPREAD)) {
- min_bytes = bytes + empty_size;
+ cont1_bytes = min_bytes = bytes + empty_size;
} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
- /*
- * we want to do larger allocations when we are
- * flushing out the delayed refs, it helps prevent
- * making more work as we go along.
- */
- if (trans->transaction->delayed_refs.flushing)
- min_bytes = max(bytes, (bytes + empty_size) >> 1);
- else
- min_bytes = max(bytes, (bytes + empty_size) >> 4);
- } else
- min_bytes = max(bytes, (bytes + empty_size) >> 2);
+ cont1_bytes = bytes;
+ min_bytes = block_group->sectorsize;
+ } else {
+ cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+ min_bytes = block_group->sectorsize;
+ }
spin_lock(&ctl->tree_lock);
@@ -2539,7 +2523,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
* If we know we don't have enough space to make a cluster don't even
* bother doing all the work to try and find one.
*/
- if (ctl->free_space < min_bytes) {
+ if (ctl->free_space < bytes) {
spin_unlock(&ctl->tree_lock);
return -ENOSPC;
}
@@ -2553,10 +2537,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
}
ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
- bytes, min_bytes);
+ bytes + empty_size,
+ cont1_bytes, min_bytes);
if (ret)
ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
- offset, bytes, min_bytes);
+ offset, bytes + empty_size,
+ cont1_bytes, min_bytes);
/* Clear our temporary list */
list_for_each_entry_safe(entry, tmp, &bitmaps, list)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 13b0542015f..fd1a06df5bc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2358,7 +2358,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
inode->i_mode = btrfs_inode_mode(leaf, inode_item);
- inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
+ set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
inode->i_uid = btrfs_inode_uid(leaf, inode_item);
inode->i_gid = btrfs_inode_gid(leaf, inode_item);
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
@@ -6698,7 +6698,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
btrfs_i_size_write(inode, 0);
err = btrfs_update_inode(trans, new_root, inode);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 063b521e3de..5a7227fa938 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -833,13 +833,9 @@ static char *setup_root_args(char *args)
static struct dentry *mount_subvol(const char *subvol_name, int flags,
const char *device_name, char *data)
{
- struct super_block *s;
struct dentry *root;
struct vfsmount *mnt;
- struct mnt_namespace *ns_private;
char *newargs;
- struct path path;
- int error;
newargs = setup_root_args(data);
if (!newargs)
@@ -850,39 +846,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
if (IS_ERR(mnt))
return ERR_CAST(mnt);
- ns_private = create_mnt_ns(mnt);
- if (IS_ERR(ns_private)) {
- mntput(mnt);
- return ERR_CAST(ns_private);
- }
+ root = mount_subtree(mnt, subvol_name);
- /*
- * This will trigger the automount of the subvol so we can just
- * drop the mnt we have here and return the dentry that we
- * found.
- */
- error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
- LOOKUP_FOLLOW, &path);
- put_mnt_ns(ns_private);
- if (error)
- return ERR_PTR(error);
-
- if (!is_subvolume_inode(path.dentry->d_inode)) {
- path_put(&path);
- mntput(mnt);
- error = -EINVAL;
+ if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
+ struct super_block *s = root->d_sb;
+ dput(root);
+ root = ERR_PTR(-EINVAL);
+ deactivate_locked_super(s);
printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
subvol_name);
- return ERR_PTR(-EINVAL);
}
- /* Get a ref to the sb and the dentry we found and return it */
- s = path.mnt->mnt_sb;
- atomic_inc(&s->s_active);
- root = dget(path.dentry);
- path_put(&path);
- down_write(&s->s_umount);
-
return root;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 81376d94cd3..360c2dfd1ee 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -467,19 +467,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
- while (count < 4) {
+ while (count < 2) {
unsigned long cur = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (cur &&
trans->transaction->delayed_refs.num_heads_ready > 64) {
trans->delayed_ref_updates = 0;
-
- /*
- * do a full flush if the transaction is trying
- * to close
- */
- if (trans->transaction->delayed_refs.flushing)
- cur = 0;
btrfs_run_delayed_refs(trans, root, cur);
} else {
break;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f4d81c06d48..3568374d419 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1031,7 +1031,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
if (nlink != inode->i_nlink) {
- inode->i_nlink = nlink;
+ set_nlink(inode, nlink);
btrfs_update_inode(trans, root, inode);
}
BTRFS_I(inode)->index_cnt = (u64)-1;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d73439b4d7d..9489a2aca47 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3162,7 +3162,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_stripe_size = 1024 * 1024 * 1024;
max_chunk_size = 10 * max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- max_stripe_size = 256 * 1024 * 1024;
+ /* for larger filesystems, use larger metadata chunks */
+ if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
+ max_stripe_size = 1024 * 1024 * 1024;
+ else
+ max_stripe_size = 256 * 1024 * 1024;
max_chunk_size = max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = 8 * 1024 * 1024;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a76e41c04b7..3848b04e310 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -394,36 +394,36 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
XATTR_REPLACE);
}
-int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *suffix;
+ const struct xattr *xattr;
+ struct btrfs_trans_handle *trans = fs_info;
char *name;
+ int err = 0;
- err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
- &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
- }
-
- name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
- GFP_NOFS);
- if (!name) {
- err = -ENOMEM;
- } else {
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1, GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ break;
+ }
strcpy(name, XATTR_SECURITY_PREFIX);
- strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
- err = __btrfs_setxattr(trans, inode, name, value, len, 0);
+ strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+ err = __btrfs_setxattr(trans, inode, name,
+ xattr->value, xattr->value_len, 0);
kfree(name);
+ if (err < 0)
+ break;
}
-
- kfree(suffix);
- kfree(value);
return err;
}
+
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &btrfs_initxattrs, trans);
+}
diff --git a/fs/buffer.c b/fs/buffer.c
index 1a80b048ade..19d8eb7fdc8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -213,13 +213,16 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
* elsewhere, don't buffer_error if we had some unmapped buffers
*/
if (all_mapped) {
+ char b[BDEVNAME_SIZE];
+
printk("__find_get_block_slow() failed. "
"block=%llu, b_blocknr=%llu\n",
(unsigned long long)block,
(unsigned long long)bh->b_blocknr);
printk("b_state=0x%08lx, b_size=%zu\n",
bh->b_state, bh->b_size);
- printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
+ printk("device %s blocksize: %d\n", bdevname(bdev, b),
+ 1 << bd_inode->i_blkbits);
}
out_unlock:
spin_unlock(&bd_mapping->private_lock);
@@ -285,7 +288,7 @@ static void free_more_memory(void)
struct zone *zone;
int nid;
- wakeup_flusher_threads(1024);
+ wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
yield();
for_each_online_node(nid) {
@@ -1470,13 +1473,13 @@ static void discard_buffer(struct buffer_head * bh)
}
/**
- * block_invalidatepage - invalidate part of all of a buffer-backed page
+ * block_invalidatepage - invalidate part or all of a buffer-backed page
*
* @page: the page which is affected
* @offset: the index of the truncation point
*
* block_invalidatepage() is called when all or part of the page has become
- * invalidatedby a truncate operation.
+ * invalidated by a truncate operation.
*
* block_invalidatepage() does not have to release all buffers, but it must
* ensure that no dirty buffer is left outside @offset and that no I/O
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5a3953db811..173b1d22e59 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page)
snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
/* dirty the head */
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_head_snapc == NULL)
ci->i_head_snapc = ceph_get_snap_context(snapc);
++ci->i_wrbuffer_ref_head;
@@ -100,7 +100,7 @@ static int ceph_set_page_dirty(struct page *page)
ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
snapc, snapc->seq, snapc->num_snaps);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
/* now adjust page */
spin_lock_irq(&mapping->tree_lock);
@@ -228,102 +228,155 @@ static int ceph_readpage(struct file *filp, struct page *page)
}
/*
- * Build a vector of contiguous pages from the provided page list.
+ * Finish an async read(ahead) op.
*/
-static struct page **page_vector_from_list(struct list_head *page_list,
- unsigned *nr_pages)
+static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
{
- struct page **pages;
- struct page *page;
- int next_index, contig_pages = 0;
+ struct inode *inode = req->r_inode;
+ struct ceph_osd_reply_head *replyhead;
+ int rc, bytes;
+ int i;
- /* build page vector */
- pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
- if (!pages)
- return ERR_PTR(-ENOMEM);
+ /* parse reply */
+ replyhead = msg->front.iov_base;
+ WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+ rc = le32_to_cpu(replyhead->result);
+ bytes = le32_to_cpu(msg->hdr.data_len);
- BUG_ON(list_empty(page_list));
- next_index = list_entry(page_list->prev, struct page, lru)->index;
- list_for_each_entry_reverse(page, page_list, lru) {
- if (page->index == next_index) {
- dout("readpages page %d %p\n", contig_pages, page);
- pages[contig_pages] = page;
- contig_pages++;
- next_index++;
- } else {
- break;
+ dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
+
+ /* unlock all pages, zeroing any data we didn't read */
+ for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) {
+ struct page *page = req->r_pages[i];
+
+ if (bytes < (int)PAGE_CACHE_SIZE) {
+ /* zero (remainder of) page */
+ int s = bytes < 0 ? 0 : bytes;
+ zero_user_segment(page, s, PAGE_CACHE_SIZE);
}
+ dout("finish_read %p uptodate %p idx %lu\n", inode, page,
+ page->index);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ page_cache_release(page);
}
- *nr_pages = contig_pages;
- return pages;
+ kfree(req->r_pages);
}
/*
- * Read multiple pages. Leave pages we don't read + unlock in page_list;
- * the caller (VM) cleans them up.
+ * start an async read(ahead) operation. return nr_pages we submitted
+ * a read for on success, or negative error code.
*/
-static int ceph_readpages(struct file *file, struct address_space *mapping,
- struct list_head *page_list, unsigned nr_pages)
+static int start_read(struct inode *inode, struct list_head *page_list, int max)
{
- struct inode *inode = file->f_dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
- int rc = 0;
- struct page **pages;
- loff_t offset;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct page *page = list_entry(page_list->prev, struct page, lru);
+ struct ceph_osd_request *req;
+ u64 off;
u64 len;
+ int i;
+ struct page **pages;
+ pgoff_t next_index;
+ int nr_pages = 0;
+ int ret;
- dout("readpages %p file %p nr_pages %d\n",
- inode, file, nr_pages);
-
- pages = page_vector_from_list(page_list, &nr_pages);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
+ off = page->index << PAGE_CACHE_SHIFT;
- /* guess read extent */
- offset = pages[0]->index << PAGE_CACHE_SHIFT;
+ /* count pages */
+ next_index = page->index;
+ list_for_each_entry_reverse(page, page_list, lru) {
+ if (page->index != next_index)
+ break;
+ nr_pages++;
+ next_index++;
+ if (max && nr_pages == max)
+ break;
+ }
len = nr_pages << PAGE_CACHE_SHIFT;
- rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
- offset, &len,
- ci->i_truncate_seq, ci->i_truncate_size,
- pages, nr_pages, 0);
- if (rc == -ENOENT)
- rc = 0;
- if (rc < 0)
- goto out;
-
- for (; !list_empty(page_list) && len > 0;
- rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
- struct page *page =
- list_entry(page_list->prev, struct page, lru);
+ dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
+ off, len);
+
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
+ off, &len,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, 0,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ NULL, false, 1, 0);
+ if (!req)
+ return -ENOMEM;
+ /* build page vector */
+ nr_pages = len >> PAGE_CACHE_SHIFT;
+ pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
+ ret = -ENOMEM;
+ if (!pages)
+ goto out;
+ for (i = 0; i < nr_pages; ++i) {
+ page = list_entry(page_list->prev, struct page, lru);
+ BUG_ON(PageLocked(page));
list_del(&page->lru);
-
- if (rc < (int)PAGE_CACHE_SIZE) {
- /* zero (remainder of) page */
- int s = rc < 0 ? 0 : rc;
- zero_user_segment(page, s, PAGE_CACHE_SIZE);
- }
-
- if (add_to_page_cache_lru(page, mapping, page->index,
+
+ dout("start_read %p adding %p idx %lu\n", inode, page,
+ page->index);
+ if (add_to_page_cache_lru(page, &inode->i_data, page->index,
GFP_NOFS)) {
page_cache_release(page);
- dout("readpages %p add_to_page_cache failed %p\n",
+ dout("start_read %p add_to_page_cache failed %p\n",
inode, page);
- continue;
+ nr_pages = i;
+ goto out_pages;
}
- dout("readpages %p adding %p idx %lu\n", inode, page,
- page->index);
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- page_cache_release(page);
+ pages[i] = page;
}
- rc = 0;
+ req->r_pages = pages;
+ req->r_num_pages = nr_pages;
+ req->r_callback = finish_read;
+ req->r_inode = inode;
+
+ dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
+ ret = ceph_osdc_start_request(osdc, req, false);
+ if (ret < 0)
+ goto out_pages;
+ ceph_osdc_put_request(req);
+ return nr_pages;
+out_pages:
+ ceph_release_page_vector(pages, nr_pages);
+out:
+ ceph_osdc_put_request(req);
+ return ret;
+}
+
+
+/*
+ * Read multiple pages. Leave pages we don't read + unlock in page_list;
+ * the caller (VM) cleans them up.
+ */
+static int ceph_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *page_list, unsigned nr_pages)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ int rc = 0;
+ int max = 0;
+
+ if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+ max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
+ >> PAGE_SHIFT;
+
+ dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages,
+ max);
+ while (!list_empty(page_list)) {
+ rc = start_read(inode, page_list, max);
+ if (rc < 0)
+ goto out;
+ BUG_ON(rc == 0);
+ }
out:
- kfree(pages);
+ dout("readpages %p file %p ret %d\n", inode, file, rc);
return rc;
}
@@ -338,7 +391,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
struct ceph_snap_context *snapc = NULL;
struct ceph_cap_snap *capsnap = NULL;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
capsnap->context, capsnap->dirty_pages);
@@ -354,7 +407,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
dout(" head snapc %p has %d dirty pages\n",
snapc, ci->i_wrbuffer_ref_head);
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return snapc;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8d74ad7ba55..8b53193e4f7 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -309,7 +309,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
/*
* Find ceph_cap for given mds, if any.
*
- * Called with i_lock held.
+ * Called with i_ceph_lock held.
*/
static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
{
@@ -332,9 +332,9 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
{
struct ceph_cap *cap;
- spin_lock(&ci->vfs_inode.i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
- spin_unlock(&ci->vfs_inode.i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return cap;
}
@@ -361,15 +361,16 @@ static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
int ceph_get_cap_mds(struct inode *inode)
{
+ struct ceph_inode_info *ci = ceph_inode(inode);
int mds;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
mds = __ceph_get_cap_mds(ceph_inode(inode));
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return mds;
}
/*
- * Called under i_lock.
+ * Called under i_ceph_lock.
*/
static void __insert_cap_node(struct ceph_inode_info *ci,
struct ceph_cap *new)
@@ -415,7 +416,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
*
* If I_FLUSH is set, leave the inode at the front of the list.
*
- * Caller holds i_lock
+ * Caller holds i_ceph_lock
* -> we take mdsc->cap_delay_lock
*/
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
@@ -457,7 +458,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
/*
* Cancel delayed work on cap.
*
- * Caller must hold i_lock.
+ * Caller must hold i_ceph_lock.
*/
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
@@ -487,17 +488,15 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
ci->i_rdcache_gen++;
/*
- * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
+ * if we are newly issued FILE_SHARED, clear D_COMPLETE; we
* don't know what happened to this directory while we didn't
* have the cap.
*/
if ((issued & CEPH_CAP_FILE_SHARED) &&
(had & CEPH_CAP_FILE_SHARED) == 0) {
ci->i_shared_gen++;
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
- dout(" marking %p NOT complete\n", &ci->vfs_inode);
- ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
- }
+ if (S_ISDIR(ci->vfs_inode.i_mode))
+ ceph_dir_clear_complete(&ci->vfs_inode);
}
}
@@ -534,14 +533,14 @@ int ceph_add_cap(struct inode *inode,
wanted |= ceph_caps_for_mode(fmode);
retry:
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap) {
if (new_cap) {
cap = new_cap;
new_cap = NULL;
} else {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
new_cap = get_cap(mdsc, caps_reservation);
if (new_cap == NULL)
return -ENOMEM;
@@ -627,7 +626,7 @@ retry:
if (fmode >= 0)
__ceph_get_fmode(ci, fmode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
wake_up_all(&ci->i_cap_wq);
return 0;
}
@@ -794,7 +793,7 @@ int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
struct rb_node *p;
int ret = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node);
if (__cap_is_valid(cap) &&
@@ -803,7 +802,7 @@ int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
break;
}
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout("ceph_caps_revoking %p %s = %d\n", inode,
ceph_cap_string(mask), ret);
return ret;
@@ -857,7 +856,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
}
/*
- * called under i_lock
+ * called under i_ceph_lock
*/
static int __ceph_is_any_caps(struct ceph_inode_info *ci)
{
@@ -867,7 +866,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
/*
* Remove a cap. Take steps to deal with a racing iterate_session_caps.
*
- * caller should hold i_lock.
+ * caller should hold i_ceph_lock.
* caller will not hold session s_mutex if called from destroy_inode.
*/
void __ceph_remove_cap(struct ceph_cap *cap)
@@ -945,7 +944,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
seq, issue_seq, mseq, follows, size, max_size,
xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false);
if (!msg)
return -ENOMEM;
@@ -1030,7 +1029,7 @@ static void __queue_cap_release(struct ceph_mds_session *session,
/*
* Queue cap releases when an inode is dropped from our cache. Since
- * inode is about to be destroyed, there is no need for i_lock.
+ * inode is about to be destroyed, there is no need for i_ceph_lock.
*/
void ceph_queue_caps_release(struct inode *inode)
{
@@ -1051,7 +1050,7 @@ void ceph_queue_caps_release(struct inode *inode)
/*
* Send a cap msg on the given inode. Update our caps state, then
- * drop i_lock and send the message.
+ * drop i_ceph_lock and send the message.
*
* Make note of max_size reported/requested from mds, revoked caps
* that have now been implemented.
@@ -1063,13 +1062,13 @@ void ceph_queue_caps_release(struct inode *inode)
* Return non-zero if delayed release, or we experienced an error
* such that the caller should requeue + retry later.
*
- * called with i_lock, then drops it.
+ * called with i_ceph_lock, then drops it.
* caller should hold snap_rwsem (read), s_mutex.
*/
static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
int op, int used, int want, int retain, int flushing,
unsigned *pflush_tid)
- __releases(cap->ci->vfs_inode->i_lock)
+ __releases(cap->ci->i_ceph_lock)
{
struct ceph_inode_info *ci = cap->ci;
struct inode *inode = &ci->vfs_inode;
@@ -1172,7 +1171,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
xattr_version = ci->i_xattrs.version;
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
@@ -1200,13 +1199,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
* Unless @again is true, skip cap_snaps that were already sent to
* the MDS (i.e., during this session).
*
- * Called under i_lock. Takes s_mutex as needed.
+ * Called under i_ceph_lock. Takes s_mutex as needed.
*/
void __ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession,
int again)
- __releases(ci->vfs_inode->i_lock)
- __acquires(ci->vfs_inode->i_lock)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->vfs_inode;
int mds;
@@ -1263,7 +1262,7 @@ retry:
session = NULL;
}
if (!session) {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
mutex_lock(&mdsc->mutex);
session = __ceph_lookup_mds_session(mdsc, mds);
mutex_unlock(&mdsc->mutex);
@@ -1277,7 +1276,7 @@ retry:
* deletion or migration. retry, and we'll
* get a better @mds value next time.
*/
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
goto retry;
}
@@ -1287,7 +1286,7 @@ retry:
list_del_init(&capsnap->flushing_item);
list_add_tail(&capsnap->flushing_item,
&session->s_cap_snaps_flushing);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
inode, capsnap, capsnap->follows, capsnap->flush_tid);
@@ -1304,7 +1303,7 @@ retry:
next_follows = capsnap->follows + 1;
ceph_put_cap_snap(capsnap);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
goto retry;
}
@@ -1324,11 +1323,9 @@ out:
static void ceph_flush_snaps(struct ceph_inode_info *ci)
{
- struct inode *inode = &ci->vfs_inode;
-
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__ceph_flush_snaps(ci, NULL, 0);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -1375,7 +1372,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
* Add dirty inode to the flushing list. Assigned a seq number so we
* can wait for caps to flush without starving.
*
- * Called under i_lock.
+ * Called under i_ceph_lock.
*/
static int __mark_caps_flushing(struct inode *inode,
struct ceph_mds_session *session)
@@ -1423,9 +1420,9 @@ static int try_nonblocking_invalidate(struct inode *inode)
struct ceph_inode_info *ci = ceph_inode(inode);
u32 invalidating_gen = ci->i_rdcache_gen;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
invalidate_mapping_pages(&inode->i_data, 0, -1);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (inode->i_data.nrpages == 0 &&
invalidating_gen == ci->i_rdcache_gen) {
@@ -1472,7 +1469,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
if (mdsc->stopping)
is_delayed = 1;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
@@ -1482,7 +1479,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
__ceph_flush_snaps(ci, &session, 0);
goto retry_locked;
retry:
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
retry_locked:
file_wanted = __ceph_caps_file_wanted(ci);
used = __ceph_caps_used(ci);
@@ -1636,7 +1633,7 @@ ack:
if (mutex_trylock(&session->s_mutex) == 0) {
dout("inverting session/ino locks on %p\n",
session);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (took_snap_rwsem) {
up_read(&mdsc->snap_rwsem);
took_snap_rwsem = 0;
@@ -1650,7 +1647,7 @@ ack:
if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
dout("inverting snap/in locks on %p\n",
inode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
down_read(&mdsc->snap_rwsem);
took_snap_rwsem = 1;
goto retry;
@@ -1666,10 +1663,10 @@ ack:
mds = cap->mds; /* remember mds, so we don't repeat */
sent++;
- /* __send_cap drops i_lock */
+ /* __send_cap drops i_ceph_lock */
delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
retain, flushing, NULL);
- goto retry; /* retake i_lock and restart our cap scan. */
+ goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
/*
@@ -1683,7 +1680,7 @@ ack:
else if (!is_delayed || force_requeue)
__cap_delay_requeue(mdsc, ci);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (queue_invalidate)
ceph_queue_invalidate(inode);
@@ -1706,7 +1703,7 @@ static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
int flushing = 0;
retry:
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
goto out;
@@ -1718,7 +1715,7 @@ retry:
int delayed;
if (!session) {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
session = cap->session;
mutex_lock(&session->s_mutex);
goto retry;
@@ -1729,18 +1726,18 @@ retry:
flushing = __mark_caps_flushing(inode, session);
- /* __send_cap drops i_lock */
+ /* __send_cap drops i_ceph_lock */
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
cap->issued | cap->implemented, flushing,
flush_tid);
if (!delayed)
goto out_unlocked;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(mdsc, ci);
}
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
out_unlocked:
if (session && unlock_session)
mutex_unlock(&session->s_mutex);
@@ -1755,7 +1752,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid)
struct ceph_inode_info *ci = ceph_inode(inode);
int i, ret = 1;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_CAP_BITS; i++)
if ((ci->i_flushing_caps & (1 << i)) &&
ci->i_cap_flush_tid[i] <= tid) {
@@ -1763,7 +1760,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid)
ret = 0;
break;
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return ret;
}
@@ -1870,10 +1867,10 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
struct ceph_mds_client *mdsc =
ceph_sb_to_client(inode->i_sb)->mdsc;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (__ceph_caps_dirty(ci))
__cap_delay_requeue_front(mdsc, ci);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
return err;
}
@@ -1896,7 +1893,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (cap && cap->session == session) {
dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
@@ -1906,7 +1903,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
pr_err("%p auth cap %p not mds%d ???\n", inode,
cap, session->s_mds);
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
}
@@ -1923,7 +1920,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_cap *cap;
int delayed = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (cap && cap->session == session) {
dout("kick_flushing_caps %p cap %p %s\n", inode,
@@ -1934,14 +1931,14 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
cap->issued | cap->implemented,
ci->i_flushing_caps, NULL);
if (delayed) {
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(mdsc, ci);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
} else {
pr_err("%p auth cap %p not mds%d ???\n", inode,
cap, session->s_mds);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
}
}
@@ -1954,7 +1951,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
struct ceph_cap *cap;
int delayed = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
@@ -1966,12 +1963,12 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
cap->issued | cap->implemented,
ci->i_flushing_caps, NULL);
if (delayed) {
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(mdsc, ci);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
} else {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
}
@@ -1980,7 +1977,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
* Take references to capabilities we hold, so that we don't release
* them to the MDS prematurely.
*
- * Protected by i_lock.
+ * Protected by i_ceph_lock.
*/
static void __take_cap_refs(struct ceph_inode_info *ci, int got)
{
@@ -2018,7 +2015,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
dout("get_cap_refs %p need %s want %s\n", inode,
ceph_cap_string(need), ceph_cap_string(want));
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
/* make sure file is actually open */
file_wanted = __ceph_caps_file_wanted(ci);
@@ -2079,7 +2076,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
ceph_cap_string(have), ceph_cap_string(need));
}
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout("get_cap_refs %p ret %d got %s\n", inode,
ret, ceph_cap_string(*got));
return ret;
@@ -2096,7 +2093,7 @@ static void check_max_size(struct inode *inode, loff_t endoff)
int check = 0;
/* do we need to explicitly request a larger max_size? */
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if ((endoff >= ci->i_max_size ||
endoff > (inode->i_size << 1)) &&
endoff > ci->i_wanted_max_size) {
@@ -2105,7 +2102,7 @@ static void check_max_size(struct inode *inode, loff_t endoff)
ci->i_wanted_max_size = endoff;
check = 1;
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (check)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
@@ -2142,9 +2139,9 @@ retry:
*/
void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
{
- spin_lock(&ci->vfs_inode.i_lock);
+ spin_lock(&ci->i_ceph_lock);
__take_cap_refs(ci, caps);
- spin_unlock(&ci->vfs_inode.i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -2162,7 +2159,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
int last = 0, put = 0, flushsnaps = 0, wake = 0;
struct ceph_cap_snap *capsnap;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (had & CEPH_CAP_PIN)
--ci->i_pin_ref;
if (had & CEPH_CAP_FILE_RD)
@@ -2195,7 +2192,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
}
}
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
last ? " last" : "", put ? " put" : "");
@@ -2227,7 +2224,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
int found = 0;
struct ceph_cap_snap *capsnap = NULL;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
ci->i_wrbuffer_ref -= nr;
last = !ci->i_wrbuffer_ref;
@@ -2276,7 +2273,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
}
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (last) {
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
@@ -2293,7 +2290,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
* Handle a cap GRANT message from the MDS. (Note that a GRANT may
* actually be a revocation if it specifies a smaller cap set.)
*
- * caller holds s_mutex and i_lock, we drop both.
+ * caller holds s_mutex and i_ceph_lock, we drop both.
*
* return value:
* 0 - ok
@@ -2304,7 +2301,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
struct ceph_mds_session *session,
struct ceph_cap *cap,
struct ceph_buffer *xattr_buf)
- __releases(inode->i_lock)
+ __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
@@ -2363,7 +2360,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
}
if ((issued & CEPH_CAP_LINK_EXCL) == 0)
- inode->i_nlink = le32_to_cpu(grant->nlink);
+ set_nlink(inode, le32_to_cpu(grant->nlink));
if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
int len = le32_to_cpu(grant->xattr_len);
@@ -2455,7 +2452,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
}
BUG_ON(cap->issued & ~cap->implemented);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (writeback)
/*
* queue inode for writeback: we can't actually call
@@ -2485,7 +2482,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
struct ceph_mds_caps *m,
struct ceph_mds_session *session,
struct ceph_cap *cap)
- __releases(inode->i_lock)
+ __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -2541,7 +2538,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
wake_up_all(&ci->i_cap_wq);
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (drop)
iput(inode);
}
@@ -2564,7 +2561,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
inode, ci, session->s_mds, follows);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
if (capsnap->follows == follows) {
if (capsnap->flush_tid != flush_tid) {
@@ -2587,7 +2584,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
capsnap, capsnap->follows);
}
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (drop)
iput(inode);
}
@@ -2600,7 +2597,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
static void handle_cap_trunc(struct inode *inode,
struct ceph_mds_caps *trunc,
struct ceph_mds_session *session)
- __releases(inode->i_lock)
+ __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
@@ -2619,7 +2616,7 @@ static void handle_cap_trunc(struct inode *inode,
inode, mds, seq, truncate_size, truncate_seq);
queue_trunc = ceph_fill_file_size(inode, issued,
truncate_seq, truncate_size, size);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (queue_trunc)
ceph_queue_vmtruncate(inode);
@@ -2648,7 +2645,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
inode, ci, mds, mseq);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
/* make sure we haven't seen a higher mseq */
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
@@ -2692,7 +2689,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
}
/* else, we already released it */
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -2747,9 +2744,9 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
up_read(&mdsc->snap_rwsem);
/* make sure we re-request max_size, if necessary */
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
ci->i_requested_max_size = 0;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -2764,6 +2761,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_mds_client *mdsc = session->s_mdsc;
struct super_block *sb = mdsc->fsc->sb;
struct inode *inode;
+ struct ceph_inode_info *ci;
struct ceph_cap *cap;
struct ceph_mds_caps *h;
int mds = session->s_mds;
@@ -2817,6 +2815,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
/* lookup ino */
inode = ceph_find_inode(sb, vino);
+ ci = ceph_inode(inode);
dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
vino.snap, inode);
if (!inode) {
@@ -2846,16 +2845,16 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
/* the rest require a cap */
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ceph_inode(inode), mds);
if (!cap) {
dout(" no cap on %p ino %llx.%llx from mds%d\n",
inode, ceph_ino(inode), ceph_snap(inode), mds);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
goto flush_cap_releases;
}
- /* note that each of these drops i_lock for us */
+ /* note that each of these drops i_ceph_lock for us */
switch (op) {
case CEPH_CAP_OP_REVOKE:
case CEPH_CAP_OP_GRANT:
@@ -2871,7 +2870,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
break;
default:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
ceph_cap_op_name(op));
}
@@ -2964,13 +2963,13 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
struct inode *inode = &ci->vfs_inode;
int last = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
BUG_ON(ci->i_nr_by_mode[fmode] == 0);
if (--ci->i_nr_by_mode[fmode] == 0)
last++;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (last && ci->i_vino.snap == CEPH_NOSNAP)
ceph_check_caps(ci, 0, NULL);
@@ -2993,7 +2992,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
int used, dirty;
int ret = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
@@ -3048,7 +3047,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
inode, cap, ceph_cap_string(cap->issued));
}
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return ret;
}
@@ -3063,7 +3062,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
/*
* force an record for the directory caps if we have a dentry lease.
- * this is racy (can't take i_lock and d_lock together), but it
+ * this is racy (can't take i_ceph_lock and d_lock together), but it
* doesn't have to be perfect; the mds will revoke anything we don't
* release.
*/
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 382abc9a6a5..98954003a8d 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -108,7 +108,7 @@ static unsigned fpos_off(loff_t p)
* falling back to a "normal" sync readdir if any dentries in the dir
* are dropped.
*
- * I_COMPLETE tells indicates we have all dentries in the dir. It is
+ * D_COMPLETE tells indicates we have all dentries in the dir. It is
* defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
* the MDS if/when the directory is modified).
*/
@@ -199,8 +199,8 @@ more:
filp->f_pos++;
/* make sure a dentry wasn't dropped while we didn't have parent lock */
- if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
- dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+ if (!ceph_dir_test_complete(dir)) {
+ dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
err = -EAGAIN;
goto out;
}
@@ -281,18 +281,18 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
/* can we use the dcache? */
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if ((filp->f_pos == 2 || fi->dentry) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
- (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+ ceph_dir_test_complete(inode) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
err = __dcache_readdir(filp, dirent, filldir);
if (err != -EAGAIN)
return err;
} else {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
if (fi->dentry) {
err = note_last_dentry(fi, fi->dentry->d_name.name,
@@ -351,7 +351,7 @@ more:
if (!req->r_did_prepopulate) {
dout("readdir !did_prepopulate");
- fi->dir_release_count--; /* preclude I_COMPLETE */
+ fi->dir_release_count--; /* preclude D_COMPLETE */
}
/* note next offset and last dentry name */
@@ -428,13 +428,12 @@ more:
* were released during the whole readdir, and we should have
* the complete dir contents in our cache.
*/
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_release_count == fi->dir_release_count) {
- dout(" marking %p complete\n", inode);
- /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
+ ceph_dir_set_complete(inode);
ci->i_max_offset = filp->f_pos;
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout("readdir %p filp %p done.\n", inode, filp);
return 0;
@@ -608,21 +607,21 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_dentry_info *di = ceph_dentry(dentry);
- spin_lock(&dir->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
if (strncmp(dentry->d_name.name,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
!is_root_ceph_dentry(dir, dentry) &&
- (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+ ceph_dir_test_complete(dir) &&
(__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
- spin_unlock(&dir->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout(" dir %p complete, -ENOENT\n", dir);
d_add(dentry, NULL);
di->lease_shared_gen = ci->i_shared_gen;
return NULL;
}
- spin_unlock(&dir->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
op = ceph_snap(dir) == CEPH_SNAPDIR ?
@@ -842,12 +841,12 @@ static int drop_caps_for_unlink(struct inode *inode)
struct ceph_inode_info *ci = ceph_inode(inode);
int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (inode->i_nlink == 1) {
drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
ci->i_ceph_flags |= CEPH_I_NODELAY;
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return drop;
}
@@ -934,7 +933,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
/* d_move screws up d_subdirs order */
- ceph_i_clear(new_dir, CEPH_I_COMPLETE);
+ ceph_dir_clear_complete(new_dir);
d_move(old_dentry, new_dentry);
@@ -1016,10 +1015,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
struct ceph_dentry_info *di = ceph_dentry(dentry);
int valid = 0;
- spin_lock(&dir->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_shared_gen == di->lease_shared_gen)
valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
- spin_unlock(&dir->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
dir, (unsigned)ci->i_shared_gen, dentry,
(unsigned)di->lease_shared_gen, valid);
@@ -1092,7 +1091,52 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
return 1;
}
+/*
+ * Set/clear/test dir complete flag on the dir's dentry.
+ */
+void ceph_dir_set_complete(struct inode *inode)
+{
+ /* not yet implemented */
+}
+
+void ceph_dir_clear_complete(struct inode *inode)
+{
+ /* not yet implemented */
+}
+
+bool ceph_dir_test_complete(struct inode *inode)
+{
+ /* not yet implemented */
+ return false;
+}
+
+/*
+ * When the VFS prunes a dentry from the cache, we need to clear the
+ * complete flag on the parent directory.
+ *
+ * Called under dentry->d_lock.
+ */
+static void ceph_d_prune(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di;
+
+ dout("ceph_d_prune %p\n", dentry);
+ /* do we have a valid parent? */
+ if (!dentry->d_parent || IS_ROOT(dentry))
+ return;
+
+ /* if we are not hashed, we don't affect D_COMPLETE */
+ if (d_unhashed(dentry))
+ return;
+
+ /*
+ * we hold d_lock, so d_parent is stable, and d_fsdata is never
+ * cleared until d_release
+ */
+ di = ceph_dentry(dentry->d_parent);
+ clear_bit(CEPH_D_COMPLETE, &di->flags);
+}
/*
* read() on a dir. This weird interface hack only works if mounted
@@ -1306,6 +1350,7 @@ const struct inode_operations ceph_dir_iops = {
const struct dentry_operations ceph_dentry_ops = {
.d_revalidate = ceph_d_revalidate,
.d_release = ceph_d_release,
+ .d_prune = ceph_d_prune,
};
const struct dentry_operations ceph_snapdir_dentry_ops = {
@@ -1315,4 +1360,5 @@ const struct dentry_operations ceph_snapdir_dentry_ops = {
const struct dentry_operations ceph_snap_dentry_ops = {
.d_release = ceph_d_release,
+ .d_prune = ceph_d_prune,
};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ce549d31eeb..ed72428d9c7 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -147,9 +147,9 @@ int ceph_open(struct inode *inode, struct file *file)
/* trivially open snapdir */
if (ceph_snap(inode) == CEPH_SNAPDIR) {
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__ceph_get_fmode(ci, fmode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
@@ -158,7 +158,7 @@ int ceph_open(struct inode *inode, struct file *file)
* write) or any MDS (for read). Update wanted set
* asynchronously.
*/
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (__ceph_is_any_real_caps(ci) &&
(((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
int mds_wanted = __ceph_caps_mds_wanted(ci);
@@ -168,7 +168,7 @@ int ceph_open(struct inode *inode, struct file *file)
inode, fmode, ceph_cap_string(wanted),
ceph_cap_string(issued));
__ceph_get_fmode(ci, fmode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
/* adjust wanted? */
if ((issued & wanted) != wanted &&
@@ -180,10 +180,10 @@ int ceph_open(struct inode *inode, struct file *file)
} else if (ceph_snap(inode) != CEPH_NOSNAP &&
(ci->i_snap_caps & wanted) == wanted) {
__ceph_get_fmode(ci, fmode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
req = prepare_open_request(inode->i_sb, flags, 0);
@@ -743,9 +743,9 @@ retry_snap:
*/
int dirty;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
ceph_put_cap_refs(ci, got);
ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
@@ -764,9 +764,9 @@ retry_snap:
if (ret >= 0) {
int dirty;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
}
@@ -797,7 +797,8 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
mutex_lock(&inode->i_mutex);
__ceph_do_pending_vmtruncate(inode);
- if (origin != SEEK_CUR || origin != SEEK_SET) {
+
+ if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
if (ret < 0) {
offset = ret;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 095799ba9dd..87fb132fb33 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -9,7 +9,6 @@
#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/vmalloc.h>
-#include <linux/pagevec.h>
#include "super.h"
#include "mds_client.h"
@@ -298,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
dout("alloc_inode %p\n", &ci->vfs_inode);
+ spin_lock_init(&ci->i_ceph_lock);
+
ci->i_version = 0;
ci->i_time_warp_seq = 0;
ci->i_ceph_flags = 0;
@@ -584,7 +585,7 @@ static int fill_inode(struct inode *inode,
iinfo->xattr_len);
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
/*
* provided version will be odd if inode value is projected,
@@ -619,7 +620,7 @@ static int fill_inode(struct inode *inode,
}
if ((issued & CEPH_CAP_LINK_EXCL) == 0)
- inode->i_nlink = le32_to_cpu(info->nlink);
+ set_nlink(inode, le32_to_cpu(info->nlink));
/* be careful with mtime, atime, size */
ceph_decode_timespec(&atime, &info->atime);
@@ -681,7 +682,7 @@ static int fill_inode(struct inode *inode,
char *sym;
BUG_ON(symlen != inode->i_size);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
err = -ENOMEM;
sym = kmalloc(symlen+1, GFP_NOFS);
@@ -690,7 +691,7 @@ static int fill_inode(struct inode *inode,
memcpy(sym, iinfo->symlink, symlen);
sym[symlen] = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (!ci->i_symlink)
ci->i_symlink = sym;
else
@@ -716,7 +717,7 @@ static int fill_inode(struct inode *inode,
}
no_change:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
/* queue truncate if we saw i_size decrease */
if (queue_trunc)
@@ -751,13 +752,13 @@ no_change:
info->cap.flags,
caps_reservation);
} else {
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dout(" %p got snap_caps %s\n", inode,
ceph_cap_string(le32_to_cpu(info->cap.caps)));
ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
if (cap_fmode >= 0)
__ceph_get_fmode(ci, cap_fmode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
} else if (cap_fmode >= 0) {
pr_warning("mds issued no caps on %llx.%llx\n",
@@ -772,9 +773,9 @@ no_change:
ceph_snap(inode) == CEPH_NOSNAP &&
(le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
(issued & CEPH_CAP_FILE_EXCL) == 0 &&
- (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
+ !ceph_dir_test_complete(inode)) {
dout(" marking %p complete (empty)\n", inode);
- /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
+ ceph_dir_set_complete(inode);
ci->i_max_offset = 2;
}
@@ -850,19 +851,20 @@ static void ceph_set_dentry_offset(struct dentry *dn)
{
struct dentry *dir = dn->d_parent;
struct inode *inode = dir->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_dentry_info *di;
BUG_ON(!inode);
di = ceph_dentry(dn);
- spin_lock(&inode->i_lock);
- if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
- spin_unlock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
+ if (!ceph_dir_test_complete(inode)) {
+ spin_unlock(&ci->i_ceph_lock);
return;
}
di->offset = ceph_inode(inode)->i_max_offset++;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
spin_lock(&dir->d_lock);
spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
@@ -1057,7 +1059,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
* d_move() puts the renamed dentry at the end of
* d_subdirs. We need to assign it an appropriate
* directory offset so we can behave when holding
- * I_COMPLETE.
+ * D_COMPLETE.
*/
ceph_set_dentry_offset(req->r_old_dentry);
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
@@ -1309,7 +1311,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
struct ceph_inode_info *ci = ceph_inode(inode);
int ret = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
inode->i_size = size;
inode->i_blocks = (size + (1 << 9) - 1) >> 9;
@@ -1319,7 +1321,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
(ci->i_reported_size << 1) < ci->i_max_size)
ret = 1;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return ret;
}
@@ -1329,12 +1331,13 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
*/
void ceph_queue_writeback(struct inode *inode)
{
+ ihold(inode);
if (queue_work(ceph_inode_to_client(inode)->wb_wq,
&ceph_inode(inode)->i_wb_work)) {
dout("ceph_queue_writeback %p\n", inode);
- ihold(inode);
} else {
dout("ceph_queue_writeback %p failed\n", inode);
+ iput(inode);
}
}
@@ -1354,55 +1357,13 @@ static void ceph_writeback_work(struct work_struct *work)
*/
void ceph_queue_invalidate(struct inode *inode)
{
+ ihold(inode);
if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
&ceph_inode(inode)->i_pg_inv_work)) {
dout("ceph_queue_invalidate %p\n", inode);
- ihold(inode);
} else {
dout("ceph_queue_invalidate %p failed\n", inode);
- }
-}
-
-/*
- * invalidate any pages that are not dirty or under writeback. this
- * includes pages that are clean and mapped.
- */
-static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
-{
- struct pagevec pvec;
- pgoff_t next = 0;
- int i;
-
- pagevec_init(&pvec, 0);
- while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
- pgoff_t index;
- int skip_page =
- (PageDirty(page) || PageWriteback(page));
-
- if (!skip_page)
- skip_page = !trylock_page(page);
-
- /*
- * We really shouldn't be looking at the ->index of an
- * unlocked page. But we're not allowed to lock these
- * pages. So we rely upon nobody altering the ->index
- * of this (pinned-by-us) page.
- */
- index = page->index;
- if (index > next)
- next = index;
- next++;
-
- if (skip_page)
- continue;
-
- generic_error_remove_page(mapping, page);
- unlock_page(page);
- }
- pagevec_release(&pvec);
- cond_resched();
+ iput(inode);
}
}
@@ -1418,20 +1379,20 @@ static void ceph_invalidate_work(struct work_struct *work)
u32 orig_gen;
int check = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dout("invalidate_pages %p gen %d revoking %d\n", inode,
ci->i_rdcache_gen, ci->i_rdcache_revoking);
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
/* nevermind! */
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
goto out;
}
orig_gen = ci->i_rdcache_gen;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
- ceph_invalidate_nondirty_pages(inode->i_mapping);
+ truncate_inode_pages(&inode->i_data, 0);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen &&
orig_gen == ci->i_rdcache_revoking) {
dout("invalidate_pages %p gen %d successful\n", inode,
@@ -1443,7 +1404,7 @@ static void ceph_invalidate_work(struct work_struct *work)
inode, orig_gen, ci->i_rdcache_gen,
ci->i_rdcache_revoking);
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (check)
ceph_check_caps(ci, 0, NULL);
@@ -1478,13 +1439,14 @@ void ceph_queue_vmtruncate(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ ihold(inode);
if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
&ci->i_vmtruncate_work)) {
dout("ceph_queue_vmtruncate %p\n", inode);
- ihold(inode);
} else {
dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
inode, ci->i_truncate_pending);
+ iput(inode);
}
}
@@ -1501,10 +1463,10 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
int wrbuffer_refs, wake = 0;
retry:
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_truncate_pending == 0) {
dout("__do_pending_vmtruncate %p none pending\n", inode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return;
}
@@ -1515,7 +1477,7 @@ retry:
if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
dout("__do_pending_vmtruncate %p flushing snaps first\n",
inode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
filemap_write_and_wait_range(&inode->i_data, 0,
inode->i_sb->s_maxbytes);
goto retry;
@@ -1525,15 +1487,15 @@ retry:
wrbuffer_refs = ci->i_wrbuffer_ref;
dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
ci->i_truncate_pending, to);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
truncate_inode_pages(inode->i_mapping, to);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
ci->i_truncate_pending--;
if (ci->i_truncate_pending == 0)
wake = 1;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (wrbuffer_refs == 0)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
@@ -1588,7 +1550,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (IS_ERR(req))
return PTR_ERR(req);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
@@ -1736,7 +1698,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
}
release &= issued;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
@@ -1758,7 +1720,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
__ceph_do_pending_vmtruncate(inode);
return err;
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
ceph_mdsc_put_request(req);
return err;
}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 3b256b50f7d..790914a598d 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -42,17 +42,39 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
+ struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
+ struct ceph_ioctl_layout nl;
int err, i;
- /* copy and validate */
if (copy_from_user(&l, arg, sizeof(l)))
return -EFAULT;
- if ((l.object_size & ~PAGE_MASK) ||
- (l.stripe_unit & ~PAGE_MASK) ||
- !l.stripe_unit ||
- (l.object_size &&
- (unsigned)l.object_size % (unsigned)l.stripe_unit))
+ /* validate changed params against current layout */
+ err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
+ if (!err) {
+ nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
+ nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ nl.object_size = ceph_file_layout_object_size(ci->i_layout);
+ nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+ nl.preferred_osd =
+ (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
+ } else
+ return err;
+
+ if (l.stripe_count)
+ nl.stripe_count = l.stripe_count;
+ if (l.stripe_unit)
+ nl.stripe_unit = l.stripe_unit;
+ if (l.object_size)
+ nl.object_size = l.object_size;
+ if (l.data_pool)
+ nl.data_pool = l.data_pool;
+ if (l.preferred_osd)
+ nl.preferred_osd = l.preferred_osd;
+
+ if ((nl.object_size & ~PAGE_MASK) ||
+ (nl.stripe_unit & ~PAGE_MASK) ||
+ ((unsigned)nl.object_size % (unsigned)nl.stripe_unit))
return -EINVAL;
/* make sure it's a valid data pool */
@@ -219,11 +241,11 @@ static long ceph_ioctl_lazyio(struct file *file)
struct ceph_inode_info *ci = ceph_inode(inode);
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
ci->i_nr_by_mode[fi->fmode]--;
fi->fmode |= CEPH_FILE_MODE_LAZY;
ci->i_nr_by_mode[fi->fmode]++;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
ceph_check_caps(ci, 0, NULL);
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 0c5167e4318..be4a6048733 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -6,7 +6,31 @@
#define CEPH_IOCTL_MAGIC 0x97
-/* just use u64 to align sanely on all archs */
+/*
+ * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy
+ * CEPH_IOC_SET_LAYOUT - set file layout
+ * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy
+ *
+ * The file layout specifies how file data is striped over objects in
+ * the distributed object store, which object pool they belong to (if
+ * it differs from the default), and an optional 'preferred osd' to
+ * store them on.
+ *
+ * Files get a new layout based on the policy set on the containing
+ * directory or one of its ancestors. The GET_LAYOUT ioctl will let
+ * you examine the layout for a file or the policy on a directory.
+ *
+ * SET_LAYOUT will let you set a layout on a newly created file. This
+ * only works immediately after the file is created and before any
+ * data is written to it.
+ *
+ * SET_LAYOUT_POLICY will let you set a layout policy (default layout)
+ * on a directory that will apply to any new files created in that
+ * directory (or any child directory that doesn't specify a layout of
+ * its own).
+ */
+
+/* use u64 to align sanely on all archs */
struct ceph_ioctl_layout {
__u64 stripe_unit, stripe_count, object_size;
__u64 data_pool;
@@ -21,6 +45,8 @@ struct ceph_ioctl_layout {
struct ceph_ioctl_layout)
/*
+ * CEPH_IOC_GET_DATALOC - get location of file data in the cluster
+ *
* Extract identity, address of the OSD and object storing a given
* file offset.
*/
@@ -39,7 +65,34 @@ struct ceph_ioctl_dataloc {
#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
struct ceph_ioctl_dataloc)
+/*
+ * CEPH_IOC_LAZYIO - relax consistency
+ *
+ * Normally Ceph switches to synchronous IO when multiple clients have
+ * the file open (and or more for write). Reads and writes bypass the
+ * page cache and go directly to the OSD. Setting this flag on a file
+ * descriptor will allow buffered IO for this file in cases where the
+ * application knows it won't interfere with other nodes (or doesn't
+ * care).
+ */
#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
+
+/*
+ * CEPH_IOC_SYNCIO - force synchronous IO
+ *
+ * This ioctl sets a file flag that forces the synchronous IO that
+ * bypasses the page cache, even if it is not necessary. This is
+ * essentially the opposite behavior of IOC_LAZYIO. This forces the
+ * same read/write path as a file opened by multiple clients when one
+ * or more of those clients is opened for write.
+ *
+ * Note that this type of sync IO takes a different path than a file
+ * opened with O_SYNC/D_SYNC (writes hit the page cache and are
+ * immediately flushed on page boundaries). It is very similar to
+ * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes
+ * are not copied (user page must remain stable) and O_DIRECT writes
+ * have alignment restrictions (on the buffer and file offset).
+ */
#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)
#endif
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 86c59e16ba7..6203d805eb4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -619,7 +619,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
*
* Called under mdsc->mutex.
*/
-struct dentry *get_nonsnap_parent(struct dentry *dentry)
+static struct dentry *get_nonsnap_parent(struct dentry *dentry)
{
/*
* we don't need to worry about protecting the d_parent access
@@ -732,21 +732,21 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
}
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap = NULL;
if (mode == USE_AUTH_MDS)
cap = ci->i_auth_cap;
if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
if (!cap) {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
goto random;
}
mds = cap->session->s_mds;
dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
inode, ceph_vinop(inode), mds,
cap == ci->i_auth_cap ? "auth " : "", cap);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return mds;
random:
@@ -764,7 +764,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
+ false);
if (!msg) {
pr_err("create_session_msg ENOMEM creating msg\n");
return NULL;
@@ -950,7 +951,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__ceph_remove_cap(cap);
if (!__ceph_is_any_real_caps(ci)) {
struct ceph_mds_client *mdsc =
@@ -983,7 +984,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
}
spin_unlock(&mdsc->cap_dirty_lock);
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
while (drop--)
iput(inode);
return 0;
@@ -1014,10 +1015,10 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
wake_up_all(&ci->i_cap_wq);
if (arg) {
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
return 0;
}
@@ -1150,7 +1151,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
if (session->s_trim_caps <= 0)
return -1;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
mine = cap->issued | cap->implemented;
used = __ceph_caps_used(ci);
oissued = __ceph_caps_issued_other(ci, cap);
@@ -1169,7 +1170,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
__ceph_remove_cap(cap);
} else {
/* try to drop referring dentries */
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
d_prune_aliases(inode);
dout("trim_caps_cb %p cap %p pruned, count now %d\n",
inode, cap, atomic_read(&inode->i_count));
@@ -1177,7 +1178,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
}
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return 0;
}
@@ -1240,7 +1241,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
while (session->s_num_cap_releases < session->s_nr_caps + extra) {
spin_unlock(&session->s_cap_lock);
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
- GFP_NOFS);
+ GFP_NOFS, false);
if (!msg)
goto out_unlocked;
dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1295,7 +1296,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
i_flushing_item);
struct inode *inode = &ci->vfs_inode;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_cap_flush_seq <= want_flush_seq) {
dout("check_cap_flush still flushing %p "
"seq %lld <= %lld to mds%d\n", inode,
@@ -1303,7 +1304,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
session->s_mds);
ret = 0;
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
@@ -1494,6 +1495,7 @@ retry:
pos, temp);
} else if (stop_on_nosnap && inode &&
ceph_snap(inode) == CEPH_NOSNAP) {
+ spin_unlock(&temp->d_lock);
break;
} else {
pos -= temp->d_name.len;
@@ -1652,7 +1654,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_old_dentry_drop)
len += req->r_old_dentry->d_name.len;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
if (!msg) {
msg = ERR_PTR(-ENOMEM);
goto out_free2;
@@ -2001,7 +2003,7 @@ out:
}
/*
- * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
+ * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS
* namespace request.
*/
void ceph_invalidate_dir_request(struct ceph_mds_request *req)
@@ -2009,11 +2011,11 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
struct inode *inode = req->r_locked_dir;
struct ceph_inode_info *ci = ceph_inode(inode);
- dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
- spin_lock(&inode->i_lock);
- ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+ dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode);
+ spin_lock(&ci->i_ceph_lock);
+ ceph_dir_clear_complete(inode);
ci->i_release_count++;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (req->r_dentry)
ceph_invalidate_dentry_lease(req->r_dentry);
@@ -2421,7 +2423,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (err)
goto out_free;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
@@ -2444,7 +2446,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
rec.v1.pathbase = cpu_to_le64(pathbase);
reclen = sizeof(rec.v1);
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (recon_state->flock) {
int num_fcntl_locks, num_flock_locks;
@@ -2518,7 +2520,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
goto fail_nopagelist;
ceph_pagelist_init(pagelist);
- reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
+ reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
if (!reply)
goto fail_nomsg;
@@ -2831,7 +2833,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
dnamelen = dentry->d_name.len;
len += dnamelen;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
if (!msg)
return;
lease = msg->front.iov_base;
@@ -3153,7 +3155,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
/*
* true if all sessions are closed, or we force unmount
*/
-bool done_closing_sessions(struct ceph_mds_client *mdsc)
+static bool done_closing_sessions(struct ceph_mds_client *mdsc)
{
int i, n = 0;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4bb239921db..a50ca0e3947 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -20,7 +20,7 @@
*
* mdsc->snap_rwsem
*
- * inode->i_lock
+ * ci->i_ceph_lock
* mdsc->snap_flush_lock
* mdsc->cap_delay_lock
*
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index e2643719133..a559c80f127 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -446,7 +446,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
return;
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
@@ -528,7 +528,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
kfree(capsnap);
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -537,7 +537,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
*
* If capsnap can now be flushed, add to snap_flush list, and return 1.
*
- * Caller must hold i_lock.
+ * Caller must hold i_ceph_lock.
*/
int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
@@ -739,9 +739,9 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
inode = &ci->vfs_inode;
ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__ceph_flush_snaps(ci, &session, 0);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
iput(inode);
spin_lock(&mdsc->snap_flush_lock);
}
@@ -847,7 +847,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
continue;
ci = ceph_inode(inode);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (!ci->i_snap_realm)
goto skip_inode;
/*
@@ -876,7 +876,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
oldrealm = ci->i_snap_realm;
ci->i_snap_realm = realm;
spin_unlock(&realm->inodes_with_caps_lock);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
ceph_get_snap_realm(mdsc, realm);
ceph_put_snap_realm(mdsc, oldrealm);
@@ -885,7 +885,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
continue;
skip_inode:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
iput(inode);
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 88bacaf385d..b48f15f101a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -114,6 +114,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
enum {
Opt_wsize,
Opt_rsize,
+ Opt_rasize,
Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max,
Opt_cap_release_safety,
@@ -136,6 +137,7 @@ enum {
static match_table_t fsopt_tokens = {
{Opt_wsize, "wsize=%d"},
{Opt_rsize, "rsize=%d"},
+ {Opt_rasize, "rasize=%d"},
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
{Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -196,6 +198,9 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_rsize:
fsopt->rsize = intval;
break;
+ case Opt_rasize:
+ fsopt->rasize = intval;
+ break;
case Opt_caps_wanted_delay_min:
fsopt->caps_wanted_delay_min = intval;
break;
@@ -289,28 +294,29 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
- fsopt->sb_flags = flags;
- fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+ fsopt->sb_flags = flags;
+ fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
- fsopt->rsize = CEPH_RSIZE_DEFAULT;
- fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+ fsopt->rsize = CEPH_RSIZE_DEFAULT;
+ fsopt->rasize = CEPH_RASIZE_DEFAULT;
+ fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
- fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
- fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
- fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
- fsopt->congestion_kb = default_congestion_kb();
-
- /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
- err = -EINVAL;
- if (!dev_name)
- goto out;
- *path = strstr(dev_name, ":/");
- if (*path == NULL) {
- pr_err("device name is missing path (no :/ in %s)\n",
- dev_name);
- goto out;
- }
+ fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+ fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+ fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+ fsopt->congestion_kb = default_congestion_kb();
+
+ /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+ err = -EINVAL;
+ if (!dev_name)
+ goto out;
+ *path = strstr(dev_name, ":/");
+ if (*path == NULL) {
+ pr_err("device name is missing path (no :/ in %s)\n",
+ dev_name);
+ goto out;
+ }
dev_name_end = *path;
dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
@@ -376,6 +382,8 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
seq_printf(m, ",rsize=%d", fsopt->rsize);
+ if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
+ seq_printf(m, ",rasize=%d", fsopt->rasize);
if (fsopt->congestion_kb != default_congestion_kb())
seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
@@ -418,24 +426,27 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
/*
* create a new fs client
*/
-struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
+static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_options *opt)
{
struct ceph_fs_client *fsc;
+ const unsigned supported_features =
+ CEPH_FEATURE_FLOCK |
+ CEPH_FEATURE_DIRLAYOUTHASH;
+ const unsigned required_features = 0;
int err = -ENOMEM;
fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
if (!fsc)
return ERR_PTR(-ENOMEM);
- fsc->client = ceph_create_client(opt, fsc);
+ fsc->client = ceph_create_client(opt, fsc, supported_features,
+ required_features);
if (IS_ERR(fsc->client)) {
err = PTR_ERR(fsc->client);
goto fail;
}
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
- CEPH_FEATURE_DIRLAYOUTHASH;
fsc->client->monc.want_mdsmap = 1;
fsc->mount_options = fsopt;
@@ -491,7 +502,7 @@ fail:
return ERR_PTR(err);
}
-void destroy_fs_client(struct ceph_fs_client *fsc)
+static void destroy_fs_client(struct ceph_fs_client *fsc)
{
dout("destroy_fs_client %p\n", fsc);
@@ -627,10 +638,12 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
if (err == 0) {
dout("open_root_inode success\n");
if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
- fsc->sb->s_root == NULL)
+ fsc->sb->s_root == NULL) {
root = d_alloc_root(req->r_target_inode);
- else
+ ceph_init_dentry(root);
+ } else {
root = d_obtain_alias(req->r_target_inode);
+ }
req->r_target_inode = NULL;
dout("open_root_inode success, root dentry is %p\n", root);
} else {
@@ -774,10 +787,10 @@ static int ceph_register_bdi(struct super_block *sb,
{
int err;
- /* set ra_pages based on rsize mount option? */
- if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+ /* set ra_pages based on rasize mount option? */
+ if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
fsc->backing_dev_info.ra_pages =
- (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
+ (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
>> PAGE_SHIFT;
else
fsc->backing_dev_info.ra_pages =
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a23eed526f0..edcbf3774a5 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -36,7 +36,8 @@
#define ceph_test_mount_opt(fsc, opt) \
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
-#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */
+#define CEPH_RSIZE_DEFAULT 0 /* max read size */
+#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */
#define CEPH_MAX_READDIR_DEFAULT 1024
#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
@@ -45,8 +46,9 @@ struct ceph_mount_options {
int flags;
int sb_flags;
- int wsize;
- int rsize; /* max readahead */
+ int wsize; /* max write size */
+ int rsize; /* max read size */
+ int rasize; /* max readahead */
int congestion_kb; /* max writeback in flight */
int caps_wanted_delay_min, caps_wanted_delay_max;
int cap_release_safety;
@@ -201,6 +203,7 @@ struct ceph_inode_xattr {
* Ceph dentry state
*/
struct ceph_dentry_info {
+ unsigned long flags;
struct ceph_mds_session *lease_session;
u32 lease_gen, lease_shared_gen;
u32 lease_seq;
@@ -211,6 +214,18 @@ struct ceph_dentry_info {
u64 offset;
};
+/*
+ * dentry flags
+ *
+ * The locking for D_COMPLETE is a bit odd:
+ * - we can clear it at almost any time (see ceph_d_prune)
+ * - it is only meaningful if:
+ * - we hold dir inode i_ceph_lock
+ * - we hold dir FILE_SHARED caps
+ * - the dentry D_COMPLETE is set
+ */
+#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */
+
struct ceph_inode_xattrs_info {
/*
* (still encoded) xattr blob. we avoid the overhead of parsing
@@ -235,6 +250,8 @@ struct ceph_inode_xattrs_info {
struct ceph_inode_info {
struct ceph_vino i_vino; /* ceph ino + snap */
+ spinlock_t i_ceph_lock;
+
u64 i_version;
u32 i_time_warp_seq;
@@ -249,14 +266,14 @@ struct ceph_inode_info {
struct timespec i_rctime;
u64 i_rbytes, i_rfiles, i_rsubdirs;
u64 i_files, i_subdirs;
- u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
+ u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */
struct rb_root i_fragtree;
struct mutex i_fragtree_mutex;
struct ceph_inode_xattrs_info i_xattrs;
- /* capabilities. protected _both_ by i_lock and cap->session's
+ /* capabilities. protected _both_ by i_ceph_lock and cap->session's
* s_mutex. */
struct rb_root i_caps; /* cap list */
struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
@@ -344,9 +361,10 @@ static inline struct ceph_vino ceph_vino(struct inode *inode)
* x86_64+ino32 64 32
* x86_64 64 64
*/
-static inline u32 ceph_ino_to_ino32(ino_t ino)
+static inline u32 ceph_ino_to_ino32(__u64 vino)
{
- ino ^= ino >> (sizeof(ino) * 8 - 32);
+ u32 ino = vino & 0xffffffff;
+ ino ^= vino >> 32;
if (!ino)
ino = 1;
return ino;
@@ -357,11 +375,11 @@ static inline u32 ceph_ino_to_ino32(ino_t ino)
*/
static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
{
- ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
#if BITS_PER_LONG == 32
- ino = ceph_ino_to_ino32(ino);
+ return ceph_ino_to_ino32(vino.ino);
+#else
+ return (ino_t)vino.ino;
#endif
- return ino;
}
/*
@@ -413,7 +431,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
/*
* Ceph inode.
*/
-#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
#define CEPH_I_NODELAY 4 /* do not delay cap release */
#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
@@ -422,18 +439,18 @@ static inline void ceph_i_clear(struct inode *inode, unsigned mask)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
ci->i_ceph_flags &= ~mask;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
static inline void ceph_i_set(struct inode *inode, unsigned mask)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
ci->i_ceph_flags |= mask;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
static inline bool ceph_i_test(struct inode *inode, unsigned mask)
@@ -441,9 +458,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
struct ceph_inode_info *ci = ceph_inode(inode);
bool r;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
r = (ci->i_ceph_flags & mask) == mask;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return r;
}
@@ -471,6 +488,13 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
}
/*
+ * set/clear directory D_COMPLETE flag
+ */
+void ceph_dir_set_complete(struct inode *inode);
+void ceph_dir_clear_complete(struct inode *inode);
+bool ceph_dir_test_complete(struct inode *inode);
+
+/*
* caps helpers
*/
static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
@@ -486,9 +510,9 @@ extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
static inline int ceph_caps_issued(struct ceph_inode_info *ci)
{
int issued;
- spin_lock(&ci->vfs_inode.i_lock);
+ spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
- spin_unlock(&ci->vfs_inode.i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return issued;
}
@@ -496,9 +520,9 @@ static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
int touch)
{
int r;
- spin_lock(&ci->vfs_inode.i_lock);
+ spin_lock(&ci->i_ceph_lock);
r = __ceph_caps_issued_mask(ci, mask, touch);
- spin_unlock(&ci->vfs_inode.i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return r;
}
@@ -721,10 +745,9 @@ extern int ceph_add_cap(struct inode *inode,
extern void __ceph_remove_cap(struct ceph_cap *cap);
static inline void ceph_remove_cap(struct ceph_cap *cap)
{
- struct inode *inode = &cap->ci->vfs_inode;
- spin_lock(&inode->i_lock);
+ spin_lock(&cap->ci->i_ceph_lock);
__ceph_remove_cap(cap);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&cap->ci->i_ceph_lock);
}
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 96c6739a028..a5e36e4488a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -343,8 +343,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
}
static int __build_xattrs(struct inode *inode)
- __releases(inode->i_lock)
- __acquires(inode->i_lock)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
{
u32 namelen;
u32 numattr = 0;
@@ -372,7 +372,7 @@ start:
end = p + ci->i_xattrs.blob->vec.iov_len;
ceph_decode_32_safe(&p, end, numattr, bad);
xattr_version = ci->i_xattrs.version;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
GFP_NOFS);
@@ -387,7 +387,7 @@ start:
goto bad_lock;
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_xattrs.version != xattr_version) {
/* lost a race, retry */
for (i = 0; i < numattr; i++)
@@ -418,7 +418,7 @@ start:
return err;
bad_lock:
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
bad:
if (xattrs) {
for (i = 0; i < numattr; i++)
@@ -512,7 +512,7 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
if (vxattrs)
vxattr = ceph_match_vxattr(vxattrs, name);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
ci->i_xattrs.version, ci->i_xattrs.index_version);
@@ -520,14 +520,14 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
(ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto get_xattr;
} else {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
/* get xattrs from mds (if we don't already have them) */
err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
if (err)
return err;
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (vxattr && vxattr->readonly) {
err = vxattr->getxattr_cb(ci, value, size);
@@ -558,7 +558,7 @@ get_xattr:
memcpy(value, xattr->val, xattr->val_len);
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return err;
}
@@ -573,7 +573,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
u32 len;
int i;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
ci->i_xattrs.version, ci->i_xattrs.index_version);
@@ -581,13 +581,13 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
(ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto list_xattr;
} else {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
if (err)
return err;
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
err = __build_xattrs(inode);
if (err < 0)
@@ -619,7 +619,7 @@ list_xattr:
}
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return err;
}
@@ -739,7 +739,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
if (!xattr)
goto out;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
retry:
issued = __ceph_caps_issued(ci, NULL);
if (!(issued & CEPH_CAP_XATTR_EXCL))
@@ -752,12 +752,12 @@ retry:
required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
struct ceph_buffer *blob = NULL;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout(" preaallocating new blob size=%d\n", required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob)
goto out;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
ci->i_xattrs.prealloc_blob = blob;
@@ -770,13 +770,13 @@ retry:
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
return err;
do_sync:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
err = ceph_sync_setxattr(dentry, name, value, size, flags);
out:
kfree(newname);
@@ -833,7 +833,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
return -EOPNOTSUPP;
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__build_xattrs(inode);
issued = __ceph_caps_issued(ci, NULL);
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
@@ -846,12 +846,12 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
return err;
do_sync:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
err = ceph_send_removexattr(dentry, name);
return err;
}
diff --git a/fs/cifs/README b/fs/cifs/README
index c5c2c5e5f0f..895da1dc155 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -745,4 +745,18 @@ installed and something like the following lines should be added to the
create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
create dns_resolver * * /usr/local/sbin/cifs.upcall %k
+CIFS kernel module parameters
+=============================
+These module parameters can be specified or modified either during the time of
+module loading or during the runtime by using the interface
+ /proc/module/cifs/parameters/<param>
+
+i.e. echo "value" > /sys/module/cifs/parameters/<param>
+
+1. echo_retries - The number of echo attempts before giving up and
+ reconnecting to the server. The default is 5. The value 0
+ means never reconnect.
+
+2. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.
+ [Y/y/1]. To disable use any of [N/n/0].
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 6d40656e1e2..84e8c072470 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -511,7 +511,7 @@ static const struct file_operations cifsFYI_proc_fops = {
static int cifs_oplock_proc_show(struct seq_file *m, void *v)
{
- seq_printf(m, "%d\n", oplockEnabled);
+ seq_printf(m, "%d\n", enable_oplocks);
return 0;
}
@@ -526,13 +526,16 @@ static ssize_t cifs_oplock_proc_write(struct file *file,
char c;
int rc;
+ printk(KERN_WARNING "CIFS: The /proc/fs/cifs/OplockEnabled interface "
+ "will be removed in kernel version 3.4. Please migrate to "
+ "using the 'enable_oplocks' module parameter in cifs.ko.\n");
rc = get_user(c, buffer);
if (rc)
return rc;
if (c == '0' || c == 'n' || c == 'N')
- oplockEnabled = 0;
+ enable_oplocks = false;
else if (c == '1' || c == 'y' || c == 'Y')
- oplockEnabled = 1;
+ enable_oplocks = true;
return count;
}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 7260e11e21f..500d6585927 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -43,6 +43,8 @@
#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */
#define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */
#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */
+#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */
+#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */
struct cifs_sb_info {
struct rb_root tlink_tree;
@@ -55,6 +57,8 @@ struct cifs_sb_info {
atomic_t active;
uid_t mnt_uid;
gid_t mnt_gid;
+ uid_t mnt_backupuid;
+ gid_t mnt_backupgid;
mode_t mnt_file_mode;
mode_t mnt_dir_mode;
unsigned int mnt_cifs_flags;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index d0f59faefb7..72ddf23ef6f 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -91,9 +91,76 @@ cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
spin_unlock(&sidgidlock);
+ root = &siduidtree;
+ spin_lock(&uidsidlock);
+ shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
+ spin_unlock(&uidsidlock);
+
+ root = &sidgidtree;
+ spin_lock(&gidsidlock);
+ shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
+ spin_unlock(&gidsidlock);
+
return nr_rem;
}
+static void
+sid_rb_insert(struct rb_root *root, unsigned long cid,
+ struct cifs_sid_id **psidid, char *typestr)
+{
+ char *strptr;
+ struct rb_node *node = root->rb_node;
+ struct rb_node *parent = NULL;
+ struct rb_node **linkto = &(root->rb_node);
+ struct cifs_sid_id *lsidid;
+
+ while (node) {
+ lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
+ parent = node;
+ if (cid > lsidid->id) {
+ linkto = &(node->rb_left);
+ node = node->rb_left;
+ }
+ if (cid < lsidid->id) {
+ linkto = &(node->rb_right);
+ node = node->rb_right;
+ }
+ }
+
+ (*psidid)->id = cid;
+ (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
+ (*psidid)->refcount = 0;
+
+ sprintf((*psidid)->sidstr, "%s", typestr);
+ strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr);
+ sprintf(strptr, "%ld", cid);
+
+ clear_bit(SID_ID_PENDING, &(*psidid)->state);
+ clear_bit(SID_ID_MAPPED, &(*psidid)->state);
+
+ rb_link_node(&(*psidid)->rbnode, parent, linkto);
+ rb_insert_color(&(*psidid)->rbnode, root);
+}
+
+static struct cifs_sid_id *
+sid_rb_search(struct rb_root *root, unsigned long cid)
+{
+ struct rb_node *node = root->rb_node;
+ struct cifs_sid_id *lsidid;
+
+ while (node) {
+ lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
+ if (cid > lsidid->id)
+ node = node->rb_left;
+ else if (cid < lsidid->id)
+ node = node->rb_right;
+ else /* node found */
+ return lsidid;
+ }
+
+ return NULL;
+}
+
static struct shrinker cifs_shrinker = {
.shrink = cifs_idmap_shrinker,
.seeks = DEFAULT_SEEKS,
@@ -110,6 +177,7 @@ cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen)
memcpy(payload, data, datalen);
key->payload.data = payload;
+ key->datalen = datalen;
return 0;
}
@@ -224,6 +292,120 @@ sidid_pending_wait(void *unused)
}
static int
+id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid)
+{
+ int rc = 0;
+ struct key *sidkey;
+ const struct cred *saved_cred;
+ struct cifs_sid *lsid;
+ struct cifs_sid_id *psidid, *npsidid;
+ struct rb_root *cidtree;
+ spinlock_t *cidlock;
+
+ if (sidtype == SIDOWNER) {
+ cidlock = &siduidlock;
+ cidtree = &uidtree;
+ } else if (sidtype == SIDGROUP) {
+ cidlock = &sidgidlock;
+ cidtree = &gidtree;
+ } else
+ return -EINVAL;
+
+ spin_lock(cidlock);
+ psidid = sid_rb_search(cidtree, cid);
+
+ if (!psidid) { /* node does not exist, allocate one & attempt adding */
+ spin_unlock(cidlock);
+ npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
+ if (!npsidid)
+ return -ENOMEM;
+
+ npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
+ if (!npsidid->sidstr) {
+ kfree(npsidid);
+ return -ENOMEM;
+ }
+
+ spin_lock(cidlock);
+ psidid = sid_rb_search(cidtree, cid);
+ if (psidid) { /* node happened to get inserted meanwhile */
+ ++psidid->refcount;
+ spin_unlock(cidlock);
+ kfree(npsidid->sidstr);
+ kfree(npsidid);
+ } else {
+ psidid = npsidid;
+ sid_rb_insert(cidtree, cid, &psidid,
+ sidtype == SIDOWNER ? "oi:" : "gi:");
+ ++psidid->refcount;
+ spin_unlock(cidlock);
+ }
+ } else {
+ ++psidid->refcount;
+ spin_unlock(cidlock);
+ }
+
+ /*
+ * If we are here, it is safe to access psidid and its fields
+ * since a reference was taken earlier while holding the spinlock.
+ * A reference on the node is put without holding the spinlock
+ * and it is OK to do so in this case, shrinker will not erase
+ * this node until all references are put and we do not access
+ * any fields of the node after a reference is put .
+ */
+ if (test_bit(SID_ID_MAPPED, &psidid->state)) {
+ memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
+ psidid->time = jiffies; /* update ts for accessing */
+ goto id_sid_out;
+ }
+
+ if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) {
+ rc = -EINVAL;
+ goto id_sid_out;
+ }
+
+ if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) {
+ saved_cred = override_creds(root_cred);
+ sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, "");
+ if (IS_ERR(sidkey)) {
+ rc = -EINVAL;
+ cFYI(1, "%s: Can't map and id to a SID", __func__);
+ } else {
+ lsid = (struct cifs_sid *)sidkey->payload.data;
+ memcpy(&psidid->sid, lsid,
+ sidkey->datalen < sizeof(struct cifs_sid) ?
+ sidkey->datalen : sizeof(struct cifs_sid));
+ memcpy(ssid, &psidid->sid,
+ sidkey->datalen < sizeof(struct cifs_sid) ?
+ sidkey->datalen : sizeof(struct cifs_sid));
+ set_bit(SID_ID_MAPPED, &psidid->state);
+ key_put(sidkey);
+ kfree(psidid->sidstr);
+ }
+ psidid->time = jiffies; /* update ts for accessing */
+ revert_creds(saved_cred);
+ clear_bit(SID_ID_PENDING, &psidid->state);
+ wake_up_bit(&psidid->state, SID_ID_PENDING);
+ } else {
+ rc = wait_on_bit(&psidid->state, SID_ID_PENDING,
+ sidid_pending_wait, TASK_INTERRUPTIBLE);
+ if (rc) {
+ cFYI(1, "%s: sidid_pending_wait interrupted %d",
+ __func__, rc);
+ --psidid->refcount;
+ return rc;
+ }
+ if (test_bit(SID_ID_MAPPED, &psidid->state))
+ memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
+ else
+ rc = -EINVAL;
+ }
+id_sid_out:
+ --psidid->refcount;
+ return rc;
+}
+
+static int
sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
struct cifs_fattr *fattr, uint sidtype)
{
@@ -383,6 +565,10 @@ init_cifs_idmap(void)
spin_lock_init(&sidgidlock);
gidtree = RB_ROOT;
+ spin_lock_init(&uidsidlock);
+ siduidtree = RB_ROOT;
+ spin_lock_init(&gidsidlock);
+ sidgidtree = RB_ROOT;
register_shrinker(&cifs_shrinker);
cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring));
@@ -422,6 +608,18 @@ cifs_destroy_idmaptrees(void)
while ((node = rb_first(root)))
rb_erase(node, root);
spin_unlock(&sidgidlock);
+
+ root = &siduidtree;
+ spin_lock(&uidsidlock);
+ while ((node = rb_first(root)))
+ rb_erase(node, root);
+ spin_unlock(&uidsidlock);
+
+ root = &sidgidtree;
+ spin_lock(&gidsidlock);
+ while ((node = rb_first(root)))
+ rb_erase(node, root);
+ spin_unlock(&gidsidlock);
}
/* if the two SIDs (roughly equivalent to a UUID for a user or group) are
@@ -706,7 +904,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
acl_size = sizeof(struct cifs_acl);
num_aces = le32_to_cpu(pdacl->num_aces);
- if (num_aces > 0) {
+ if (num_aces > 0) {
umode_t user_mask = S_IRWXU;
umode_t group_mask = S_IRWXG;
umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
@@ -868,52 +1066,82 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
else
cFYI(1, "no ACL"); /* BB grant all or default perms? */
-/* cifscred->uid = owner_sid_ptr->rid;
- cifscred->gid = group_sid_ptr->rid;
- memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr,
- sizeof(struct cifs_sid));
- memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr,
- sizeof(struct cifs_sid)); */
-
return rc;
}
-
/* Convert permission bits from mode to equivalent CIFS ACL */
static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
- struct inode *inode, __u64 nmode)
+ __u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag)
{
int rc = 0;
__u32 dacloffset;
__u32 ndacloffset;
__u32 sidsoffset;
struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
+ struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */
struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */
- if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL))
- return -EIO;
-
- owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ if (nmode != NO_CHANGE_64) { /* chmod */
+ owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
- group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
le32_to_cpu(pntsd->gsidoffset));
-
- dacloffset = le32_to_cpu(pntsd->dacloffset);
- dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
-
- ndacloffset = sizeof(struct cifs_ntsd);
- ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
- ndacl_ptr->revision = dacl_ptr->revision;
- ndacl_ptr->size = 0;
- ndacl_ptr->num_aces = 0;
-
- rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode);
-
- sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
-
- /* copy security descriptor control portion and owner and group sid */
- copy_sec_desc(pntsd, pnntsd, sidsoffset);
+ dacloffset = le32_to_cpu(pntsd->dacloffset);
+ dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
+ ndacloffset = sizeof(struct cifs_ntsd);
+ ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
+ ndacl_ptr->revision = dacl_ptr->revision;
+ ndacl_ptr->size = 0;
+ ndacl_ptr->num_aces = 0;
+
+ rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr,
+ nmode);
+ sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
+ /* copy sec desc control portion & owner and group sids */
+ copy_sec_desc(pntsd, pnntsd, sidsoffset);
+ *aclflag = CIFS_ACL_DACL;
+ } else {
+ memcpy(pnntsd, pntsd, secdesclen);
+ if (uid != NO_CHANGE_32) { /* chown */
+ owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
+ le32_to_cpu(pnntsd->osidoffset));
+ nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+ GFP_KERNEL);
+ if (!nowner_sid_ptr)
+ return -ENOMEM;
+ rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr);
+ if (rc) {
+ cFYI(1, "%s: Mapping error %d for owner id %d",
+ __func__, rc, uid);
+ kfree(nowner_sid_ptr);
+ return rc;
+ }
+ memcpy(owner_sid_ptr, nowner_sid_ptr,
+ sizeof(struct cifs_sid));
+ kfree(nowner_sid_ptr);
+ *aclflag = CIFS_ACL_OWNER;
+ }
+ if (gid != NO_CHANGE_32) { /* chgrp */
+ group_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
+ le32_to_cpu(pnntsd->gsidoffset));
+ ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+ GFP_KERNEL);
+ if (!ngroup_sid_ptr)
+ return -ENOMEM;
+ rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr);
+ if (rc) {
+ cFYI(1, "%s: Mapping error %d for group id %d",
+ __func__, rc, gid);
+ kfree(ngroup_sid_ptr);
+ return rc;
+ }
+ memcpy(group_sid_ptr, ngroup_sid_ptr,
+ sizeof(struct cifs_sid));
+ kfree(ngroup_sid_ptr);
+ *aclflag = CIFS_ACL_GROUP;
+ }
+ }
return rc;
}
@@ -945,7 +1173,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
{
struct cifs_ntsd *pntsd = NULL;
int oplock = 0;
- int xid, rc;
+ int xid, rc, create_options = 0;
__u16 fid;
struct cifs_tcon *tcon;
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
@@ -956,9 +1184,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
tcon = tlink_tcon(tlink);
xid = GetXid();
- rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
- &fid, &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL,
+ create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
if (!rc) {
rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
CIFSSMBClose(xid, tcon, fid);
@@ -991,13 +1222,15 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
return pntsd;
}
-static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
- struct cifs_ntsd *pnntsd, u32 acllen)
+ /* Set an ACL on the server */
+int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+ struct inode *inode, const char *path, int aclflag)
{
int oplock = 0;
- int xid, rc;
+ int xid, rc, access_flags, create_options = 0;
__u16 fid;
struct cifs_tcon *tcon;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -1006,15 +1239,23 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
tcon = tlink_tcon(tlink);
xid = GetXid();
- rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0,
- &fid, &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP)
+ access_flags = WRITE_OWNER;
+ else
+ access_flags = WRITE_DAC;
+
+ rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags,
+ create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
if (rc) {
cERROR(1, "Unable to open file to set ACL");
goto out;
}
- rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen);
+ rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag);
cFYI(DBG2, "SetCIFSACL rc = %d", rc);
CIFSSMBClose(xid, tcon, fid);
@@ -1024,17 +1265,6 @@ out:
return rc;
}
-/* Set an ACL on the server */
-int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
- struct inode *inode, const char *path)
-{
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-
- cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
-
- return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
-}
-
/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
int
cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
@@ -1066,9 +1296,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
}
/* Convert mode bits to an ACL so we can update the ACL on the server */
-int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
+int
+id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
+ uid_t uid, gid_t gid)
{
int rc = 0;
+ int aclflag = CIFS_ACL_DACL; /* default flag to set */
__u32 secdesclen = 0;
struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
@@ -1098,13 +1331,15 @@ int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
return -ENOMEM;
}
- rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
+ rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
+ &aclflag);
cFYI(DBG2, "build_sec_desc rc: %d", rc);
if (!rc) {
/* Set the security descriptor */
- rc = set_cifs_acl(pnntsd, secdesclen, inode, path);
+ rc = set_cifs_acl(pnntsd, secdesclen, inode,
+ path, aclflag);
cFYI(DBG2, "set_cifs_acl rc: %d", rc);
}
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 30acd22147e..5d9b9acc5fc 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -37,83 +37,8 @@
* the sequence number before this function is called. Also, this function
* should be called with the server->srv_mutex held.
*/
-static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
- struct TCP_Server_Info *server, char *signature)
-{
- int rc;
-
- if (cifs_pdu == NULL || signature == NULL || server == NULL)
- return -EINVAL;
-
- if (!server->secmech.sdescmd5) {
- cERROR(1, "%s: Can't generate signature\n", __func__);
- return -1;
- }
-
- rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
- if (rc) {
- cERROR(1, "%s: Could not init md5\n", __func__);
- return rc;
- }
-
- rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
- server->session_key.response, server->session_key.len);
- if (rc) {
- cERROR(1, "%s: Could not update with response\n", __func__);
- return rc;
- }
-
- rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
- cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length));
- if (rc) {
- cERROR(1, "%s: Could not update with payload\n", __func__);
- return rc;
- }
-
- rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
- if (rc)
- cERROR(1, "%s: Could not generate md5 hash\n", __func__);
-
- return rc;
-}
-
-/* must be called with server->srv_mutex held */
-int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
- __u32 *pexpected_response_sequence_number)
-{
- int rc = 0;
- char smb_signature[20];
-
- if ((cifs_pdu == NULL) || (server == NULL))
- return -EINVAL;
-
- if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
- server->tcpStatus == CifsNeedNegotiate)
- return rc;
-
- if (!server->session_estab) {
- strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
- return rc;
- }
-
- cifs_pdu->Signature.Sequence.SequenceNumber =
- cpu_to_le32(server->sequence_number);
- cifs_pdu->Signature.Sequence.Reserved = 0;
-
- *pexpected_response_sequence_number = server->sequence_number++;
- server->sequence_number++;
-
- rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
- if (rc)
- memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
- else
- memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8);
-
- return rc;
-}
-
-static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
- struct TCP_Server_Info *server, char *signature)
+static int cifs_calc_signature(const struct kvec *iov, int n_vec,
+ struct TCP_Server_Info *server, char *signature)
{
int i;
int rc;
@@ -179,7 +104,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
{
int rc = 0;
char smb_signature[20];
- struct smb_hdr *cifs_pdu = iov[0].iov_base;
+ struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base;
if ((cifs_pdu == NULL) || (server == NULL))
return -EINVAL;
@@ -189,7 +114,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
return rc;
if (!server->session_estab) {
- strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
+ memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
return rc;
}
@@ -200,7 +125,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
*pexpected_response_sequence_number = server->sequence_number++;
server->sequence_number++;
- rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
+ rc = cifs_calc_signature(iov, n_vec, server, smb_signature);
if (rc)
memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
else
@@ -209,13 +134,27 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
return rc;
}
-int cifs_verify_signature(struct smb_hdr *cifs_pdu,
+/* must be called with server->srv_mutex held */
+int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
+ __u32 *pexpected_response_sequence_number)
+{
+ struct kvec iov;
+
+ iov.iov_base = cifs_pdu;
+ iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4;
+
+ return cifs_sign_smb2(&iov, 1, server,
+ pexpected_response_sequence_number);
+}
+
+int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
struct TCP_Server_Info *server,
__u32 expected_sequence_number)
{
unsigned int rc;
char server_response_sig[8];
char what_we_think_sig_should_be[20];
+ struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base;
if (cifs_pdu == NULL || server == NULL)
return -EINVAL;
@@ -247,8 +186,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
cifs_pdu->Signature.Sequence.Reserved = 0;
mutex_lock(&server->srv_mutex);
- rc = cifs_calculate_signature(cifs_pdu, server,
- what_we_think_sig_should_be);
+ rc = cifs_calc_signature(iov, nr_iov, server,
+ what_we_think_sig_should_be);
mutex_unlock(&server->srv_mutex);
if (rc)
@@ -265,7 +204,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
}
/* first calculate 24 bytes ntlm response and then 16 byte session key */
-int setup_ntlm_response(struct cifs_ses *ses)
+int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp)
{
int rc = 0;
unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
@@ -282,14 +221,14 @@ int setup_ntlm_response(struct cifs_ses *ses)
ses->auth_key.len = temp_len;
rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
- ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+ ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp);
if (rc) {
cFYI(1, "%s Can't generate NTLM response, error: %d",
__func__, rc);
return rc;
}
- rc = E_md4hash(ses->password, temp_key);
+ rc = E_md4hash(ses->password, temp_key, nls_cp);
if (rc) {
cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
return rc;
@@ -465,7 +404,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
}
/* calculate md4 hash of password */
- E_md4hash(ses->password, nt_hash);
+ E_md4hash(ses->password, nt_hash, nls_cp);
rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
CIFS_NTHASH_SIZE);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 54b8f1e7da9..8f1fe324162 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -53,7 +53,7 @@
int cifsFYI = 0;
int cifsERROR = 1;
int traceSMB = 0;
-unsigned int oplockEnabled = 1;
+bool enable_oplocks = true;
unsigned int linuxExtEnabled = 1;
unsigned int lookupCacheEnabled = 1;
unsigned int multiuser_mount = 0;
@@ -74,7 +74,7 @@ module_param(cifs_min_small, int, 0);
MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
"Range: 2 to 256");
unsigned int cifs_max_pending = CIFS_MAX_REQ;
-module_param(cifs_max_pending, int, 0);
+module_param(cifs_max_pending, int, 0444);
MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
"Default: 50 Range: 2 to 256");
unsigned short echo_retries = 5;
@@ -82,6 +82,10 @@ module_param(echo_retries, ushort, 0644);
MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
"reconnecting server. Default: 5. 0 means "
"never reconnect.");
+module_param(enable_oplocks, bool, 0644);
+MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:"
+ "y/Y/1");
+
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
@@ -132,12 +136,12 @@ cifs_read_super(struct super_block *sb)
else
sb->s_d_op = &cifs_dentry_ops;
-#ifdef CIFS_NFSD_EXPORT
+#ifdef CONFIG_CIFS_NFSD_EXPORT
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
cFYI(1, "export ops supported");
sb->s_export_op = &cifs_export_ops;
}
-#endif /* CIFS_NFSD_EXPORT */
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
return 0;
@@ -432,6 +436,12 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
seq_printf(s, ",mfsymlinks");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
seq_printf(s, ",fsc");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)
+ seq_printf(s, ",nostrictsync");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
+ seq_printf(s, ",noperm");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+ seq_printf(s, ",strictcache");
seq_printf(s, ",rsize=%d", cifs_sb->rsize);
seq_printf(s, ",wsize=%d", cifs_sb->wsize);
@@ -530,7 +540,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
char *full_path = NULL;
char *s, *p;
char sep;
- int xid;
full_path = cifs_build_path_to_root(vol, cifs_sb,
cifs_sb_master_tcon(cifs_sb));
@@ -539,7 +548,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
cFYI(1, "Get root dentry for %s", full_path);
- xid = GetXid();
sep = CIFS_DIR_SEP(cifs_sb);
dentry = dget(sb->s_root);
p = s = full_path;
@@ -570,7 +578,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
dput(dentry);
dentry = child;
} while (!IS_ERR(dentry));
- _FreeXid(xid);
kfree(full_path);
return dentry;
}
@@ -723,7 +730,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
if (rc < 0)
return (loff_t)rc;
}
- return generic_file_llseek_unlocked(file, offset, origin);
+ return generic_file_llseek(file, offset, origin);
}
static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
@@ -942,7 +949,8 @@ cifs_init_once(void *inode)
struct cifsInodeInfo *cifsi = inode;
inode_init_once(&cifsi->vfs_inode);
- INIT_LIST_HEAD(&cifsi->lockList);
+ INIT_LIST_HEAD(&cifsi->llist);
+ mutex_init(&cifsi->lock_mutex);
}
static int
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 95da8027983..30ff56005d8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -121,9 +121,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
-#ifdef CIFS_NFSD_EXPORT
+#ifdef CONFIG_CIFS_NFSD_EXPORT
extern const struct export_operations cifs_export_ops;
-#endif /* CIFS_NFSD_EXPORT */
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "1.75"
+#define CIFS_VERSION "1.76"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 95dad9d14cf..8238aa13e01 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -167,6 +167,8 @@ struct smb_vol {
uid_t cred_uid;
uid_t linux_uid;
gid_t linux_gid;
+ uid_t backupuid;
+ gid_t backupgid;
mode_t file_mode;
mode_t dir_mode;
unsigned secFlg;
@@ -179,6 +181,8 @@ struct smb_vol {
bool noperm:1;
bool no_psx_acl:1; /* set if posix acl support should be disabled */
bool cifs_acl:1;
+ bool backupuid_specified; /* mount option backupuid is specified */
+ bool backupgid_specified; /* mount option backupgid is specified */
bool no_xattr:1; /* set if xattr (EA) support should be disabled*/
bool server_ino:1; /* use inode numbers from server ie UniqueId */
bool direct_io:1;
@@ -219,7 +223,8 @@ struct smb_vol {
CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \
CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \
CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \
- CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO)
+ CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \
+ CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID)
#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \
MS_NODEV | MS_SYNCHRONOUS)
@@ -286,7 +291,13 @@ struct TCP_Server_Info {
bool sec_kerberosu2u; /* supports U2U Kerberos */
bool sec_kerberos; /* supports plain Kerberos */
bool sec_mskerberos; /* supports legacy MS Kerberos */
+ bool large_buf; /* is current buffer large? */
struct delayed_work echo; /* echo ping workqueue job */
+ struct kvec *iov; /* reusable kvec array for receives */
+ unsigned int nr_iov; /* number of kvecs in array */
+ char *smallbuf; /* pointer to current "small" buffer */
+ char *bigbuf; /* pointer to current "big" buffer */
+ unsigned int total_read; /* total amount of data read in this pass */
#ifdef CONFIG_CIFS_FSCACHE
struct fscache_cookie *fscache; /* client index cache cookie */
#endif
@@ -485,9 +496,13 @@ extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
*/
struct cifsLockInfo {
struct list_head llist; /* pointer to next cifsLockInfo */
+ struct list_head blist; /* pointer to locks blocked on this */
+ wait_queue_head_t block_q;
__u64 offset;
__u64 length;
+ __u32 pid;
__u8 type;
+ __u16 netfid;
};
/*
@@ -520,8 +535,6 @@ struct cifsFileInfo {
struct dentry *dentry;
unsigned int f_flags;
struct tcon_link *tlink;
- struct mutex lock_mutex;
- struct list_head llist; /* list of byte range locks we have. */
bool invalidHandle:1; /* file closed via session abend */
bool oplock_break_cancelled:1;
int count; /* refcount protected by cifs_file_list_lock */
@@ -554,7 +567,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
*/
struct cifsInodeInfo {
- struct list_head lockList;
+ struct list_head llist; /* brlocks for this inode */
+ bool can_cache_brlcks;
+ struct mutex lock_mutex; /* protect two fields above */
/* BB add in lists for dirty pages i.e. write caching info for oplock */
struct list_head openFileList;
__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
@@ -643,8 +658,24 @@ static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon,
struct mid_q_entry;
/*
- * This is the prototype for the mid callback function. When creating one,
- * take special care to avoid deadlocks. Things to bear in mind:
+ * This is the prototype for the mid receive function. This function is for
+ * receiving the rest of the SMB frame, starting with the WordCount (which is
+ * just after the MID in struct smb_hdr). Note:
+ *
+ * - This will be called by cifsd, with no locks held.
+ * - The mid will still be on the pending_mid_q.
+ * - mid->resp_buf will point to the current buffer.
+ *
+ * Returns zero on a successful receive, or an error. The receive state in
+ * the TCP_Server_Info will also be updated.
+ */
+typedef int (mid_receive_t)(struct TCP_Server_Info *server,
+ struct mid_q_entry *mid);
+
+/*
+ * This is the prototype for the mid callback function. This is called once the
+ * mid has been received off of the socket. When creating one, take special
+ * care to avoid deadlocks. Things to bear in mind:
*
* - it will be called by cifsd, with no locks held
* - the mid will be removed from any lists
@@ -662,9 +693,10 @@ struct mid_q_entry {
unsigned long when_sent; /* time when smb send finished */
unsigned long when_received; /* when demux complete (taken off wire) */
#endif
+ mid_receive_t *receive; /* call receive callback */
mid_callback_t *callback; /* call completion callback */
void *callback_data; /* general purpose pointer for callback */
- struct smb_hdr *resp_buf; /* response buffer */
+ struct smb_hdr *resp_buf; /* pointer to received SMB header */
int midState; /* wish this were enum but can not pass to wait_event */
__u8 command; /* smb command code */
bool largeBuf:1; /* if valid response, is pointer to large buf */
@@ -964,7 +996,8 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
to be established on existing mount if we
have the uid/password or Kerberos credential
or equivalent for current user */
-GLOBAL_EXTERN unsigned int oplockEnabled;
+/* enable or disable oplocks */
+GLOBAL_EXTERN bool enable_oplocks;
GLOBAL_EXTERN unsigned int lookupCacheEnabled;
GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
with more secure ntlmssp2 challenge/resp */
@@ -978,10 +1011,16 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
/* reconnect after this many failed echo attempts */
GLOBAL_EXTERN unsigned short echo_retries;
+#ifdef CONFIG_CIFS_ACL
GLOBAL_EXTERN struct rb_root uidtree;
GLOBAL_EXTERN struct rb_root gidtree;
GLOBAL_EXTERN spinlock_t siduidlock;
GLOBAL_EXTERN spinlock_t sidgidlock;
+GLOBAL_EXTERN struct rb_root siduidtree;
+GLOBAL_EXTERN struct rb_root sidgidtree;
+GLOBAL_EXTERN spinlock_t uidsidlock;
+GLOBAL_EXTERN spinlock_t gidsidlock;
+#endif /* CONFIG_CIFS_ACL */
void cifs_oplock_break(struct work_struct *work);
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index de3aa285de0..3fb03e2c8e8 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1089,9 +1089,7 @@ typedef struct smb_com_read_rsp {
__le16 DataLengthHigh;
__u64 Reserved2;
__u16 ByteCount;
- __u8 Pad; /* BB check for whether padded to DWORD
- boundary and optimum performance here */
- char Data[1];
+ /* read response data immediately follows */
} __attribute__((packed)) READ_RSP;
typedef struct locking_andx_range {
@@ -1913,6 +1911,10 @@ typedef struct whoami_rsp_data { /* Query level 0x202 */
/* SETFSInfo Levels */
#define SMB_SET_CIFS_UNIX_INFO 0x200
+/* level 0x203 is defined above in list of QFS info levels */
+/* #define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 */
+
+/* Level 0x200 request structure follows */
typedef struct smb_com_transaction2_setfsi_req {
struct smb_hdr hdr; /* wct = 15 */
__le16 TotalParameterCount;
@@ -1940,13 +1942,39 @@ typedef struct smb_com_transaction2_setfsi_req {
__le64 ClientUnixCap; /* Data end */
} __attribute__((packed)) TRANSACTION2_SETFSI_REQ;
+/* level 0x203 request structure follows */
+typedef struct smb_com_transaction2_setfs_enc_req {
+ struct smb_hdr hdr; /* wct = 15 */
+ __le16 TotalParameterCount;
+ __le16 TotalDataCount;
+ __le16 MaxParameterCount;
+ __le16 MaxDataCount;
+ __u8 MaxSetupCount;
+ __u8 Reserved;
+ __le16 Flags;
+ __le32 Timeout;
+ __u16 Reserved2;
+ __le16 ParameterCount; /* 4 */
+ __le16 ParameterOffset;
+ __le16 DataCount; /* 12 */
+ __le16 DataOffset;
+ __u8 SetupCount; /* one */
+ __u8 Reserved3;
+ __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */
+ __le16 ByteCount;
+ __u8 Pad;
+ __u16 Reserved4; /* Parameters start. */
+ __le16 InformationLevel;/* Parameters end. */
+ /* NTLMSSP Blob, Data start. */
+} __attribute__((packed)) TRANSACTION2_SETFSI_ENC_REQ;
+
+/* response for setfsinfo levels 0x200 and 0x203 */
typedef struct smb_com_transaction2_setfsi_rsp {
struct smb_hdr hdr; /* wct = 10 */
struct trans2_resp t2;
__u16 ByteCount;
} __attribute__((packed)) TRANSACTION2_SETFSI_RSP;
-
typedef struct smb_com_transaction2_get_dfs_refer_req {
struct smb_hdr hdr; /* wct = 15 */
__le16 TotalParameterCount;
@@ -2098,13 +2126,13 @@ typedef struct {
#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and
QFS PROXY call */
#ifdef CONFIG_CIFS_POSIX
-/* Can not set pathnames cap yet until we send new posix create SMB since
- otherwise server can treat such handles opened with older ntcreatex
- (by a new client which knows how to send posix path ops)
- as non-posix handles (can affect write behavior with byte range locks.
- We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */
+/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send
+ LockingX instead of posix locking call on unix sess (and we do not expect
+ LockingX to use different (ie Windows) semantics than posix locking on
+ the same session (if WINE needs to do this later, we can add this cap
+ back in later */
/* #define CIFS_UNIX_CAP_MASK 0x000000fb */
-#define CIFS_UNIX_CAP_MASK 0x000000db
+#define CIFS_UNIX_CAP_MASK 0x000003db
#else
#define CIFS_UNIX_CAP_MASK 0x00000013
#endif /* CONFIG_CIFS_POSIX */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8df28e925e5..6f4e243e0f6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -69,8 +69,9 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
struct TCP_Server_Info *server);
extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
- unsigned int nvec, mid_callback_t *callback,
- void *cbdata, bool ignore_pend);
+ unsigned int nvec, mid_receive_t *receive,
+ mid_callback_t *callback, void *cbdata,
+ bool ignore_pend);
extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
struct smb_hdr * /* input */ ,
struct smb_hdr * /* out */ ,
@@ -90,6 +91,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
extern bool is_valid_oplock_break(struct smb_hdr *smb,
struct TCP_Server_Info *);
+extern bool backup_cred(struct cifs_sb_info *);
extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
@@ -145,12 +147,19 @@ extern int cifs_get_inode_info_unix(struct inode **pinode,
extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr, struct inode *inode,
const char *path, const __u16 *pfid);
-extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
+extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64,
+ uid_t, gid_t);
extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
const char *, u32 *);
extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
- const char *);
+ const char *, int);
+extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
+extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
+ unsigned int to_read);
+extern int cifs_readv_from_socket(struct TCP_Server_Info *server,
+ struct kvec *iov_orig, unsigned int nr_segs,
+ unsigned int to_read);
extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
struct cifs_sb_info *cifs_sb);
extern int cifs_match_super(struct super_block *, void *);
@@ -359,14 +368,17 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon,
const struct nls_table *nls_codepage,
int remap_special_chars);
+extern int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid,
+ const __u8 lock_type, const __u32 num_unlock,
+ const __u32 num_lock, LOCKING_ANDX_RANGE *buf);
extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
- const __u16 netfid, const __u64 len,
+ const __u16 netfid, const __u32 netpid, const __u64 len,
const __u64 offset, const __u32 numUnlock,
const __u32 numLock, const __u8 lockType,
const bool waitFlag, const __u8 oplock_level);
extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
- const __u16 smb_file_id, const int get_flag,
- const __u64 len, struct file_lock *,
+ const __u16 smb_file_id, const __u32 netpid,
+ const int get_flag, const __u64 len, struct file_lock *,
const __u16 lock_type, const bool waitFlag);
extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon);
extern int CIFSSMBEcho(struct TCP_Server_Info *server);
@@ -380,11 +392,12 @@ extern void tconInfoFree(struct cifs_tcon *);
extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
__u32 *);
-extern int cifs_verify_signature(struct smb_hdr *,
+extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
struct TCP_Server_Info *server,
__u32 expected_sequence_number);
-extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
-extern int setup_ntlm_response(struct cifs_ses *);
+extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
+ const struct nls_table *);
+extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
@@ -419,7 +432,7 @@ extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon,
extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon,
__u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16,
- struct cifs_ntsd *, __u32);
+ struct cifs_ntsd *, __u32, int);
extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon,
const unsigned char *searchName,
char *acl_inf, const int buflen, const int acl_type,
@@ -436,10 +449,29 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
const unsigned char *path,
struct cifs_sb_info *cifs_sb, int xid);
extern int mdfour(unsigned char *, unsigned char *, int);
-extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
+extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
+ const struct nls_table *codepage);
extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
unsigned char *p24);
+/* asynchronous read support */
+struct cifs_readdata {
+ struct cifsFileInfo *cfile;
+ struct address_space *mapping;
+ __u64 offset;
+ unsigned int bytes;
+ pid_t pid;
+ int result;
+ struct list_head pages;
+ struct work_struct work;
+ unsigned int nr_iov;
+ struct kvec iov[1];
+};
+
+struct cifs_readdata *cifs_readdata_alloc(unsigned int nr_pages);
+void cifs_readdata_free(struct cifs_readdata *rdata);
+int cifs_async_readv(struct cifs_readdata *rdata);
+
/* asynchronous write support */
struct cifs_writedata {
struct kref refcount;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a80f7bd97b9..6600aa2d2ef 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -33,6 +33,8 @@
#include <linux/slab.h>
#include <linux/posix_acl_xattr.h>
#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/task_io_accounting_ops.h>
#include <asm/uaccess.h>
#include "cifspdu.h"
#include "cifsglob.h"
@@ -40,6 +42,7 @@
#include "cifsproto.h"
#include "cifs_unicode.h"
#include "cifs_debug.h"
+#include "fscache.h"
#ifdef CONFIG_CIFS_POSIX
static struct {
@@ -83,6 +86,9 @@ static struct {
#endif /* CONFIG_CIFS_WEAK_PW_HASH */
#endif /* CIFS_POSIX */
+/* Forward declarations */
+static void cifs_readv_complete(struct work_struct *work);
+
/* Mark as invalid, all open files on tree connections since they
were closed when session to server was lost */
static void mark_open_files_invalid(struct cifs_tcon *pTcon)
@@ -453,8 +459,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
}
server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode);
server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
- server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
- (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+ server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
/* even though we do not use raw we might as well set this
accurately, in case we ever find a need for it */
@@ -561,8 +566,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
little endian */
server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount);
/* probably no need to store and check maxvcs */
- server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
- (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+ server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
server->capabilities = le32_to_cpu(pSMBr->Capabilities);
@@ -739,7 +743,8 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
iov.iov_base = smb;
iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
- rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true);
+ rc = cifs_call_async(server, &iov, 1, NULL, cifs_echo_callback,
+ server, true);
if (rc)
cFYI(1, "Echo request failed: %d", rc);
@@ -1376,6 +1381,359 @@ openRetry:
return rc;
}
+struct cifs_readdata *
+cifs_readdata_alloc(unsigned int nr_pages)
+{
+ struct cifs_readdata *rdata;
+
+ /* readdata + 1 kvec for each page */
+ rdata = kzalloc(sizeof(*rdata) +
+ sizeof(struct kvec) * nr_pages, GFP_KERNEL);
+ if (rdata != NULL) {
+ INIT_WORK(&rdata->work, cifs_readv_complete);
+ INIT_LIST_HEAD(&rdata->pages);
+ }
+ return rdata;
+}
+
+void
+cifs_readdata_free(struct cifs_readdata *rdata)
+{
+ cifsFileInfo_put(rdata->cfile);
+ kfree(rdata);
+}
+
+/*
+ * Discard any remaining data in the current SMB. To do this, we borrow the
+ * current bigbuf.
+ */
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ READ_RSP *rsp = (READ_RSP *)server->smallbuf;
+ unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length);
+ int remaining = rfclen + 4 - server->total_read;
+ struct cifs_readdata *rdata = mid->callback_data;
+
+ while (remaining > 0) {
+ int length;
+
+ length = cifs_read_from_socket(server, server->bigbuf,
+ min_t(unsigned int, remaining,
+ CIFSMaxBufSize + MAX_CIFS_HDR_SIZE));
+ if (length < 0)
+ return length;
+ server->total_read += length;
+ remaining -= length;
+ }
+
+ dequeue_mid(mid, rdata->result);
+ return 0;
+}
+
+static int
+cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ int length, len;
+ unsigned int data_offset, remaining, data_len;
+ struct cifs_readdata *rdata = mid->callback_data;
+ READ_RSP *rsp = (READ_RSP *)server->smallbuf;
+ unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length) + 4;
+ u64 eof;
+ pgoff_t eof_index;
+ struct page *page, *tpage;
+
+ cFYI(1, "%s: mid=%u offset=%llu bytes=%u", __func__,
+ mid->mid, rdata->offset, rdata->bytes);
+
+ /*
+ * read the rest of READ_RSP header (sans Data array), or whatever we
+ * can if there's not enough data. At this point, we've read down to
+ * the Mid.
+ */
+ len = min_t(unsigned int, rfclen, sizeof(*rsp)) -
+ sizeof(struct smb_hdr) + 1;
+
+ rdata->iov[0].iov_base = server->smallbuf + sizeof(struct smb_hdr) - 1;
+ rdata->iov[0].iov_len = len;
+
+ length = cifs_readv_from_socket(server, rdata->iov, 1, len);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+
+ /* Was the SMB read successful? */
+ rdata->result = map_smb_to_linux_error(&rsp->hdr, false);
+ if (rdata->result != 0) {
+ cFYI(1, "%s: server returned error %d", __func__,
+ rdata->result);
+ return cifs_readv_discard(server, mid);
+ }
+
+ /* Is there enough to get to the rest of the READ_RSP header? */
+ if (server->total_read < sizeof(READ_RSP)) {
+ cFYI(1, "%s: server returned short header. got=%u expected=%zu",
+ __func__, server->total_read, sizeof(READ_RSP));
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ data_offset = le16_to_cpu(rsp->DataOffset) + 4;
+ if (data_offset < server->total_read) {
+ /*
+ * win2k8 sometimes sends an offset of 0 when the read
+ * is beyond the EOF. Treat it as if the data starts just after
+ * the header.
+ */
+ cFYI(1, "%s: data offset (%u) inside read response header",
+ __func__, data_offset);
+ data_offset = server->total_read;
+ } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
+ /* data_offset is beyond the end of smallbuf */
+ cFYI(1, "%s: data offset (%u) beyond end of smallbuf",
+ __func__, data_offset);
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ cFYI(1, "%s: total_read=%u data_offset=%u", __func__,
+ server->total_read, data_offset);
+
+ len = data_offset - server->total_read;
+ if (len > 0) {
+ /* read any junk before data into the rest of smallbuf */
+ rdata->iov[0].iov_base = server->smallbuf + server->total_read;
+ rdata->iov[0].iov_len = len;
+ length = cifs_readv_from_socket(server, rdata->iov, 1, len);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+ }
+
+ /* set up first iov for signature check */
+ rdata->iov[0].iov_base = server->smallbuf;
+ rdata->iov[0].iov_len = server->total_read;
+ cFYI(1, "0: iov_base=%p iov_len=%zu",
+ rdata->iov[0].iov_base, rdata->iov[0].iov_len);
+
+ /* how much data is in the response? */
+ data_len = le16_to_cpu(rsp->DataLengthHigh) << 16;
+ data_len += le16_to_cpu(rsp->DataLength);
+ if (data_offset + data_len > rfclen) {
+ /* data_len is corrupt -- discard frame */
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ /* marshal up the page array */
+ len = 0;
+ remaining = data_len;
+ rdata->nr_iov = 1;
+
+ /* determine the eof that the server (probably) has */
+ eof = CIFS_I(rdata->mapping->host)->server_eof;
+ eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
+ cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index);
+
+ list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+ if (remaining >= PAGE_CACHE_SIZE) {
+ /* enough data to fill the page */
+ rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+ rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE;
+ cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+ rdata->nr_iov, page->index,
+ rdata->iov[rdata->nr_iov].iov_base,
+ rdata->iov[rdata->nr_iov].iov_len);
+ ++rdata->nr_iov;
+ len += PAGE_CACHE_SIZE;
+ remaining -= PAGE_CACHE_SIZE;
+ } else if (remaining > 0) {
+ /* enough for partial page, fill and zero the rest */
+ rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+ rdata->iov[rdata->nr_iov].iov_len = remaining;
+ cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+ rdata->nr_iov, page->index,
+ rdata->iov[rdata->nr_iov].iov_base,
+ rdata->iov[rdata->nr_iov].iov_len);
+ memset(rdata->iov[rdata->nr_iov].iov_base + remaining,
+ '\0', PAGE_CACHE_SIZE - remaining);
+ ++rdata->nr_iov;
+ len += remaining;
+ remaining = 0;
+ } else if (page->index > eof_index) {
+ /*
+ * The VFS will not try to do readahead past the
+ * i_size, but it's possible that we have outstanding
+ * writes with gaps in the middle and the i_size hasn't
+ * caught up yet. Populate those with zeroed out pages
+ * to prevent the VFS from repeatedly attempting to
+ * fill them until the writes are flushed.
+ */
+ zero_user(page, 0, PAGE_CACHE_SIZE);
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ page_cache_release(page);
+ } else {
+ /* no need to hold page hostage */
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ }
+
+ /* issue the read if we have any iovecs left to fill */
+ if (rdata->nr_iov > 1) {
+ length = cifs_readv_from_socket(server, &rdata->iov[1],
+ rdata->nr_iov - 1, len);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+ } else {
+ length = 0;
+ }
+
+ rdata->bytes = length;
+
+ cFYI(1, "total_read=%u rfclen=%u remaining=%u", server->total_read,
+ rfclen, remaining);
+
+ /* discard anything left over */
+ if (server->total_read < rfclen)
+ return cifs_readv_discard(server, mid);
+
+ dequeue_mid(mid, false);
+ return length;
+}
+
+static void
+cifs_readv_complete(struct work_struct *work)
+{
+ struct cifs_readdata *rdata = container_of(work,
+ struct cifs_readdata, work);
+ struct page *page, *tpage;
+
+ list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+
+ if (rdata->result == 0) {
+ kunmap(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ }
+
+ unlock_page(page);
+
+ if (rdata->result == 0)
+ cifs_readpage_to_fscache(rdata->mapping->host, page);
+
+ page_cache_release(page);
+ }
+ cifs_readdata_free(rdata);
+}
+
+static void
+cifs_readv_callback(struct mid_q_entry *mid)
+{
+ struct cifs_readdata *rdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
+
+ cFYI(1, "%s: mid=%u state=%d result=%d bytes=%u", __func__,
+ mid->mid, mid->midState, rdata->result, rdata->bytes);
+
+ switch (mid->midState) {
+ case MID_RESPONSE_RECEIVED:
+ /* result already set, check signature */
+ if (server->sec_mode &
+ (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ if (cifs_verify_signature(rdata->iov, rdata->nr_iov,
+ server, mid->sequence_number + 1))
+ cERROR(1, "Unexpected SMB signature");
+ }
+ /* FIXME: should this be counted toward the initiating task? */
+ task_io_account_read(rdata->bytes);
+ cifs_stats_bytes_read(tcon, rdata->bytes);
+ break;
+ case MID_REQUEST_SUBMITTED:
+ case MID_RETRY_NEEDED:
+ rdata->result = -EAGAIN;
+ break;
+ default:
+ rdata->result = -EIO;
+ }
+
+ queue_work(system_nrt_wq, &rdata->work);
+ DeleteMidQEntry(mid);
+ atomic_dec(&server->inFlight);
+ wake_up(&server->request_q);
+}
+
+/* cifs_async_readv - send an async write, and set up mid to handle result */
+int
+cifs_async_readv(struct cifs_readdata *rdata)
+{
+ int rc;
+ READ_REQ *smb = NULL;
+ int wct;
+ struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+
+ cFYI(1, "%s: offset=%llu bytes=%u", __func__,
+ rdata->offset, rdata->bytes);
+
+ if (tcon->ses->capabilities & CAP_LARGE_FILES)
+ wct = 12;
+ else {
+ wct = 10; /* old style read */
+ if ((rdata->offset >> 32) > 0) {
+ /* can not handle this big offset for old */
+ return -EIO;
+ }
+ }
+
+ rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&smb);
+ if (rc)
+ return rc;
+
+ smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid);
+ smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16));
+
+ smb->AndXCommand = 0xFF; /* none */
+ smb->Fid = rdata->cfile->netfid;
+ smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF);
+ if (wct == 12)
+ smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32);
+ smb->Remaining = 0;
+ smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF);
+ smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16);
+ if (wct == 12)
+ smb->ByteCount = 0;
+ else {
+ /* old style read */
+ struct smb_com_readx_req *smbr =
+ (struct smb_com_readx_req *)smb;
+ smbr->ByteCount = 0;
+ }
+
+ /* 4 for RFC1001 length + 1 for BCC */
+ rdata->iov[0].iov_base = smb;
+ rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
+
+ rc = cifs_call_async(tcon->ses->server, rdata->iov, 1,
+ cifs_readv_receive, cifs_readv_callback,
+ rdata, false);
+
+ if (rc == 0)
+ cifs_stats_inc(&tcon->num_reads);
+
+ cifs_small_buf_release(smb);
+ return rc;
+}
+
int
CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes,
char **buf, int *pbuf_type)
@@ -1836,7 +2194,7 @@ cifs_async_writev(struct cifs_writedata *wdata)
kref_get(&wdata->refcount);
rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1,
- cifs_writev_callback, wdata, false);
+ NULL, cifs_writev_callback, wdata, false);
if (rc == 0)
cifs_stats_inc(&tcon->num_writes);
@@ -1962,10 +2320,50 @@ CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms,
return rc;
}
+int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid,
+ const __u8 lock_type, const __u32 num_unlock,
+ const __u32 num_lock, LOCKING_ANDX_RANGE *buf)
+{
+ int rc = 0;
+ LOCK_REQ *pSMB = NULL;
+ struct kvec iov[2];
+ int resp_buf_type;
+ __u16 count;
+
+ cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock);
+
+ rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
+ if (rc)
+ return rc;
+
+ pSMB->Timeout = 0;
+ pSMB->NumberOfLocks = cpu_to_le16(num_lock);
+ pSMB->NumberOfUnlocks = cpu_to_le16(num_unlock);
+ pSMB->LockType = lock_type;
+ pSMB->AndXCommand = 0xFF; /* none */
+ pSMB->Fid = netfid; /* netfid stays le */
+
+ count = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
+ inc_rfc1001_len(pSMB, count);
+ pSMB->ByteCount = cpu_to_le16(count);
+
+ iov[0].iov_base = (char *)pSMB;
+ iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4 -
+ (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
+ iov[1].iov_base = (char *)buf;
+ iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
+
+ cifs_stats_inc(&tcon->num_locks);
+ rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
+ if (rc)
+ cFYI(1, "Send error in cifs_lockv = %d", rc);
+
+ return rc;
+}
int
CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
- const __u16 smb_file_id, const __u64 len,
+ const __u16 smb_file_id, const __u32 netpid, const __u64 len,
const __u64 offset, const __u32 numUnlock,
const __u32 numLock, const __u8 lockType,
const bool waitFlag, const __u8 oplock_level)
@@ -2001,7 +2399,7 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
pSMB->Fid = smb_file_id; /* netfid stays le */
if ((numLock != 0) || (numUnlock != 0)) {
- pSMB->Locks[0].Pid = cpu_to_le16(current->tgid);
+ pSMB->Locks[0].Pid = cpu_to_le16(netpid);
/* BB where to store pid high? */
pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len);
pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32));
@@ -2035,9 +2433,9 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon,
int
CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
- const __u16 smb_file_id, const int get_flag, const __u64 len,
- struct file_lock *pLockData, const __u16 lock_type,
- const bool waitFlag)
+ const __u16 smb_file_id, const __u32 netpid, const int get_flag,
+ const __u64 len, struct file_lock *pLockData,
+ const __u16 lock_type, const bool waitFlag)
{
struct smb_com_transaction2_sfi_req *pSMB = NULL;
struct smb_com_transaction2_sfi_rsp *pSMBr = NULL;
@@ -2095,7 +2493,7 @@ CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon,
} else
pSMB->Timeout = 0;
- parm_data->pid = cpu_to_le32(current->tgid);
+ parm_data->pid = cpu_to_le32(netpid);
parm_data->start = cpu_to_le64(pLockData->fl_start);
parm_data->length = cpu_to_le64(len); /* normalize negative numbers */
@@ -2812,8 +3210,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon,
pSMB->TotalDataCount = 0;
pSMB->MaxParameterCount = cpu_to_le32(2);
/* BB find exact data count max from sess structure BB */
- pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+ pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->MaxSetupCount = 4;
pSMB->Reserved = 0;
pSMB->ParameterOffset = 0;
@@ -3306,8 +3703,7 @@ smb_init_nttransact(const __u16 sub_command, const int setup_count,
pSMB->Reserved = 0;
pSMB->TotalParameterCount = cpu_to_le32(parm_len);
pSMB->TotalDataCount = 0;
- pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+ pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->ParameterCount = pSMB->TotalParameterCount;
pSMB->DataCount = pSMB->TotalDataCount;
temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
@@ -3467,7 +3863,7 @@ qsec_out:
int
CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid,
- struct cifs_ntsd *pntsd, __u32 acllen)
+ struct cifs_ntsd *pntsd, __u32 acllen, int aclflag)
{
__u16 byte_count, param_count, data_count, param_offset, data_offset;
int rc = 0;
@@ -3504,7 +3900,7 @@ setCifsAclRetry:
pSMB->Fid = fid; /* file handle always le */
pSMB->Reserved2 = 0;
- pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL);
+ pSMB->AclFlags = cpu_to_le32(aclflag);
if (pntsd && acllen) {
memcpy((char *) &pSMBr->hdr.Protocol + data_offset,
@@ -3977,8 +4373,7 @@ findFirstRetry:
params = 12 + name_len /* includes null */ ;
pSMB->TotalDataCount = 0; /* no EAs */
pSMB->MaxParameterCount = cpu_to_le16(10);
- pSMB->MaxDataCount = cpu_to_le16((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+ pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -4052,8 +4447,7 @@ findFirstRetry:
psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
psrch_inf->entries_in_buffer;
lnoff = le16_to_cpu(parms->LastNameOffset);
- if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
- lnoff) {
+ if (CIFSMaxBufSize < lnoff) {
cERROR(1, "ignoring corrupt resume name");
psrch_inf->last_entry = NULL;
return rc;
@@ -4097,9 +4491,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
byte_count = 0;
pSMB->TotalDataCount = 0; /* no EAs */
pSMB->MaxParameterCount = cpu_to_le16(8);
- pSMB->MaxDataCount =
- cpu_to_le16((tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) &
- 0xFFFFFF00);
+ pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -4181,8 +4573,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
psrch_inf->index_of_last_entry +=
psrch_inf->entries_in_buffer;
lnoff = le16_to_cpu(parms->LastNameOffset);
- if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
- lnoff) {
+ if (CIFSMaxBufSize < lnoff) {
cERROR(1, "ignoring corrupt resume name");
psrch_inf->last_entry = NULL;
return rc;
@@ -5840,7 +6231,7 @@ QAllEAsRetry:
if (ea_name) {
if (ea_name_len == name_len &&
- strncmp(ea_name, temp_ptr, name_len) == 0) {
+ memcmp(ea_name, temp_ptr, name_len) == 0) {
temp_ptr += name_len + 1;
rc = value_len;
if (buf_size == 0)
@@ -6035,12 +6426,7 @@ int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon,
pSMB->TotalParameterCount = 0 ;
pSMB->TotalDataCount = 0;
pSMB->MaxParameterCount = cpu_to_le32(2);
- /* BB find exact data count max from sess structure BB */
- pSMB->MaxDataCount = 0; /* same in little endian or be */
-/* BB VERIFY verify which is correct for above BB */
- pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
-
+ pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->MaxSetupCount = 4;
pSMB->Reserved = 0;
pSMB->ParameterOffset = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 71beb020197..f3670cf7258 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -37,6 +37,7 @@
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <linux/inet.h>
+#include <linux/module.h>
#include <net/ipv6.h>
#include "cifspdu.h"
#include "cifsglob.h"
@@ -181,7 +182,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
-EINVAL = invalid transact2
*/
-static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
+static int check2ndT2(struct smb_hdr *pSMB)
{
struct smb_t2_rsp *pSMBt;
int remaining;
@@ -214,9 +215,9 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
cFYI(1, "missing %d bytes from transact2, check next response",
remaining);
- if (total_data_size > maxBufSize) {
+ if (total_data_size > CIFSMaxBufSize) {
cERROR(1, "TotalDataSize %d is over maximum buffer %d",
- total_data_size, maxBufSize);
+ total_data_size, CIFSMaxBufSize);
return -EINVAL;
}
return remaining;
@@ -281,7 +282,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
byte_count = be32_to_cpu(pTargetSMB->smb_buf_length);
byte_count += total_in_buf2;
/* don't allow buffer to overflow */
- if (byte_count > CIFSMaxBufSize)
+ if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)
return -ENOBUFS;
pTargetSMB->smb_buf_length = cpu_to_be32(byte_count);
@@ -320,27 +321,24 @@ requeue_echo:
}
static bool
-allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size,
- bool is_large_buf)
+allocate_buffers(struct TCP_Server_Info *server)
{
- char *bbuf = *bigbuf, *sbuf = *smallbuf;
-
- if (bbuf == NULL) {
- bbuf = (char *)cifs_buf_get();
- if (!bbuf) {
+ if (!server->bigbuf) {
+ server->bigbuf = (char *)cifs_buf_get();
+ if (!server->bigbuf) {
cERROR(1, "No memory for large SMB response");
msleep(3000);
/* retry will check if exiting */
return false;
}
- } else if (is_large_buf) {
+ } else if (server->large_buf) {
/* we are reusing a dirty large buf, clear its start */
- memset(bbuf, 0, size);
+ memset(server->bigbuf, 0, sizeof(struct smb_hdr));
}
- if (sbuf == NULL) {
- sbuf = (char *)cifs_small_buf_get();
- if (!sbuf) {
+ if (!server->smallbuf) {
+ server->smallbuf = (char *)cifs_small_buf_get();
+ if (!server->smallbuf) {
cERROR(1, "No memory for SMB response");
msleep(1000);
/* retry will check if exiting */
@@ -349,36 +347,118 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size,
/* beginning of smb buffer is cleared in our buf_get */
} else {
/* if existing small buf clear beginning */
- memset(sbuf, 0, size);
+ memset(server->smallbuf, 0, sizeof(struct smb_hdr));
}
- *bigbuf = bbuf;
- *smallbuf = sbuf;
-
return true;
}
-static int
-read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg,
- struct kvec *iov, unsigned int to_read,
- unsigned int *ptotal_read, bool is_header_read)
+static bool
+server_unresponsive(struct TCP_Server_Info *server)
+{
+ if (echo_retries > 0 && server->tcpStatus == CifsGood &&
+ time_after(jiffies, server->lstrp +
+ (echo_retries * SMB_ECHO_INTERVAL))) {
+ cERROR(1, "Server %s has not responded in %d seconds. "
+ "Reconnecting...", server->hostname,
+ (echo_retries * SMB_ECHO_INTERVAL / HZ));
+ cifs_reconnect(server);
+ wake_up(&server->response_q);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * kvec_array_init - clone a kvec array, and advance into it
+ * @new: pointer to memory for cloned array
+ * @iov: pointer to original array
+ * @nr_segs: number of members in original array
+ * @bytes: number of bytes to advance into the cloned array
+ *
+ * This function will copy the array provided in iov to a section of memory
+ * and advance the specified number of bytes into the new array. It returns
+ * the number of segments in the new array. "new" must be at least as big as
+ * the original iov array.
+ */
+static unsigned int
+kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs,
+ size_t bytes)
+{
+ size_t base = 0;
+
+ while (bytes || !iov->iov_len) {
+ int copy = min(bytes, iov->iov_len);
+
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) {
+ iov++;
+ nr_segs--;
+ base = 0;
+ }
+ }
+ memcpy(new, iov, sizeof(*iov) * nr_segs);
+ new->iov_base += base;
+ new->iov_len -= base;
+ return nr_segs;
+}
+
+static struct kvec *
+get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs)
{
- int length, rc = 0;
- unsigned int total_read;
- char *buf = iov->iov_base;
+ struct kvec *new_iov;
+
+ if (server->iov && nr_segs <= server->nr_iov)
+ return server->iov;
+
+ /* not big enough -- allocate a new one and release the old */
+ new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS);
+ if (new_iov) {
+ kfree(server->iov);
+ server->iov = new_iov;
+ server->nr_iov = nr_segs;
+ }
+ return new_iov;
+}
+
+int
+cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
+ unsigned int nr_segs, unsigned int to_read)
+{
+ int length = 0;
+ int total_read;
+ unsigned int segs;
+ struct msghdr smb_msg;
+ struct kvec *iov;
+
+ iov = get_server_iovec(server, nr_segs);
+ if (!iov)
+ return -ENOMEM;
+
+ smb_msg.msg_control = NULL;
+ smb_msg.msg_controllen = 0;
+
+ for (total_read = 0; to_read; total_read += length, to_read -= length) {
+ try_to_freeze();
+
+ if (server_unresponsive(server)) {
+ total_read = -EAGAIN;
+ break;
+ }
+
+ segs = kvec_array_init(iov, iov_orig, nr_segs, total_read);
+
+ length = kernel_recvmsg(server->ssocket, &smb_msg,
+ iov, segs, to_read, 0);
- for (total_read = 0; total_read < to_read; total_read += length) {
- length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1,
- to_read - total_read, 0);
if (server->tcpStatus == CifsExiting) {
- /* then will exit */
- rc = 2;
+ total_read = -ESHUTDOWN;
break;
} else if (server->tcpStatus == CifsNeedReconnect) {
cifs_reconnect(server);
- /* Reconnect wakes up rspns q */
- /* Now we will reread sock */
- rc = 1;
+ total_read = -EAGAIN;
break;
} else if (length == -ERESTARTSYS ||
length == -EAGAIN ||
@@ -390,56 +470,54 @@ read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg,
*/
usleep_range(1000, 2000);
length = 0;
- if (!is_header_read)
- continue;
- /* Special handling for header read */
- if (total_read) {
- iov->iov_base = (to_read - total_read) +
- buf;
- iov->iov_len = to_read - total_read;
- smb_msg->msg_control = NULL;
- smb_msg->msg_controllen = 0;
- rc = 3;
- } else
- rc = 1;
- break;
+ continue;
} else if (length <= 0) {
- cERROR(1, "Received no data, expecting %d",
- to_read - total_read);
+ cFYI(1, "Received no data or error: expecting %d "
+ "got %d", to_read, length);
cifs_reconnect(server);
- rc = 1;
+ total_read = -EAGAIN;
break;
}
}
+ return total_read;
+}
- *ptotal_read = total_read;
- return rc;
+int
+cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
+ unsigned int to_read)
+{
+ struct kvec iov;
+
+ iov.iov_base = buf;
+ iov.iov_len = to_read;
+
+ return cifs_readv_from_socket(server, &iov, 1, to_read);
}
static bool
-check_rfc1002_header(struct TCP_Server_Info *server, char *buf)
+is_smb_response(struct TCP_Server_Info *server, unsigned char type)
{
- char temp = *buf;
- unsigned int pdu_length = be32_to_cpu(
- ((struct smb_hdr *)buf)->smb_buf_length);
-
/*
* The first byte big endian of the length field,
* is actually not part of the length but the type
* with the most common, zero, as regular data.
*/
- if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
- return false;
- } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
- cFYI(1, "Good RFC 1002 session rsp");
- return false;
- } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
+ switch (type) {
+ case RFC1002_SESSION_MESSAGE:
+ /* Regular SMB response */
+ return true;
+ case RFC1002_SESSION_KEEP_ALIVE:
+ cFYI(1, "RFC 1002 session keep alive");
+ break;
+ case RFC1002_POSITIVE_SESSION_RESPONSE:
+ cFYI(1, "RFC 1002 positive session response");
+ break;
+ case RFC1002_NEGATIVE_SESSION_RESPONSE:
/*
* We get this from Windows 98 instead of an error on
* SMB negprot response.
*/
- cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
- pdu_length);
+ cFYI(1, "RFC 1002 negative session response");
/* give server a second to clean up */
msleep(1000);
/*
@@ -448,87 +526,89 @@ check_rfc1002_header(struct TCP_Server_Info *server, char *buf)
* is since we do not begin with RFC1001 session
* initialize frame).
*/
- cifs_set_port((struct sockaddr *)
- &server->dstaddr, CIFS_PORT);
+ cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT);
cifs_reconnect(server);
wake_up(&server->response_q);
- return false;
- } else if (temp != (char) 0) {
- cERROR(1, "Unknown RFC 1002 frame");
- cifs_dump_mem(" Received Data: ", buf, 4);
- cifs_reconnect(server);
- return false;
- }
-
- /* else we have an SMB response */
- if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
- (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
- cERROR(1, "Invalid size SMB length %d pdu_length %d",
- 4, pdu_length+4);
+ break;
+ default:
+ cERROR(1, "RFC 1002 unknown response type 0x%x", type);
cifs_reconnect(server);
- wake_up(&server->response_q);
- return false;
}
- return true;
+ return false;
}
static struct mid_q_entry *
-find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf,
- int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf)
+find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf)
{
- struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL;
+ struct mid_q_entry *mid;
spin_lock(&GlobalMid_Lock);
- list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) {
- if (mid->mid != buf->Mid ||
- mid->midState != MID_REQUEST_SUBMITTED ||
- mid->command != buf->Command)
- continue;
-
- if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) {
- /* We have a multipart transact2 resp */
- *is_multi_rsp = true;
- if (mid->resp_buf) {
- /* merge response - fix up 1st*/
- *length = coalesce_t2(buf, mid->resp_buf);
- if (*length > 0) {
- *length = 0;
- mid->multiRsp = true;
- break;
- }
- /* All parts received or packet is malformed. */
- mid->multiEnd = true;
- goto multi_t2_fnd;
- }
- if (!is_large_buf) {
- /*FIXME: switch to already allocated largebuf?*/
- cERROR(1, "1st trans2 resp needs bigbuf");
- } else {
- /* Have first buffer */
- mid->resp_buf = buf;
- mid->largeBuf = true;
- *bigbuf = NULL;
- }
- break;
+ list_for_each_entry(mid, &server->pending_mid_q, qhead) {
+ if (mid->mid == buf->Mid &&
+ mid->midState == MID_REQUEST_SUBMITTED &&
+ mid->command == buf->Command) {
+ spin_unlock(&GlobalMid_Lock);
+ return mid;
}
- mid->resp_buf = buf;
- mid->largeBuf = is_large_buf;
-multi_t2_fnd:
- if (*length == 0)
- mid->midState = MID_RESPONSE_RECEIVED;
- else
- mid->midState = MID_RESPONSE_MALFORMED;
+ }
+ spin_unlock(&GlobalMid_Lock);
+ return NULL;
+}
+
+void
+dequeue_mid(struct mid_q_entry *mid, bool malformed)
+{
#ifdef CONFIG_CIFS_STATS2
- mid->when_received = jiffies;
+ mid->when_received = jiffies;
#endif
- list_del_init(&mid->qhead);
- ret = mid;
- break;
- }
+ spin_lock(&GlobalMid_Lock);
+ if (!malformed)
+ mid->midState = MID_RESPONSE_RECEIVED;
+ else
+ mid->midState = MID_RESPONSE_MALFORMED;
+ list_del_init(&mid->qhead);
spin_unlock(&GlobalMid_Lock);
+}
- return ret;
+static void
+handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
+ struct smb_hdr *buf, int malformed)
+{
+ if (malformed == 0 && check2ndT2(buf) > 0) {
+ mid->multiRsp = true;
+ if (mid->resp_buf) {
+ /* merge response - fix up 1st*/
+ malformed = coalesce_t2(buf, mid->resp_buf);
+ if (malformed > 0)
+ return;
+
+ /* All parts received or packet is malformed. */
+ mid->multiEnd = true;
+ return dequeue_mid(mid, malformed);
+ }
+ if (!server->large_buf) {
+ /*FIXME: switch to already allocated largebuf?*/
+ cERROR(1, "1st trans2 resp needs bigbuf");
+ } else {
+ /* Have first buffer */
+ mid->resp_buf = buf;
+ mid->largeBuf = true;
+ server->bigbuf = NULL;
+ }
+ return;
+ }
+ mid->resp_buf = buf;
+ mid->largeBuf = server->large_buf;
+ /* Was previous buf put in mpx struct for multi-rsp? */
+ if (!mid->multiRsp) {
+ /* smb buffer will be freed by user thread */
+ if (server->large_buf)
+ server->bigbuf = NULL;
+ else
+ server->smallbuf = NULL;
+ }
+ dequeue_mid(mid, malformed);
}
static void clean_demultiplex_info(struct TCP_Server_Info *server)
@@ -618,6 +698,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
}
kfree(server->hostname);
+ kfree(server->iov);
kfree(server);
length = atomic_dec_return(&tcpSesAllocCount);
@@ -627,20 +708,70 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
}
static int
+standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ int length;
+ char *buf = server->smallbuf;
+ struct smb_hdr *smb_buffer = (struct smb_hdr *)buf;
+ unsigned int pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
+
+ /* make sure this will fit in a large buffer */
+ if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+ cERROR(1, "SMB response too long (%u bytes)",
+ pdu_length);
+ cifs_reconnect(server);
+ wake_up(&server->response_q);
+ return -EAGAIN;
+ }
+
+ /* switch to large buffer if too big for a small one */
+ if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
+ server->large_buf = true;
+ memcpy(server->bigbuf, server->smallbuf, server->total_read);
+ buf = server->bigbuf;
+ smb_buffer = (struct smb_hdr *)buf;
+ }
+
+ /* now read the rest */
+ length = cifs_read_from_socket(server,
+ buf + sizeof(struct smb_hdr) - 1,
+ pdu_length - sizeof(struct smb_hdr) + 1 + 4);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+
+ dump_smb(smb_buffer, server->total_read);
+
+ /*
+ * We know that we received enough to get to the MID as we
+ * checked the pdu_length earlier. Now check to see
+ * if the rest of the header is OK. We borrow the length
+ * var for the rest of the loop to avoid a new stack var.
+ *
+ * 48 bytes is enough to display the header and a little bit
+ * into the payload for debugging purposes.
+ */
+ length = checkSMB(smb_buffer, smb_buffer->Mid, server->total_read);
+ if (length != 0)
+ cifs_dump_mem("Bad SMB: ", buf,
+ min_t(unsigned int, server->total_read, 48));
+
+ if (mid)
+ handle_mid(mid, server, smb_buffer, length);
+
+ return length;
+}
+
+static int
cifs_demultiplex_thread(void *p)
{
int length;
struct TCP_Server_Info *server = p;
- unsigned int pdu_length, total_read;
- char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL;
+ unsigned int pdu_length;
+ char *buf = NULL;
struct smb_hdr *smb_buffer = NULL;
- struct msghdr smb_msg;
- struct kvec iov;
struct task_struct *task_to_wake = NULL;
struct mid_q_entry *mid_entry;
- bool isLargeBuf = false;
- bool isMultiRsp = false;
- int rc;
current->flags |= PF_MEMALLOC;
cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
@@ -655,111 +786,65 @@ cifs_demultiplex_thread(void *p)
if (try_to_freeze())
continue;
- if (!allocate_buffers(&bigbuf, &smallbuf,
- sizeof(struct smb_hdr), isLargeBuf))
+ if (!allocate_buffers(server))
continue;
- isLargeBuf = false;
- isMultiRsp = false;
- smb_buffer = (struct smb_hdr *)smallbuf;
- buf = smallbuf;
- iov.iov_base = buf;
- iov.iov_len = 4;
- smb_msg.msg_control = NULL;
- smb_msg.msg_controllen = 0;
+ server->large_buf = false;
+ smb_buffer = (struct smb_hdr *)server->smallbuf;
+ buf = server->smallbuf;
pdu_length = 4; /* enough to get RFC1001 header */
-incomplete_rcv:
- if (echo_retries > 0 && server->tcpStatus == CifsGood &&
- time_after(jiffies, server->lstrp +
- (echo_retries * SMB_ECHO_INTERVAL))) {
- cERROR(1, "Server %s has not responded in %d seconds. "
- "Reconnecting...", server->hostname,
- (echo_retries * SMB_ECHO_INTERVAL / HZ));
- cifs_reconnect(server);
- wake_up(&server->response_q);
- continue;
- }
-
- rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
- &total_read, true /* header read */);
- if (rc == 3)
- goto incomplete_rcv;
- else if (rc == 2)
- break;
- else if (rc == 1)
+ length = cifs_read_from_socket(server, buf, pdu_length);
+ if (length < 0)
continue;
+ server->total_read = length;
/*
* The right amount was read from socket - 4 bytes,
* so we can now interpret the length field.
*/
-
- /*
- * Note that RFC 1001 length is big endian on the wire,
- * but we convert it here so it is always manipulated
- * as host byte order.
- */
pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
- cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
- if (!check_rfc1002_header(server, buf))
+ cFYI(1, "RFC1002 header 0x%x", pdu_length);
+ if (!is_smb_response(server, buf[0]))
continue;
- /* else length ok */
- if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
- isLargeBuf = true;
- memcpy(bigbuf, smallbuf, 4);
- smb_buffer = (struct smb_hdr *)bigbuf;
- buf = bigbuf;
+ /* make sure we have enough to get to the MID */
+ if (pdu_length < sizeof(struct smb_hdr) - 1 - 4) {
+ cERROR(1, "SMB response too short (%u bytes)",
+ pdu_length);
+ cifs_reconnect(server);
+ wake_up(&server->response_q);
+ continue;
}
- iov.iov_base = 4 + buf;
- iov.iov_len = pdu_length;
- rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
- &total_read, false);
- if (rc == 2)
- break;
- else if (rc == 1)
+ /* read down to the MID */
+ length = cifs_read_from_socket(server, buf + 4,
+ sizeof(struct smb_hdr) - 1 - 4);
+ if (length < 0)
continue;
+ server->total_read += length;
- total_read += 4; /* account for rfc1002 hdr */
+ mid_entry = find_mid(server, smb_buffer);
- dump_smb(smb_buffer, total_read);
+ if (!mid_entry || !mid_entry->receive)
+ length = standard_receive3(server, mid_entry);
+ else
+ length = mid_entry->receive(server, mid_entry);
- /*
- * We know that we received enough to get to the MID as we
- * checked the pdu_length earlier. Now check to see
- * if the rest of the header is OK. We borrow the length
- * var for the rest of the loop to avoid a new stack var.
- *
- * 48 bytes is enough to display the header and a little bit
- * into the payload for debugging purposes.
- */
- length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
- if (length != 0)
- cifs_dump_mem("Bad SMB: ", buf,
- min_t(unsigned int, total_read, 48));
+ if (length < 0)
+ continue;
- server->lstrp = jiffies;
+ if (server->large_buf) {
+ buf = server->bigbuf;
+ smb_buffer = (struct smb_hdr *)buf;
+ }
- mid_entry = find_cifs_mid(server, smb_buffer, &length,
- isLargeBuf, &isMultiRsp, &bigbuf);
+ server->lstrp = jiffies;
if (mid_entry != NULL) {
- mid_entry->callback(mid_entry);
- /* Was previous buf put in mpx struct for multi-rsp? */
- if (!isMultiRsp) {
- /* smb buffer will be freed by user thread */
- if (isLargeBuf)
- bigbuf = NULL;
- else
- smallbuf = NULL;
- }
- } else if (length != 0) {
- /* response sanity checks failed */
- continue;
- } else if (!is_valid_oplock_break(smb_buffer, server) &&
- !isMultiRsp) {
+ if (!mid_entry->multiRsp || mid_entry->multiEnd)
+ mid_entry->callback(mid_entry);
+ } else if (!is_valid_oplock_break(smb_buffer, server)) {
cERROR(1, "No task to wake, unknown frame received! "
"NumMids %d", atomic_read(&midCount));
cifs_dump_mem("Received Data is: ", buf,
@@ -773,9 +858,9 @@ incomplete_rcv:
} /* end while !EXITING */
/* buffer usually freed in free_mid - need to free it here on exit */
- cifs_buf_release(bigbuf);
- if (smallbuf) /* no sense logging a debug message if NULL */
- cifs_small_buf_release(smallbuf);
+ cifs_buf_release(server->bigbuf);
+ if (server->smallbuf) /* no sense logging a debug message if NULL */
+ cifs_small_buf_release(server->smallbuf);
task_to_wake = xchg(&server->tsk, NULL);
clean_demultiplex_info(server);
@@ -827,6 +912,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
{
char *value, *data, *end;
char *mountdata_copy = NULL, *options;
+ int err;
unsigned int temp_len, i, j;
char separator[2];
short int override_uid = -1;
@@ -883,6 +969,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
cFYI(1, "Null separator not allowed");
}
}
+ vol->backupuid_specified = false; /* no backup intent for a user */
+ vol->backupgid_specified = false; /* no backup intent for a group */
while ((data = strsep(&options, separator)) != NULL) {
if (!*data)
@@ -1442,6 +1530,22 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->mfsymlinks = true;
} else if (strnicmp(data, "multiuser", 8) == 0) {
vol->multiuser = true;
+ } else if (!strnicmp(data, "backupuid", 9) && value && *value) {
+ err = kstrtouint(value, 0, &vol->backupuid);
+ if (err < 0) {
+ cERROR(1, "%s: Invalid backupuid value",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->backupuid_specified = true;
+ } else if (!strnicmp(data, "backupgid", 9) && value && *value) {
+ err = kstrtouint(value, 0, &vol->backupgid);
+ if (err < 0) {
+ cERROR(1, "%s: Invalid backupgid value",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->backupgid_specified = true;
} else
printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
data);
@@ -2018,7 +2122,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
warned_on_ntlm = true;
cERROR(1, "default security mechanism requested. The default "
"security mechanism will be upgraded from ntlm to "
- "ntlmv2 in kernel release 3.2");
+ "ntlmv2 in kernel release 3.3");
}
ses->overrideSecFlg = volume_info->secFlg;
@@ -2209,16 +2313,16 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
(new->mnt_cifs_flags & CIFS_MOUNT_MASK))
return 0;
- if (old->rsize != new->rsize)
- return 0;
-
/*
- * We want to share sb only if we don't specify wsize or specified wsize
- * is greater or equal than existing one.
+ * We want to share sb only if we don't specify an r/wsize or
+ * specified r/wsize is greater than or equal to existing one.
*/
if (new->wsize && new->wsize < old->wsize)
return 0;
+ if (new->rsize && new->rsize < old->rsize)
+ return 0;
+
if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid)
return 0;
@@ -2656,14 +2760,6 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon,
CIFS_MOUNT_POSIX_PATHS;
}
- if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) {
- if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
- cifs_sb->rsize = 127 * 1024;
- cFYI(DBG2, "larger reads not supported by srv");
- }
- }
-
-
cFYI(1, "Negotiate caps 0x%x", (int)cap);
#ifdef CONFIG_CIFS_DEBUG2
if (cap & CIFS_UNIX_FCNTL_CAP)
@@ -2708,31 +2804,19 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
spin_lock_init(&cifs_sb->tlink_tree_lock);
cifs_sb->tlink_tree = RB_ROOT;
- if (pvolume_info->rsize > CIFSMaxBufSize) {
- cERROR(1, "rsize %d too large, using MaxBufSize",
- pvolume_info->rsize);
- cifs_sb->rsize = CIFSMaxBufSize;
- } else if ((pvolume_info->rsize) &&
- (pvolume_info->rsize <= CIFSMaxBufSize))
- cifs_sb->rsize = pvolume_info->rsize;
- else /* default */
- cifs_sb->rsize = CIFSMaxBufSize;
-
- if (cifs_sb->rsize < 2048) {
- cifs_sb->rsize = 2048;
- /* Windows ME may prefer this */
- cFYI(1, "readsize set to minimum: 2048");
- }
-
/*
- * Temporarily set wsize for matching superblock. If we end up using
- * new sb then cifs_negotiate_wsize will later negotiate it downward
- * if needed.
+ * Temporarily set r/wsize for matching superblock. If we end up using
+ * new sb then client will later negotiate it downward if needed.
*/
+ cifs_sb->rsize = pvolume_info->rsize;
cifs_sb->wsize = pvolume_info->wsize;
cifs_sb->mnt_uid = pvolume_info->linux_uid;
cifs_sb->mnt_gid = pvolume_info->linux_gid;
+ if (pvolume_info->backupuid_specified)
+ cifs_sb->mnt_backupuid = pvolume_info->backupuid;
+ if (pvolume_info->backupgid_specified)
+ cifs_sb->mnt_backupgid = pvolume_info->backupgid;
cifs_sb->mnt_file_mode = pvolume_info->file_mode;
cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
cFYI(1, "file mode: 0x%x dir mode: 0x%x",
@@ -2763,6 +2847,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD;
if (pvolume_info->cifs_acl)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
+ if (pvolume_info->backupuid_specified)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID;
+ if (pvolume_info->backupgid_specified)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID;
if (pvolume_info->override_uid)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
if (pvolume_info->override_gid)
@@ -2795,29 +2883,41 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
}
/*
- * When the server supports very large writes via POSIX extensions, we can
- * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including
- * the RFC1001 length.
+ * When the server supports very large reads and writes via POSIX extensions,
+ * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not
+ * including the RFC1001 length.
*
* Note that this might make for "interesting" allocation problems during
* writeback however as we have to allocate an array of pointers for the
* pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096.
+ *
+ * For reads, there is a similar problem as we need to allocate an array
+ * of kvecs to handle the receive, though that should only need to be done
+ * once.
*/
#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4)
+#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4)
/*
- * When the server doesn't allow large posix writes, only allow a wsize of
- * 128k minus the size of the WRITE_AND_X header. That allows for a write up
- * to the maximum size described by RFC1002.
+ * When the server doesn't allow large posix writes, only allow a rsize/wsize
+ * of 2^17-1 minus the size of the call header. That allows for a read or
+ * write up to the maximum size described by RFC1002.
*/
-#define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4)
+#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4)
+#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4)
/*
* The default wsize is 1M. find_get_pages seems to return a maximum of 256
* pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill
* a single wsize request with a single call.
*/
-#define CIFS_DEFAULT_WSIZE (1024 * 1024)
+#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
+
+/*
+ * Windows only supports a max of 60k reads. Default to that when posix
+ * extensions aren't in force.
+ */
+#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
static unsigned int
cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
@@ -2825,7 +2925,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
struct TCP_Server_Info *server = tcon->ses->server;
unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize :
- CIFS_DEFAULT_WSIZE;
+ CIFS_DEFAULT_IOSIZE;
/* can server support 24-bit write sizes? (via UNIX extensions) */
if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
@@ -2848,6 +2948,50 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
return wsize;
}
+static unsigned int
+cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
+{
+ __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
+ struct TCP_Server_Info *server = tcon->ses->server;
+ unsigned int rsize, defsize;
+
+ /*
+ * Set default value...
+ *
+ * HACK alert! Ancient servers have very small buffers. Even though
+ * MS-CIFS indicates that servers are only limited by the client's
+ * bufsize for reads, testing against win98se shows that it throws
+ * INVALID_PARAMETER errors if you try to request too large a read.
+ *
+ * If the server advertises a MaxBufferSize of less than one page,
+ * assume that it also can't satisfy reads larger than that either.
+ *
+ * FIXME: Is there a better heuristic for this?
+ */
+ if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP))
+ defsize = CIFS_DEFAULT_IOSIZE;
+ else if (server->capabilities & CAP_LARGE_READ_X)
+ defsize = CIFS_DEFAULT_NON_POSIX_RSIZE;
+ else if (server->maxBuf >= PAGE_CACHE_SIZE)
+ defsize = CIFSMaxBufSize;
+ else
+ defsize = server->maxBuf - sizeof(READ_RSP);
+
+ rsize = pvolume_info->rsize ? pvolume_info->rsize : defsize;
+
+ /*
+ * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to
+ * the client's MaxBufferSize.
+ */
+ if (!(server->capabilities & CAP_LARGE_READ_X))
+ rsize = min_t(unsigned int, CIFSMaxBufSize, rsize);
+
+ /* hard limit of CIFS_MAX_RSIZE */
+ rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE);
+
+ return rsize;
+}
+
static int
is_path_accessible(int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path)
@@ -2877,9 +3021,9 @@ cleanup_volume_info_contents(struct smb_vol *volume_info)
{
kfree(volume_info->username);
kzfree(volume_info->password);
- kfree(volume_info->UNC);
if (volume_info->UNCip != volume_info->UNC + 2)
kfree(volume_info->UNCip);
+ kfree(volume_info->UNC);
kfree(volume_info->domainname);
kfree(volume_info->iocharset);
kfree(volume_info->prepath);
@@ -3041,6 +3185,22 @@ cifs_get_volume_info(char *mount_data, const char *devname)
return volume_info;
}
+/* make sure ra_pages is a multiple of rsize */
+static inline unsigned int
+cifs_ra_pages(struct cifs_sb_info *cifs_sb)
+{
+ unsigned int reads;
+ unsigned int rsize_pages = cifs_sb->rsize / PAGE_CACHE_SIZE;
+
+ if (rsize_pages >= default_backing_dev_info.ra_pages)
+ return default_backing_dev_info.ra_pages;
+ else if (rsize_pages == 0)
+ return rsize_pages;
+
+ reads = default_backing_dev_info.ra_pages / rsize_pages;
+ return reads * rsize_pages;
+}
+
int
cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
{
@@ -3059,8 +3219,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
if (rc)
return rc;
- cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
-
#ifdef CONFIG_CIFS_DFS_UPCALL
try_mount_again:
/* cleanup activities if we're chasing a referral */
@@ -3125,15 +3283,11 @@ try_mount_again:
CIFSSMBQFSAttributeInfo(xid, tcon);
}
- if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
- cifs_sb->rsize = 1024 * 127;
- cFYI(DBG2, "no very large read support, rsize now 127K");
- }
- if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
- cifs_sb->rsize = min(cifs_sb->rsize,
- (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
-
cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info);
+ cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info);
+
+ /* tune readahead according to rsize */
+ cifs_sb->bdi.ra_pages = cifs_ra_pages(cifs_sb);
remote_path_check:
#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -3301,7 +3455,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
else
#endif /* CIFS_WEAK_PW_HASH */
rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
- bcc_ptr);
+ bcc_ptr, nls_codepage);
bcc_ptr += CIFS_AUTH_RESP_SIZE;
if (ses->capabilities & CAP_UNICODE) {
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 72d448bf96c..d7eeb9d3ed6 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
}
tcon = tlink_tcon(tlink);
- if (oplockEnabled)
+ if (enable_oplocks)
oplock = REQ_OPLOCK;
if (nd)
@@ -244,6 +244,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
create_options |= CREATE_OPTION_READONLY;
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
if (tcon->ses->capabilities & CAP_NT_SMBS)
rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
desiredAccess, create_options,
@@ -357,6 +360,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
{
int rc = -EPERM;
int xid;
+ int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
@@ -431,9 +435,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
return rc;
}
- /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
- GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
+ GENERIC_WRITE, create_options,
&fileHandle, &oplock, buf, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
if (rc)
@@ -642,8 +648,16 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
if (direntry->d_inode) {
if (cifs_revalidate_dentry(direntry))
return 0;
- else
+ else {
+ /*
+ * Forcibly invalidate automounting directory inodes
+ * (remote DFS directories) so to have them
+ * instantiated again for automount
+ */
+ if (IS_AUTOMOUNT(direntry->d_inode))
+ return 0;
return 1;
+ }
}
/*
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 55d87ac5200..9c7ecdccf2f 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -45,7 +45,7 @@
#include "cifs_debug.h"
#include "cifsfs.h"
-#ifdef CIFS_NFSD_EXPORT
+#ifdef CONFIG_CIFS_NFSD_EXPORT
static struct dentry *cifs_get_parent(struct dentry *dentry)
{
/* BB need to add code here eventually to enable export via NFSD */
@@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = {
.encode_fs = */
};
-#endif /* CIFS_NFSD_EXPORT */
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9f41a10523a..4dd9283885e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -32,6 +32,7 @@
#include <linux/delay.h>
#include <linux/mount.h>
#include <linux/slab.h>
+#include <linux/swap.h>
#include <asm/div64.h>
#include "cifsfs.h"
#include "cifspdu.h"
@@ -174,6 +175,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
int rc;
int desiredAccess;
int disposition;
+ int create_options = CREATE_NOT_DIR;
FILE_ALL_INFO *buf;
desiredAccess = cifs_convert_flags(f_flags);
@@ -210,9 +212,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
if (!buf)
return -ENOMEM;
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
if (tcon->ses->capabilities & CAP_NT_SMBS)
rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
- desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
+ desiredAccess, create_options, pnetfid, poplock, buf,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
& CIFS_MOUNT_MAP_SPECIAL_CHR);
else
@@ -258,8 +263,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
pCifsFile->invalidHandle = false;
pCifsFile->tlink = cifs_get_tlink(tlink);
mutex_init(&pCifsFile->fh_mutex);
- mutex_init(&pCifsFile->lock_mutex);
- INIT_LIST_HEAD(&pCifsFile->llist);
INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
spin_lock(&cifs_file_list_lock);
@@ -272,11 +275,14 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
spin_unlock(&cifs_file_list_lock);
cifs_set_oplock_level(pCifsInode, oplock);
+ pCifsInode->can_cache_brlcks = pCifsInode->clientCanCacheAll;
file->private_data = pCifsFile;
return pCifsFile;
}
+static void cifs_del_lock_waiters(struct cifsLockInfo *lock);
+
/*
* Release a reference on the file private data. This may involve closing
* the filehandle out on the server. Must be called without holding
@@ -327,12 +333,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
/* Delete any outstanding lock records. We'll lose them when the file
* is closed anyway.
*/
- mutex_lock(&cifs_file->lock_mutex);
- list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) {
+ mutex_lock(&cifsi->lock_mutex);
+ list_for_each_entry_safe(li, tmp, &cifsi->llist, llist) {
+ if (li->netfid != cifs_file->netfid)
+ continue;
list_del(&li->llist);
+ cifs_del_lock_waiters(li);
kfree(li);
}
- mutex_unlock(&cifs_file->lock_mutex);
+ mutex_unlock(&cifsi->lock_mutex);
cifs_put_tlink(cifs_file->tlink);
dput(cifs_file->dentry);
@@ -371,7 +380,7 @@ int cifs_open(struct inode *inode, struct file *file)
cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
inode, file->f_flags, full_path);
- if (oplockEnabled)
+ if (enable_oplocks)
oplock = REQ_OPLOCK;
else
oplock = 0;
@@ -465,6 +474,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
char *full_path = NULL;
int desiredAccess;
int disposition = FILE_OPEN;
+ int create_options = CREATE_NOT_DIR;
__u16 netfid;
xid = GetXid();
@@ -495,7 +505,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
cFYI(1, "inode = 0x%p file flags 0x%x for %s",
inode, pCifsFile->f_flags, full_path);
- if (oplockEnabled)
+ if (enable_oplocks)
oplock = REQ_OPLOCK;
else
oplock = 0;
@@ -524,6 +534,9 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
desiredAccess = cifs_convert_flags(pCifsFile->f_flags);
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
/* Can not refresh inode by passing in file_info buf to be returned
by SMBOpen and then calling get_inode_info with returned buf
since file might have write behind data that needs to be flushed
@@ -531,7 +544,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
that inode was not dirty locally we could do this */
rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess,
- CREATE_NOT_DIR, &netfid, &oplock, NULL,
+ create_options, &netfid, &oplock, NULL,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
if (rc) {
@@ -631,219 +644,713 @@ int cifs_closedir(struct inode *inode, struct file *file)
return rc;
}
-static int store_file_lock(struct cifsFileInfo *fid, __u64 len,
- __u64 offset, __u8 lockType)
+static struct cifsLockInfo *
+cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid)
{
- struct cifsLockInfo *li =
+ struct cifsLockInfo *lock =
kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
- if (li == NULL)
- return -ENOMEM;
- li->offset = offset;
- li->length = len;
- li->type = lockType;
- mutex_lock(&fid->lock_mutex);
- list_add(&li->llist, &fid->llist);
- mutex_unlock(&fid->lock_mutex);
- return 0;
+ if (!lock)
+ return lock;
+ lock->offset = offset;
+ lock->length = length;
+ lock->type = type;
+ lock->netfid = netfid;
+ lock->pid = current->tgid;
+ INIT_LIST_HEAD(&lock->blist);
+ init_waitqueue_head(&lock->block_q);
+ return lock;
}
-int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
+static void
+cifs_del_lock_waiters(struct cifsLockInfo *lock)
{
- int rc, xid;
- __u32 numLock = 0;
- __u32 numUnlock = 0;
- __u64 length;
- bool wait_flag = false;
- struct cifs_sb_info *cifs_sb;
+ struct cifsLockInfo *li, *tmp;
+ list_for_each_entry_safe(li, tmp, &lock->blist, blist) {
+ list_del_init(&li->blist);
+ wake_up(&li->block_q);
+ }
+}
+
+static bool
+__cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
+ __u64 length, __u8 type, __u16 netfid,
+ struct cifsLockInfo **conf_lock)
+{
+ struct cifsLockInfo *li, *tmp;
+
+ list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
+ if (offset + length <= li->offset ||
+ offset >= li->offset + li->length)
+ continue;
+ else if ((type & LOCKING_ANDX_SHARED_LOCK) &&
+ ((netfid == li->netfid && current->tgid == li->pid) ||
+ type == li->type))
+ continue;
+ else {
+ *conf_lock = li;
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool
+cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
+ struct cifsLockInfo **conf_lock)
+{
+ return __cifs_find_lock_conflict(cinode, lock->offset, lock->length,
+ lock->type, lock->netfid, conf_lock);
+}
+
+/*
+ * Check if there is another lock that prevents us to set the lock (mandatory
+ * style). If such a lock exists, update the flock structure with its
+ * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks
+ * or leave it the same if we can't. Returns 0 if we don't need to request to
+ * the server or 1 otherwise.
+ */
+static int
+cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
+ __u8 type, __u16 netfid, struct file_lock *flock)
+{
+ int rc = 0;
+ struct cifsLockInfo *conf_lock;
+ bool exist;
+
+ mutex_lock(&cinode->lock_mutex);
+
+ exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid,
+ &conf_lock);
+ if (exist) {
+ flock->fl_start = conf_lock->offset;
+ flock->fl_end = conf_lock->offset + conf_lock->length - 1;
+ flock->fl_pid = conf_lock->pid;
+ if (conf_lock->type & LOCKING_ANDX_SHARED_LOCK)
+ flock->fl_type = F_RDLCK;
+ else
+ flock->fl_type = F_WRLCK;
+ } else if (!cinode->can_cache_brlcks)
+ rc = 1;
+ else
+ flock->fl_type = F_UNLCK;
+
+ mutex_unlock(&cinode->lock_mutex);
+ return rc;
+}
+
+static void
+cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock)
+{
+ mutex_lock(&cinode->lock_mutex);
+ list_add_tail(&lock->llist, &cinode->llist);
+ mutex_unlock(&cinode->lock_mutex);
+}
+
+/*
+ * Set the byte-range lock (mandatory style). Returns:
+ * 1) 0, if we set the lock and don't need to request to the server;
+ * 2) 1, if no locks prevent us but we need to request to the server;
+ * 3) -EACCESS, if there is a lock that prevents us and wait is false.
+ */
+static int
+cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
+ bool wait)
+{
+ struct cifsLockInfo *conf_lock;
+ bool exist;
+ int rc = 0;
+
+try_again:
+ exist = false;
+ mutex_lock(&cinode->lock_mutex);
+
+ exist = cifs_find_lock_conflict(cinode, lock, &conf_lock);
+ if (!exist && cinode->can_cache_brlcks) {
+ list_add_tail(&lock->llist, &cinode->llist);
+ mutex_unlock(&cinode->lock_mutex);
+ return rc;
+ }
+
+ if (!exist)
+ rc = 1;
+ else if (!wait)
+ rc = -EACCES;
+ else {
+ list_add_tail(&lock->blist, &conf_lock->blist);
+ mutex_unlock(&cinode->lock_mutex);
+ rc = wait_event_interruptible(lock->block_q,
+ (lock->blist.prev == &lock->blist) &&
+ (lock->blist.next == &lock->blist));
+ if (!rc)
+ goto try_again;
+ mutex_lock(&cinode->lock_mutex);
+ list_del_init(&lock->blist);
+ }
+
+ mutex_unlock(&cinode->lock_mutex);
+ return rc;
+}
+
+/*
+ * Check if there is another lock that prevents us to set the lock (posix
+ * style). If such a lock exists, update the flock structure with its
+ * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks
+ * or leave it the same if we can't. Returns 0 if we don't need to request to
+ * the server or 1 otherwise.
+ */
+static int
+cifs_posix_lock_test(struct file *file, struct file_lock *flock)
+{
+ int rc = 0;
+ struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
+ unsigned char saved_type = flock->fl_type;
+
+ if ((flock->fl_flags & FL_POSIX) == 0)
+ return 1;
+
+ mutex_lock(&cinode->lock_mutex);
+ posix_test_lock(file, flock);
+
+ if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) {
+ flock->fl_type = saved_type;
+ rc = 1;
+ }
+
+ mutex_unlock(&cinode->lock_mutex);
+ return rc;
+}
+
+/*
+ * Set the byte-range lock (posix style). Returns:
+ * 1) 0, if we set the lock and don't need to request to the server;
+ * 2) 1, if we need to request to the server;
+ * 3) <0, if the error occurs while setting the lock.
+ */
+static int
+cifs_posix_lock_set(struct file *file, struct file_lock *flock)
+{
+ struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
+ int rc = 1;
+
+ if ((flock->fl_flags & FL_POSIX) == 0)
+ return rc;
+
+ mutex_lock(&cinode->lock_mutex);
+ if (!cinode->can_cache_brlcks) {
+ mutex_unlock(&cinode->lock_mutex);
+ return rc;
+ }
+ rc = posix_lock_file_wait(file, flock);
+ mutex_unlock(&cinode->lock_mutex);
+ return rc;
+}
+
+static int
+cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
+{
+ int xid, rc = 0, stored_rc;
+ struct cifsLockInfo *li, *tmp;
struct cifs_tcon *tcon;
- __u16 netfid;
- __u8 lockType = LOCKING_ANDX_LARGE_FILES;
- bool posix_locking = 0;
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ unsigned int num, max_num;
+ LOCKING_ANDX_RANGE *buf, *cur;
+ int types[] = {LOCKING_ANDX_LARGE_FILES,
+ LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
+ int i;
- length = 1 + pfLock->fl_end - pfLock->fl_start;
- rc = -EACCES;
xid = GetXid();
+ tcon = tlink_tcon(cfile->tlink);
- cFYI(1, "Lock parm: 0x%x flockflags: "
- "0x%x flocktype: 0x%x start: %lld end: %lld",
- cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start,
- pfLock->fl_end);
+ mutex_lock(&cinode->lock_mutex);
+ if (!cinode->can_cache_brlcks) {
+ mutex_unlock(&cinode->lock_mutex);
+ FreeXid(xid);
+ return rc;
+ }
- if (pfLock->fl_flags & FL_POSIX)
+ max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
+ sizeof(LOCKING_ANDX_RANGE);
+ buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ if (!buf) {
+ mutex_unlock(&cinode->lock_mutex);
+ FreeXid(xid);
+ return rc;
+ }
+
+ for (i = 0; i < 2; i++) {
+ cur = buf;
+ num = 0;
+ list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
+ if (li->type != types[i])
+ continue;
+ cur->Pid = cpu_to_le16(li->pid);
+ cur->LengthLow = cpu_to_le32((u32)li->length);
+ cur->LengthHigh = cpu_to_le32((u32)(li->length>>32));
+ cur->OffsetLow = cpu_to_le32((u32)li->offset);
+ cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
+ if (++num == max_num) {
+ stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
+ li->type, 0, num, buf);
+ if (stored_rc)
+ rc = stored_rc;
+ cur = buf;
+ num = 0;
+ } else
+ cur++;
+ }
+
+ if (num) {
+ stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
+ types[i], 0, num, buf);
+ if (stored_rc)
+ rc = stored_rc;
+ }
+ }
+
+ cinode->can_cache_brlcks = false;
+ mutex_unlock(&cinode->lock_mutex);
+
+ kfree(buf);
+ FreeXid(xid);
+ return rc;
+}
+
+/* copied from fs/locks.c with a name change */
+#define cifs_for_each_lock(inode, lockp) \
+ for (lockp = &inode->i_flock; *lockp != NULL; \
+ lockp = &(*lockp)->fl_next)
+
+static int
+cifs_push_posix_locks(struct cifsFileInfo *cfile)
+{
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct file_lock *flock, **before;
+ struct cifsLockInfo *lck, *tmp;
+ int rc = 0, xid, type;
+ __u64 length;
+ struct list_head locks_to_send;
+
+ xid = GetXid();
+
+ mutex_lock(&cinode->lock_mutex);
+ if (!cinode->can_cache_brlcks) {
+ mutex_unlock(&cinode->lock_mutex);
+ FreeXid(xid);
+ return rc;
+ }
+
+ INIT_LIST_HEAD(&locks_to_send);
+
+ lock_flocks();
+ cifs_for_each_lock(cfile->dentry->d_inode, before) {
+ flock = *before;
+ length = 1 + flock->fl_end - flock->fl_start;
+ if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
+ type = CIFS_RDLCK;
+ else
+ type = CIFS_WRLCK;
+
+ lck = cifs_lock_init(flock->fl_start, length, type,
+ cfile->netfid);
+ if (!lck) {
+ rc = -ENOMEM;
+ goto send_locks;
+ }
+ lck->pid = flock->fl_pid;
+
+ list_add_tail(&lck->llist, &locks_to_send);
+ }
+
+send_locks:
+ unlock_flocks();
+
+ list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
+ struct file_lock tmp_lock;
+ int stored_rc;
+
+ tmp_lock.fl_start = lck->offset;
+ stored_rc = CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid,
+ 0, lck->length, &tmp_lock,
+ lck->type, 0);
+ if (stored_rc)
+ rc = stored_rc;
+ list_del(&lck->llist);
+ kfree(lck);
+ }
+
+ cinode->can_cache_brlcks = false;
+ mutex_unlock(&cinode->lock_mutex);
+
+ FreeXid(xid);
+ return rc;
+}
+
+static int
+cifs_push_locks(struct cifsFileInfo *cfile)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+
+ if ((tcon->ses->capabilities & CAP_UNIX) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+ return cifs_push_posix_locks(cfile);
+
+ return cifs_push_mandatory_locks(cfile);
+}
+
+static void
+cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock,
+ bool *wait_flag)
+{
+ if (flock->fl_flags & FL_POSIX)
cFYI(1, "Posix");
- if (pfLock->fl_flags & FL_FLOCK)
+ if (flock->fl_flags & FL_FLOCK)
cFYI(1, "Flock");
- if (pfLock->fl_flags & FL_SLEEP) {
+ if (flock->fl_flags & FL_SLEEP) {
cFYI(1, "Blocking lock");
- wait_flag = true;
+ *wait_flag = true;
}
- if (pfLock->fl_flags & FL_ACCESS)
+ if (flock->fl_flags & FL_ACCESS)
cFYI(1, "Process suspended by mandatory locking - "
- "not implemented yet");
- if (pfLock->fl_flags & FL_LEASE)
+ "not implemented yet");
+ if (flock->fl_flags & FL_LEASE)
cFYI(1, "Lease on file - not implemented yet");
- if (pfLock->fl_flags &
+ if (flock->fl_flags &
(~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
- cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags);
+ cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags);
- if (pfLock->fl_type == F_WRLCK) {
+ *type = LOCKING_ANDX_LARGE_FILES;
+ if (flock->fl_type == F_WRLCK) {
cFYI(1, "F_WRLCK ");
- numLock = 1;
- } else if (pfLock->fl_type == F_UNLCK) {
+ *lock = 1;
+ } else if (flock->fl_type == F_UNLCK) {
cFYI(1, "F_UNLCK");
- numUnlock = 1;
- /* Check if unlock includes more than
- one lock range */
- } else if (pfLock->fl_type == F_RDLCK) {
+ *unlock = 1;
+ /* Check if unlock includes more than one lock range */
+ } else if (flock->fl_type == F_RDLCK) {
cFYI(1, "F_RDLCK");
- lockType |= LOCKING_ANDX_SHARED_LOCK;
- numLock = 1;
- } else if (pfLock->fl_type == F_EXLCK) {
+ *type |= LOCKING_ANDX_SHARED_LOCK;
+ *lock = 1;
+ } else if (flock->fl_type == F_EXLCK) {
cFYI(1, "F_EXLCK");
- numLock = 1;
- } else if (pfLock->fl_type == F_SHLCK) {
+ *lock = 1;
+ } else if (flock->fl_type == F_SHLCK) {
cFYI(1, "F_SHLCK");
- lockType |= LOCKING_ANDX_SHARED_LOCK;
- numLock = 1;
+ *type |= LOCKING_ANDX_SHARED_LOCK;
+ *lock = 1;
} else
cFYI(1, "Unknown type of lock");
+}
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
- netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
+static int
+cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
+ bool wait_flag, bool posix_lck, int xid)
+{
+ int rc = 0;
+ __u64 length = 1 + flock->fl_end - flock->fl_start;
+ struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ __u16 netfid = cfile->netfid;
- if ((tcon->ses->capabilities & CAP_UNIX) &&
- (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
- ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
- posix_locking = 1;
- /* BB add code here to normalize offset and length to
- account for negative length which we can not accept over the
- wire */
- if (IS_GETLK(cmd)) {
- if (posix_locking) {
- int posix_lock_type;
- if (lockType & LOCKING_ANDX_SHARED_LOCK)
- posix_lock_type = CIFS_RDLCK;
- else
- posix_lock_type = CIFS_WRLCK;
- rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */,
- length, pfLock, posix_lock_type,
- wait_flag);
- FreeXid(xid);
+ if (posix_lck) {
+ int posix_lock_type;
+
+ rc = cifs_posix_lock_test(file, flock);
+ if (!rc)
return rc;
- }
- /* BB we could chain these into one lock request BB */
- rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start,
- 0, 1, lockType, 0 /* wait flag */, 0);
- if (rc == 0) {
- rc = CIFSSMBLock(xid, tcon, netfid, length,
- pfLock->fl_start, 1 /* numUnlock */ ,
- 0 /* numLock */ , lockType,
- 0 /* wait flag */, 0);
- pfLock->fl_type = F_UNLCK;
- if (rc != 0)
- cERROR(1, "Error unlocking previously locked "
- "range %d during test of lock", rc);
- rc = 0;
+ if (type & LOCKING_ANDX_SHARED_LOCK)
+ posix_lock_type = CIFS_RDLCK;
+ else
+ posix_lock_type = CIFS_WRLCK;
+ rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid,
+ 1 /* get */, length, flock,
+ posix_lock_type, wait_flag);
+ return rc;
+ }
- } else {
- /* if rc == ERR_SHARING_VIOLATION ? */
- rc = 0;
+ rc = cifs_lock_test(cinode, flock->fl_start, length, type, netfid,
+ flock);
+ if (!rc)
+ return rc;
+
+ /* BB we could chain these into one lock request BB */
+ rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
+ flock->fl_start, 0, 1, type, 0, 0);
+ if (rc == 0) {
+ rc = CIFSSMBLock(xid, tcon, netfid, current->tgid,
+ length, flock->fl_start, 1, 0,
+ type, 0, 0);
+ flock->fl_type = F_UNLCK;
+ if (rc != 0)
+ cERROR(1, "Error unlocking previously locked "
+ "range %d during test of lock", rc);
+ return 0;
+ }
+
+ if (type & LOCKING_ANDX_SHARED_LOCK) {
+ flock->fl_type = F_WRLCK;
+ return 0;
+ }
+
+ rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
+ flock->fl_start, 0, 1,
+ type | LOCKING_ANDX_SHARED_LOCK, 0, 0);
+ if (rc == 0) {
+ rc = CIFSSMBLock(xid, tcon, netfid, current->tgid,
+ length, flock->fl_start, 1, 0,
+ type | LOCKING_ANDX_SHARED_LOCK,
+ 0, 0);
+ flock->fl_type = F_RDLCK;
+ if (rc != 0)
+ cERROR(1, "Error unlocking previously locked "
+ "range %d during test of lock", rc);
+ } else
+ flock->fl_type = F_WRLCK;
+
+ return 0;
+}
- if (lockType & LOCKING_ANDX_SHARED_LOCK) {
- pfLock->fl_type = F_WRLCK;
+static void
+cifs_move_llist(struct list_head *source, struct list_head *dest)
+{
+ struct list_head *li, *tmp;
+ list_for_each_safe(li, tmp, source)
+ list_move(li, dest);
+}
+
+static void
+cifs_free_llist(struct list_head *llist)
+{
+ struct cifsLockInfo *li, *tmp;
+ list_for_each_entry_safe(li, tmp, llist, llist) {
+ cifs_del_lock_waiters(li);
+ list_del(&li->llist);
+ kfree(li);
+ }
+}
+
+static int
+cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
+{
+ int rc = 0, stored_rc;
+ int types[] = {LOCKING_ANDX_LARGE_FILES,
+ LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
+ unsigned int i;
+ unsigned int max_num, num;
+ LOCKING_ANDX_RANGE *buf, *cur;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsLockInfo *li, *tmp;
+ __u64 length = 1 + flock->fl_end - flock->fl_start;
+ struct list_head tmp_llist;
+
+ INIT_LIST_HEAD(&tmp_llist);
+
+ max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
+ sizeof(LOCKING_ANDX_RANGE);
+ buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ mutex_lock(&cinode->lock_mutex);
+ for (i = 0; i < 2; i++) {
+ cur = buf;
+ num = 0;
+ list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
+ if (flock->fl_start > li->offset ||
+ (flock->fl_start + length) <
+ (li->offset + li->length))
+ continue;
+ if (current->tgid != li->pid)
+ continue;
+ if (cfile->netfid != li->netfid)
+ continue;
+ if (types[i] != li->type)
+ continue;
+ if (!cinode->can_cache_brlcks) {
+ cur->Pid = cpu_to_le16(li->pid);
+ cur->LengthLow = cpu_to_le32((u32)li->length);
+ cur->LengthHigh =
+ cpu_to_le32((u32)(li->length>>32));
+ cur->OffsetLow = cpu_to_le32((u32)li->offset);
+ cur->OffsetHigh =
+ cpu_to_le32((u32)(li->offset>>32));
+ /*
+ * We need to save a lock here to let us add
+ * it again to the inode list if the unlock
+ * range request fails on the server.
+ */
+ list_move(&li->llist, &tmp_llist);
+ if (++num == max_num) {
+ stored_rc = cifs_lockv(xid, tcon,
+ cfile->netfid,
+ li->type, num,
+ 0, buf);
+ if (stored_rc) {
+ /*
+ * We failed on the unlock range
+ * request - add all locks from
+ * the tmp list to the head of
+ * the inode list.
+ */
+ cifs_move_llist(&tmp_llist,
+ &cinode->llist);
+ rc = stored_rc;
+ } else
+ /*
+ * The unlock range request
+ * succeed - free the tmp list.
+ */
+ cifs_free_llist(&tmp_llist);
+ cur = buf;
+ num = 0;
+ } else
+ cur++;
} else {
- rc = CIFSSMBLock(xid, tcon, netfid, length,
- pfLock->fl_start, 0, 1,
- lockType | LOCKING_ANDX_SHARED_LOCK,
- 0 /* wait flag */, 0);
- if (rc == 0) {
- rc = CIFSSMBLock(xid, tcon, netfid,
- length, pfLock->fl_start, 1, 0,
- lockType |
- LOCKING_ANDX_SHARED_LOCK,
- 0 /* wait flag */, 0);
- pfLock->fl_type = F_RDLCK;
- if (rc != 0)
- cERROR(1, "Error unlocking "
- "previously locked range %d "
- "during test of lock", rc);
- rc = 0;
- } else {
- pfLock->fl_type = F_WRLCK;
- rc = 0;
- }
+ /*
+ * We can cache brlock requests - simply remove
+ * a lock from the inode list.
+ */
+ list_del(&li->llist);
+ cifs_del_lock_waiters(li);
+ kfree(li);
}
}
-
- FreeXid(xid);
- return rc;
+ if (num) {
+ stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
+ types[i], num, 0, buf);
+ if (stored_rc) {
+ cifs_move_llist(&tmp_llist, &cinode->llist);
+ rc = stored_rc;
+ } else
+ cifs_free_llist(&tmp_llist);
+ }
}
- if (!numLock && !numUnlock) {
- /* if no lock or unlock then nothing
- to do since we do not know what it is */
- FreeXid(xid);
- return -EOPNOTSUPP;
- }
+ mutex_unlock(&cinode->lock_mutex);
+ kfree(buf);
+ return rc;
+}
+
+static int
+cifs_setlk(struct file *file, struct file_lock *flock, __u8 type,
+ bool wait_flag, bool posix_lck, int lock, int unlock, int xid)
+{
+ int rc = 0;
+ __u64 length = 1 + flock->fl_end - flock->fl_start;
+ struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
+ __u16 netfid = cfile->netfid;
- if (posix_locking) {
+ if (posix_lck) {
int posix_lock_type;
- if (lockType & LOCKING_ANDX_SHARED_LOCK)
+
+ rc = cifs_posix_lock_set(file, flock);
+ if (!rc || rc < 0)
+ return rc;
+
+ if (type & LOCKING_ANDX_SHARED_LOCK)
posix_lock_type = CIFS_RDLCK;
else
posix_lock_type = CIFS_WRLCK;
- if (numUnlock == 1)
+ if (unlock == 1)
posix_lock_type = CIFS_UNLCK;
- rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */,
- length, pfLock, posix_lock_type,
- wait_flag);
- } else {
- struct cifsFileInfo *fid = file->private_data;
+ rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid,
+ 0 /* set */, length, flock,
+ posix_lock_type, wait_flag);
+ goto out;
+ }
- if (numLock) {
- rc = CIFSSMBLock(xid, tcon, netfid, length,
- pfLock->fl_start, 0, numLock, lockType,
- wait_flag, 0);
+ if (lock) {
+ struct cifsLockInfo *lock;
- if (rc == 0) {
- /* For Windows locks we must store them. */
- rc = store_file_lock(fid, length,
- pfLock->fl_start, lockType);
- }
- } else if (numUnlock) {
- /* For each stored lock that this unlock overlaps
- completely, unlock it. */
- int stored_rc = 0;
- struct cifsLockInfo *li, *tmp;
+ lock = cifs_lock_init(flock->fl_start, length, type, netfid);
+ if (!lock)
+ return -ENOMEM;
- rc = 0;
- mutex_lock(&fid->lock_mutex);
- list_for_each_entry_safe(li, tmp, &fid->llist, llist) {
- if (pfLock->fl_start <= li->offset &&
- (pfLock->fl_start + length) >=
- (li->offset + li->length)) {
- stored_rc = CIFSSMBLock(xid, tcon,
- netfid, li->length,
- li->offset, 1, 0,
- li->type, false, 0);
- if (stored_rc)
- rc = stored_rc;
- else {
- list_del(&li->llist);
- kfree(li);
- }
- }
- }
- mutex_unlock(&fid->lock_mutex);
+ rc = cifs_lock_add_if(cinode, lock, wait_flag);
+ if (rc < 0)
+ kfree(lock);
+ if (rc <= 0)
+ goto out;
+
+ rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
+ flock->fl_start, 0, 1, type, wait_flag, 0);
+ if (rc) {
+ kfree(lock);
+ goto out;
}
+
+ cifs_lock_add(cinode, lock);
+ } else if (unlock)
+ rc = cifs_unlock_range(cfile, flock, xid);
+
+out:
+ if (flock->fl_flags & FL_POSIX)
+ posix_lock_file_wait(file, flock);
+ return rc;
+}
+
+int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
+{
+ int rc, xid;
+ int lock = 0, unlock = 0;
+ bool wait_flag = false;
+ bool posix_lck = false;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
+ struct cifsInodeInfo *cinode;
+ struct cifsFileInfo *cfile;
+ __u16 netfid;
+ __u8 type;
+
+ rc = -EACCES;
+ xid = GetXid();
+
+ cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld "
+ "end: %lld", cmd, flock->fl_flags, flock->fl_type,
+ flock->fl_start, flock->fl_end);
+
+ cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag);
+
+ cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ cfile = (struct cifsFileInfo *)file->private_data;
+ tcon = tlink_tcon(cfile->tlink);
+ netfid = cfile->netfid;
+ cinode = CIFS_I(file->f_path.dentry->d_inode);
+
+ if ((tcon->ses->capabilities & CAP_UNIX) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+ posix_lck = true;
+ /*
+ * BB add code here to normalize offset and length to account for
+ * negative length which we can not accept over the wire.
+ */
+ if (IS_GETLK(cmd)) {
+ rc = cifs_getlk(file, flock, type, wait_flag, posix_lck, xid);
+ FreeXid(xid);
+ return rc;
}
- if (pfLock->fl_flags & FL_POSIX)
- posix_lock_file_wait(file, pfLock);
+ if (!lock && !unlock) {
+ /*
+ * if no lock or unlock then nothing to do since we do not
+ * know what it is
+ */
+ FreeXid(xid);
+ return -EOPNOTSUPP;
+ }
+
+ rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock,
+ xid);
FreeXid(xid);
return rc;
}
@@ -1714,6 +2221,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
struct smb_com_read_rsp *pSMBr;
struct cifs_io_parms io_parms;
char *read_data;
+ unsigned int rsize;
__u32 pid;
if (!nr_segs)
@@ -1726,6 +2234,9 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
xid = GetXid();
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ /* FIXME: set up handlers for larger reads and/or convert to async */
+ rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
+
open_file = file->private_data;
pTcon = tlink_tcon(open_file->tlink);
@@ -1738,7 +2249,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
cFYI(1, "attempting read on write only file instance");
for (total_read = 0; total_read < len; total_read += bytes_read) {
- cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
+ cur_len = min_t(const size_t, len - total_read, rsize);
rc = -EAGAIN;
read_data = NULL;
@@ -1830,6 +2341,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
unsigned int bytes_read = 0;
unsigned int total_read;
unsigned int current_read_size;
+ unsigned int rsize;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *pTcon;
int xid;
@@ -1842,6 +2354,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
xid = GetXid();
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ /* FIXME: set up handlers for larger reads and/or convert to async */
+ rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
+
if (file->private_data == NULL) {
rc = -EBADF;
FreeXid(xid);
@@ -1861,14 +2376,14 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
for (total_read = 0, current_offset = read_data;
read_size > total_read;
total_read += bytes_read, current_offset += bytes_read) {
- current_read_size = min_t(const int, read_size - total_read,
- cifs_sb->rsize);
+ current_read_size = min_t(uint, read_size - total_read, rsize);
+
/* For windows me and 9x we do not want to request more
than it negotiated since it will refuse the read then */
if ((pTcon->ses) &&
!(pTcon->ses->capabilities & CAP_LARGE_FILES)) {
- current_read_size = min_t(const int, current_read_size,
- pTcon->ses->server->maxBuf - 128);
+ current_read_size = min_t(uint, current_read_size,
+ CIFSMaxBufSize);
}
rc = -EAGAIN;
while (rc == -EAGAIN) {
@@ -1957,82 +2472,24 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
return rc;
}
-
-static void cifs_copy_cache_pages(struct address_space *mapping,
- struct list_head *pages, int bytes_read, char *data)
-{
- struct page *page;
- char *target;
-
- while (bytes_read > 0) {
- if (list_empty(pages))
- break;
-
- page = list_entry(pages->prev, struct page, lru);
- list_del(&page->lru);
-
- if (add_to_page_cache_lru(page, mapping, page->index,
- GFP_KERNEL)) {
- page_cache_release(page);
- cFYI(1, "Add page cache failed");
- data += PAGE_CACHE_SIZE;
- bytes_read -= PAGE_CACHE_SIZE;
- continue;
- }
- page_cache_release(page);
-
- target = kmap_atomic(page, KM_USER0);
-
- if (PAGE_CACHE_SIZE > bytes_read) {
- memcpy(target, data, bytes_read);
- /* zero the tail end of this partial page */
- memset(target + bytes_read, 0,
- PAGE_CACHE_SIZE - bytes_read);
- bytes_read = 0;
- } else {
- memcpy(target, data, PAGE_CACHE_SIZE);
- bytes_read -= PAGE_CACHE_SIZE;
- }
- kunmap_atomic(target, KM_USER0);
-
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- data += PAGE_CACHE_SIZE;
-
- /* add page to FS-Cache */
- cifs_readpage_to_fscache(mapping->host, page);
- }
- return;
-}
-
static int cifs_readpages(struct file *file, struct address_space *mapping,
struct list_head *page_list, unsigned num_pages)
{
- int rc = -EACCES;
- int xid;
- loff_t offset;
- struct page *page;
- struct cifs_sb_info *cifs_sb;
- struct cifs_tcon *pTcon;
- unsigned int bytes_read = 0;
- unsigned int read_size, i;
- char *smb_read_data = NULL;
- struct smb_com_read_rsp *pSMBr;
- struct cifsFileInfo *open_file;
- struct cifs_io_parms io_parms;
- int buf_type = CIFS_NO_BUFFER;
- __u32 pid;
+ int rc;
+ struct list_head tmplist;
+ struct cifsFileInfo *open_file = file->private_data;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ unsigned int rsize = cifs_sb->rsize;
+ pid_t pid;
- xid = GetXid();
- if (file->private_data == NULL) {
- rc = -EBADF;
- FreeXid(xid);
- return rc;
- }
- open_file = file->private_data;
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- pTcon = tlink_tcon(open_file->tlink);
+ /*
+ * Give up immediately if rsize is too small to read an entire page.
+ * The VFS will fall back to readpage. We should never reach this
+ * point however since we set ra_pages to 0 when the rsize is smaller
+ * than a cache page.
+ */
+ if (unlikely(rsize < PAGE_CACHE_SIZE))
+ return 0;
/*
* Reads as many pages as possible from fscache. Returns -ENOBUFS
@@ -2041,125 +2498,127 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
&num_pages);
if (rc == 0)
- goto read_complete;
+ return rc;
- cFYI(DBG2, "rpages: num pages %d", num_pages);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
else
pid = current->tgid;
- for (i = 0; i < num_pages; ) {
- unsigned contig_pages;
- struct page *tmp_page;
- unsigned long expected_index;
+ rc = 0;
+ INIT_LIST_HEAD(&tmplist);
- if (list_empty(page_list))
- break;
+ cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file,
+ mapping, num_pages);
+
+ /*
+ * Start with the page at end of list and move it to private
+ * list. Do the same with any following pages until we hit
+ * the rsize limit, hit an index discontinuity, or run out of
+ * pages. Issue the async read and then start the loop again
+ * until the list is empty.
+ *
+ * Note that list order is important. The page_list is in
+ * the order of declining indexes. When we put the pages in
+ * the rdata->pages, then we want them in increasing order.
+ */
+ while (!list_empty(page_list)) {
+ unsigned int bytes = PAGE_CACHE_SIZE;
+ unsigned int expected_index;
+ unsigned int nr_pages = 1;
+ loff_t offset;
+ struct page *page, *tpage;
+ struct cifs_readdata *rdata;
page = list_entry(page_list->prev, struct page, lru);
+
+ /*
+ * Lock the page and put it in the cache. Since no one else
+ * should have access to this page, we're safe to simply set
+ * PG_locked without checking it first.
+ */
+ __set_page_locked(page);
+ rc = add_to_page_cache_locked(page, mapping,
+ page->index, GFP_KERNEL);
+
+ /* give up if we can't stick it in the cache */
+ if (rc) {
+ __clear_page_locked(page);
+ break;
+ }
+
+ /* move first page to the tmplist */
offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ list_move_tail(&page->lru, &tmplist);
- /* count adjacent pages that we will read into */
- contig_pages = 0;
- expected_index =
- list_entry(page_list->prev, struct page, lru)->index;
- list_for_each_entry_reverse(tmp_page, page_list, lru) {
- if (tmp_page->index == expected_index) {
- contig_pages++;
- expected_index++;
- } else
+ /* now try and add more pages onto the request */
+ expected_index = page->index + 1;
+ list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
+ /* discontinuity ? */
+ if (page->index != expected_index)
break;
+
+ /* would this page push the read over the rsize? */
+ if (bytes + PAGE_CACHE_SIZE > rsize)
+ break;
+
+ __set_page_locked(page);
+ if (add_to_page_cache_locked(page, mapping,
+ page->index, GFP_KERNEL)) {
+ __clear_page_locked(page);
+ break;
+ }
+ list_move_tail(&page->lru, &tmplist);
+ bytes += PAGE_CACHE_SIZE;
+ expected_index++;
+ nr_pages++;
}
- if (contig_pages + i > num_pages)
- contig_pages = num_pages - i;
-
- /* for reads over a certain size could initiate async
- read ahead */
-
- read_size = contig_pages * PAGE_CACHE_SIZE;
- /* Read size needs to be in multiples of one page */
- read_size = min_t(const unsigned int, read_size,
- cifs_sb->rsize & PAGE_CACHE_MASK);
- cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d",
- read_size, contig_pages);
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
+
+ rdata = cifs_readdata_alloc(nr_pages);
+ if (!rdata) {
+ /* best to give up if we're out of mem */
+ list_for_each_entry_safe(page, tpage, &tmplist, lru) {
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ rc = -ENOMEM;
+ break;
+ }
+
+ spin_lock(&cifs_file_list_lock);
+ cifsFileInfo_get(open_file);
+ spin_unlock(&cifs_file_list_lock);
+ rdata->cfile = open_file;
+ rdata->mapping = mapping;
+ rdata->offset = offset;
+ rdata->bytes = bytes;
+ rdata->pid = pid;
+ list_splice_init(&tmplist, &rdata->pages);
+
+ do {
if (open_file->invalidHandle) {
rc = cifs_reopen_file(open_file, true);
if (rc != 0)
- break;
+ continue;
}
- io_parms.netfid = open_file->netfid;
- io_parms.pid = pid;
- io_parms.tcon = pTcon;
- io_parms.offset = offset;
- io_parms.length = read_size;
- rc = CIFSSMBRead(xid, &io_parms, &bytes_read,
- &smb_read_data, &buf_type);
- /* BB more RC checks ? */
- if (rc == -EAGAIN) {
- if (smb_read_data) {
- if (buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(smb_read_data);
- else if (buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(smb_read_data);
- smb_read_data = NULL;
- }
- }
- }
- if ((rc < 0) || (smb_read_data == NULL)) {
- cFYI(1, "Read error in readpages: %d", rc);
- break;
- } else if (bytes_read > 0) {
- task_io_account_read(bytes_read);
- pSMBr = (struct smb_com_read_rsp *)smb_read_data;
- cifs_copy_cache_pages(mapping, page_list, bytes_read,
- smb_read_data + 4 /* RFC1001 hdr */ +
- le16_to_cpu(pSMBr->DataOffset));
-
- i += bytes_read >> PAGE_CACHE_SHIFT;
- cifs_stats_bytes_read(pTcon, bytes_read);
- if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) {
- i++; /* account for partial page */
-
- /* server copy of file can have smaller size
- than client */
- /* BB do we need to verify this common case ?
- this case is ok - if we are at server EOF
- we will hit it on next read */
+ rc = cifs_async_readv(rdata);
+ } while (rc == -EAGAIN);
- /* break; */
+ if (rc != 0) {
+ list_for_each_entry_safe(page, tpage, &rdata->pages,
+ lru) {
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+ unlock_page(page);
+ page_cache_release(page);
}
- } else {
- cFYI(1, "No bytes read (%d) at offset %lld . "
- "Cleaning remaining pages from readahead list",
- bytes_read, offset);
- /* BB turn off caching and do new lookup on
- file size at server? */
+ cifs_readdata_free(rdata);
break;
}
- if (smb_read_data) {
- if (buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(smb_read_data);
- else if (buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(smb_read_data);
- smb_read_data = NULL;
- }
- bytes_read = 0;
}
-/* need to free smb_read_data buf before exit */
- if (smb_read_data) {
- if (buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(smb_read_data);
- else if (buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(smb_read_data);
- smb_read_data = NULL;
- }
-
-read_complete:
- FreeXid(xid);
return rc;
}
@@ -2408,6 +2867,10 @@ void cifs_oplock_break(struct work_struct *work)
cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
}
+ rc = cifs_push_locks(cfile);
+ if (rc)
+ cERROR(1, "Push locks rc = %d", rc);
+
/*
* releasing stale oplock after recent reconnect of smb session using
* a now incorrect file handle is not a data integrity issue but do
@@ -2415,8 +2878,9 @@ void cifs_oplock_break(struct work_struct *work)
* disconnected since oplock already released by the server
*/
if (!cfile->oplock_break_cancelled) {
- rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
- 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false,
+ rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid,
+ current->tgid, 0, 0, 0, 0,
+ LOCKING_ANDX_OPLOCK_RELEASE, false,
cinode->clientCanCacheRead ? 1 : 0);
cFYI(1, "Oplock release rc = %d", rc);
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a7b2dcd4a53..e851d5b8931 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -132,7 +132,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
inode->i_mtime = fattr->cf_mtime;
inode->i_ctime = fattr->cf_ctime;
inode->i_rdev = fattr->cf_rdev;
- inode->i_nlink = fattr->cf_nlink;
+ set_nlink(inode, fattr->cf_nlink);
inode->i_uid = fattr->cf_uid;
inode->i_gid = fattr->cf_gid;
@@ -562,7 +562,16 @@ int cifs_get_file_info(struct file *filp)
xid = GetXid();
rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
- if (rc == -EOPNOTSUPP || rc == -EINVAL) {
+ switch (rc) {
+ case 0:
+ cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
+ break;
+ case -EREMOTE:
+ cifs_create_dfs_fattr(&fattr, inode->i_sb);
+ rc = 0;
+ break;
+ case -EOPNOTSUPP:
+ case -EINVAL:
/*
* FIXME: legacy server -- fall back to path-based call?
* for now, just skip revalidating and mark inode for
@@ -570,18 +579,14 @@ int cifs_get_file_info(struct file *filp)
*/
rc = 0;
CIFS_I(inode)->time = 0;
+ default:
goto cgfi_exit;
- } else if (rc == -EREMOTE) {
- cifs_create_dfs_fattr(&fattr, inode->i_sb);
- rc = 0;
- } else if (rc)
- goto cgfi_exit;
+ }
/*
* don't bother with SFU junk here -- just mark inode as needing
* revalidation.
*/
- cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
cifs_fattr_to_inode(inode, &fattr);
@@ -900,7 +905,7 @@ struct inode *cifs_root_iget(struct super_block *sb)
if (rc && tcon->ipc) {
cFYI(1, "ipc connection - fake read inode");
inode->i_mode |= S_IFDIR;
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_op = &cifs_ipc_inode_ops;
inode->i_fop = &simple_dir_operations;
inode->i_uid = cifs_sb->mnt_uid;
@@ -1362,7 +1367,7 @@ mkdir_get_info:
/* setting nlink not necessary except in cases where we
* failed to get it from the server or was set bogus */
if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
- direntry->d_inode->i_nlink = 2;
+ set_nlink(direntry->d_inode, 2);
mode &= ~current_umask();
/* must turn on setgid bit if parent dir has it */
@@ -2096,6 +2101,8 @@ static int
cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
{
int xid;
+ uid_t uid = NO_CHANGE_32;
+ gid_t gid = NO_CHANGE_32;
struct inode *inode = direntry->d_inode;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
@@ -2146,13 +2153,25 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
goto cifs_setattr_exit;
}
- /*
- * Without unix extensions we can't send ownership changes to the
- * server, so silently ignore them. This is consistent with how
- * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With
- * CIFSACL support + proper Windows to Unix idmapping, we may be
- * able to support this in the future.
- */
+ if (attrs->ia_valid & ATTR_UID)
+ uid = attrs->ia_uid;
+
+ if (attrs->ia_valid & ATTR_GID)
+ gid = attrs->ia_gid;
+
+#ifdef CONFIG_CIFS_ACL
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+ if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) {
+ rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,
+ uid, gid);
+ if (rc) {
+ cFYI(1, "%s: Setting id failed with error: %d",
+ __func__, rc);
+ goto cifs_setattr_exit;
+ }
+ }
+ } else
+#endif /* CONFIG_CIFS_ACL */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID))
attrs->ia_valid &= ~(ATTR_UID | ATTR_GID);
@@ -2161,15 +2180,12 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
attrs->ia_valid &= ~ATTR_MODE;
if (attrs->ia_valid & ATTR_MODE) {
- cFYI(1, "Mode changed to 0%o", attrs->ia_mode);
mode = attrs->ia_mode;
- }
-
- if (attrs->ia_valid & ATTR_MODE) {
rc = 0;
#ifdef CONFIG_CIFS_ACL
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
- rc = mode_to_cifs_acl(inode, full_path, mode);
+ rc = id_mode_to_cifs_acl(inode, full_path, mode,
+ NO_CHANGE_32, NO_CHANGE_32);
if (rc) {
cFYI(1, "%s: Setting ACL failed with error: %d",
__func__, rc);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index db3f18cdf02..6b0e0643439 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -183,14 +183,20 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
static int
CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon,
const char *fromName, const char *toName,
- const struct nls_table *nls_codepage, int remap)
+ struct cifs_sb_info *cifs_sb)
{
int rc;
int oplock = 0;
+ int remap;
+ int create_options = CREATE_NOT_DIR;
__u16 netfid = 0;
u8 *buf;
unsigned int bytes_written = 0;
struct cifs_io_parms io_parms;
+ struct nls_table *nls_codepage;
+
+ nls_codepage = cifs_sb->local_nls;
+ remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
if (!buf)
@@ -202,8 +208,11 @@ CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon,
return rc;
}
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
- CREATE_NOT_DIR, &netfid, &oplock, NULL,
+ create_options, &netfid, &oplock, NULL,
nls_codepage, remap);
if (rc != 0) {
kfree(buf);
@@ -424,7 +433,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
if (old_file->d_inode) {
cifsInode = CIFS_I(old_file->d_inode);
if (rc == 0) {
- old_file->d_inode->i_nlink++;
+ inc_nlink(old_file->d_inode);
/* BB should we make this contingent on superblock flag NOATIME? */
/* old_file->d_inode->i_ctime = CURRENT_TIME;*/
/* parent dir timestamps will update from srv
@@ -559,9 +568,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
/* BB what if DFS and this volume is on different share? BB */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb);
else if (pTcon->unix_ext)
rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
cifs_sb->local_nls);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 7c169339259..703ef5c6fdb 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -420,19 +420,22 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
}
int
-checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
+checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read)
{
- __u32 len = be32_to_cpu(smb->smb_buf_length);
+ __u32 rfclen = be32_to_cpu(smb->smb_buf_length);
__u32 clc_len; /* calculated length */
- cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
+ cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x",
+ total_read, rfclen);
- if (length < 2 + sizeof(struct smb_hdr)) {
- if ((length >= sizeof(struct smb_hdr) - 1)
+ /* is this frame too small to even get to a BCC? */
+ if (total_read < 2 + sizeof(struct smb_hdr)) {
+ if ((total_read >= sizeof(struct smb_hdr) - 1)
&& (smb->Status.CifsError != 0)) {
+ /* it's an error return */
smb->WordCount = 0;
/* some error cases do not return wct and bcc */
return 0;
- } else if ((length == sizeof(struct smb_hdr) + 1) &&
+ } else if ((total_read == sizeof(struct smb_hdr) + 1) &&
(smb->WordCount == 0)) {
char *tmp = (char *)smb;
/* Need to work around a bug in two servers here */
@@ -452,39 +455,35 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
} else {
cERROR(1, "Length less than smb header size");
}
- return 1;
- }
- if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
- cERROR(1, "smb length greater than MaxBufSize, mid=%d",
- smb->Mid);
- return 1;
+ return -EIO;
}
+ /* otherwise, there is enough to get to the BCC */
if (check_smb_hdr(smb, mid))
- return 1;
+ return -EIO;
clc_len = smbCalcSize(smb);
- if (4 + len != length) {
+ if (4 + rfclen != total_read) {
cERROR(1, "Length read does not match RFC1001 length %d",
- len);
- return 1;
+ rfclen);
+ return -EIO;
}
- if (4 + len != clc_len) {
+ if (4 + rfclen != clc_len) {
/* check if bcc wrapped around for large read responses */
- if ((len > 64 * 1024) && (len > clc_len)) {
+ if ((rfclen > 64 * 1024) && (rfclen > clc_len)) {
/* check if lengths match mod 64K */
- if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
+ if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF))
return 0; /* bcc wrapped */
}
cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
- clc_len, 4 + len, smb->Mid);
+ clc_len, 4 + rfclen, smb->Mid);
- if (4 + len < clc_len) {
+ if (4 + rfclen < clc_len) {
cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
- len, smb->Mid);
- return 1;
- } else if (len > clc_len + 512) {
+ rfclen, smb->Mid);
+ return -EIO;
+ } else if (rfclen > clc_len + 512) {
/*
* Some servers (Windows XP in particular) send more
* data than the lengths in the SMB packet would
@@ -495,8 +494,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
* data to 512 bytes.
*/
cERROR(1, "RFC1001 size %u more than 512 bytes larger "
- "than SMB for mid=%u", len, smb->Mid);
- return 1;
+ "than SMB for mid=%u", rfclen, smb->Mid);
+ return -EIO;
}
}
return 0;
@@ -676,3 +675,18 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
cinode->clientCanCacheRead = false;
}
}
+
+bool
+backup_cred(struct cifs_sb_info *cifs_sb)
+{
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) {
+ if (cifs_sb->mnt_backupuid == current_fsuid())
+ return true;
+ }
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) {
+ if (in_group_p(cifs_sb->mnt_backupgid))
+ return true;
+ }
+
+ return false;
+}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 5de03ec2014..a090bbe6ee2 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -554,7 +554,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
rc);
return rc;
}
- cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
+ /* FindFirst/Next set last_entry to NULL on malformed reply */
+ if (cifsFile->srch_inf.last_entry)
+ cifs_save_resume_key(cifsFile->srch_inf.last_entry,
+ cifsFile);
}
while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
@@ -562,7 +565,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
cFYI(1, "calling findnext2");
rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
&cifsFile->srch_inf);
- cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
+ /* FindFirst/Next set last_entry to NULL on malformed reply */
+ if (cifsFile->srch_inf.last_entry)
+ cifs_save_resume_key(cifsFile->srch_inf.last_entry,
+ cifsFile);
if (rc)
return -ENOENT;
}
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index d3e619692ee..4ec3ee9d72c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -124,7 +124,9 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
/* that we use in next few lines */
/* Note that header is initialized to zero in header_assemble */
pSMB->req.AndXCommand = 0xFF;
- pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
+ pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32,
+ CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
+ USHRT_MAX));
pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
pSMB->req.VcNumber = get_next_vcnum(ses);
@@ -681,7 +683,7 @@ ssetup_ntlmssp_authenticate:
cpu_to_le16(CIFS_AUTH_RESP_SIZE);
/* calculate ntlm response and session key */
- rc = setup_ntlm_response(ses);
+ rc = setup_ntlm_response(ses, nls_cp);
if (rc) {
cERROR(1, "Error %d during NTLM authentication", rc);
goto ssetup_exit;
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 42b9fff4875..80d85088193 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -199,160 +199,36 @@ SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24)
return rc;
}
-/* Routines for Windows NT MD4 Hash functions. */
-static int
-_my_wcslen(__u16 *str)
-{
- int len = 0;
- while (*str++ != 0)
- len++;
- return len;
-}
-
-/*
- * Convert a string into an NT UNICODE string.
- * Note that regardless of processor type
- * this must be in intel (little-endian)
- * format.
- */
-
-static int
-_my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
-{ /* BB not a very good conversion routine - change/fix */
- int i;
- __u16 val;
-
- for (i = 0; i < len; i++) {
- val = *src;
- SSVAL(dst, 0, val);
- dst++;
- src++;
- if (val == 0)
- break;
- }
- return i;
-}
-
/*
* Creates the MD4 Hash of the users password in NT UNICODE.
*/
int
-E_md4hash(const unsigned char *passwd, unsigned char *p16)
+E_md4hash(const unsigned char *passwd, unsigned char *p16,
+ const struct nls_table *codepage)
{
int rc;
int len;
- __u16 wpwd[129];
+ __le16 wpwd[129];
/* Password cannot be longer than 128 characters */
- if (passwd) {
- len = strlen((char *) passwd);
- if (len > 128)
- len = 128;
-
- /* Password must be converted to NT unicode */
- _my_mbstowcs(wpwd, passwd, len);
- } else
+ if (passwd) /* Password must be converted to NT unicode */
+ len = cifs_strtoUCS(wpwd, passwd, 128, codepage);
+ else {
len = 0;
+ *wpwd = 0; /* Ensure string is null terminated */
+ }
- wpwd[len] = 0; /* Ensure string is null terminated */
- /* Calculate length in bytes */
- len = _my_wcslen(wpwd) * sizeof(__u16);
-
- rc = mdfour(p16, (unsigned char *) wpwd, len);
- memset(wpwd, 0, 129 * 2);
+ rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16));
+ memset(wpwd, 0, 129 * sizeof(__le16));
return rc;
}
-#if 0 /* currently unused */
-/* Does both the NT and LM owfs of a user's password */
-static void
-nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
-{
- char passwd[514];
-
- memset(passwd, '\0', 514);
- if (strlen(pwd) < 513)
- strcpy(passwd, pwd);
- else
- memcpy(passwd, pwd, 512);
- /* Calculate the MD4 hash (NT compatible) of the password */
- memset(nt_p16, '\0', 16);
- E_md4hash(passwd, nt_p16);
-
- /* Mangle the passwords into Lanman format */
- passwd[14] = '\0';
-/* strupper(passwd); */
-
- /* Calculate the SMB (lanman) hash functions of the password */
-
- memset(p16, '\0', 16);
- E_P16((unsigned char *) passwd, (unsigned char *) p16);
-
- /* clear out local copy of user's password (just being paranoid). */
- memset(passwd, '\0', sizeof(passwd));
-}
-#endif
-
-/* Does the NTLMv2 owfs of a user's password */
-#if 0 /* function not needed yet - but will be soon */
-static void
-ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
- const char *domain_n, unsigned char kr_buf[16],
- const struct nls_table *nls_codepage)
-{
- wchar_t *user_u;
- wchar_t *dom_u;
- int user_l, domain_l;
- struct HMACMD5Context ctx;
-
- /* might as well do one alloc to hold both (user_u and dom_u) */
- user_u = kmalloc(2048 * sizeof(wchar_t), GFP_KERNEL);
- if (user_u == NULL)
- return;
- dom_u = user_u + 1024;
-
- /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2,
- STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER);
- push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2,
- STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */
-
- /* BB user and domain may need to be uppercased */
- user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage);
- domain_l = cifs_strtoUCS(dom_u, domain_n, 511, nls_codepage);
-
- user_l++; /* trailing null */
- domain_l++;
-
- hmac_md5_init_limK_to_64(owf, 16, &ctx);
- hmac_md5_update((const unsigned char *) user_u, user_l * 2, &ctx);
- hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx);
- hmac_md5_final(kr_buf, &ctx);
-
- kfree(user_u);
-}
-#endif
-
-/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
-#if 0 /* currently unused */
-static void
-NTLMSSPOWFencrypt(unsigned char passwd[8],
- unsigned char *ntlmchalresp, unsigned char p24[24])
-{
- unsigned char p21[21];
-
- memset(p21, '\0', 21);
- memcpy(p21, passwd, 8);
- memset(p21 + 8, 0xbd, 8);
-
- E_P24(p21, ntlmchalresp, p24);
-}
-#endif
-
/* Does the NT MD4 hash then des encryption. */
int
-SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
+SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24,
+ const struct nls_table *codepage)
{
int rc;
unsigned char p16[16], p21[21];
@@ -360,7 +236,7 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
memset(p16, '\0', 16);
memset(p21, '\0', 21);
- rc = E_md4hash(passwd, p16);
+ rc = E_md4hash(passwd, p16, codepage);
if (rc) {
cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
return rc;
@@ -369,39 +245,3 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
rc = E_P24(p21, c8, p24);
return rc;
}
-
-
-/* Does the md5 encryption from the NT hash for NTLMv2. */
-/* These routines will be needed later */
-#if 0
-static void
-SMBOWFencrypt_ntv2(const unsigned char kr[16],
- const struct data_blob *srv_chal,
- const struct data_blob *cli_chal, unsigned char resp_buf[16])
-{
- struct HMACMD5Context ctx;
-
- hmac_md5_init_limK_to_64(kr, 16, &ctx);
- hmac_md5_update(srv_chal->data, srv_chal->length, &ctx);
- hmac_md5_update(cli_chal->data, cli_chal->length, &ctx);
- hmac_md5_final(resp_buf, &ctx);
-}
-
-static void
-SMBsesskeygen_ntv2(const unsigned char kr[16],
- const unsigned char *nt_resp, __u8 sess_key[16])
-{
- struct HMACMD5Context ctx;
-
- hmac_md5_init_limK_to_64(kr, 16, &ctx);
- hmac_md5_update(nt_resp, 16, &ctx);
- hmac_md5_final((unsigned char *) sess_key, &ctx);
-}
-
-static void
-SMBsesskeygen_ntv1(const unsigned char kr[16],
- const unsigned char *nt_resp, __u8 sess_key[16])
-{
- mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16);
-}
-#endif
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 10ca6b2c26b..0cc9584f588 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -26,6 +26,7 @@
#include <linux/wait.h>
#include <linux/net.h>
#include <linux/delay.h>
+#include <linux/freezer.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <linux/mempool.h>
@@ -324,7 +325,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
{
int error;
- error = wait_event_killable(server->response_q,
+ error = wait_event_freezekillable(server->response_q,
midQ->midState != MID_REQUEST_SUBMITTED);
if (error < 0)
return -ERESTARTSYS;
@@ -339,8 +340,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
*/
int
cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
- unsigned int nvec, mid_callback_t *callback, void *cbdata,
- bool ignore_pend)
+ unsigned int nvec, mid_receive_t *receive,
+ mid_callback_t *callback, void *cbdata, bool ignore_pend)
{
int rc;
struct mid_q_entry *mid;
@@ -374,6 +375,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
goto out_err;
}
+ mid->receive = receive;
mid->callback = callback;
mid->callback_data = cbdata;
mid->midState = MID_REQUEST_SUBMITTED;
@@ -496,13 +498,18 @@ int
cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
bool log_error)
{
- dump_smb(mid->resp_buf,
- min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length)));
+ unsigned int len = be32_to_cpu(mid->resp_buf->smb_buf_length) + 4;
+
+ dump_smb(mid->resp_buf, min_t(u32, 92, len));
/* convert the length into a more usable form */
if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ struct kvec iov;
+
+ iov.iov_base = mid->resp_buf;
+ iov.iov_len = len;
/* FIXME: add code to kill session */
- if (cifs_verify_signature(mid->resp_buf, server,
+ if (cifs_verify_signature(&iov, 1, server,
mid->sequence_number + 1) != 0)
cERROR(1, "Unexpected SMB signature");
}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 2a22fb2989e..45f07c46f3e 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/posix_acl_xattr.h>
#include <linux/slab.h>
+#include <linux/xattr.h>
#include "cifsfs.h"
#include "cifspdu.h"
#include "cifsglob.h"
@@ -31,16 +32,8 @@
#define MAX_EA_VALUE_SIZE 65535
#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
-#define CIFS_XATTR_USER_PREFIX "user."
-#define CIFS_XATTR_SYSTEM_PREFIX "system."
-#define CIFS_XATTR_OS2_PREFIX "os2."
-#define CIFS_XATTR_SECURITY_PREFIX "security."
-#define CIFS_XATTR_TRUSTED_PREFIX "trusted."
-#define XATTR_TRUSTED_PREFIX_LEN 8
-#define XATTR_SECURITY_PREFIX_LEN 9
-/* BB need to add server (Samba e.g) support for security and trusted prefix */
-
+/* BB need to add server (Samba e.g) support for security and trusted prefix */
int cifs_removexattr(struct dentry *direntry, const char *ea_name)
{
@@ -76,8 +69,8 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
}
if (ea_name == NULL) {
cFYI(1, "Null xattr names not supported");
- } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5)
- && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) {
+ } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+ && (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) {
cFYI(1,
"illegal xattr request %s (only user namespace supported)",
ea_name);
@@ -88,7 +81,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto remove_ea_exit;
- ea_name += 5; /* skip past user. prefix */
+ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL,
(__u16)0, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -149,21 +142,23 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
if (ea_name == NULL) {
cFYI(1, "Null xattr names not supported");
- } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
+ } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+ == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto set_ea_exit;
if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
cFYI(1, "attempt to set cifs inode metadata");
- ea_name += 5; /* skip past user. prefix */
+ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
(__u16)value_size, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
+ } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)
+ == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto set_ea_exit;
- ea_name += 4; /* skip past os2. prefix */
+ ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
(__u16)value_size, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -178,7 +173,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
#ifdef CONFIG_CIFS_ACL
memcpy(pacl, ea_value, value_size);
rc = set_cifs_acl(pacl, value_size,
- direntry->d_inode, full_path);
+ direntry->d_inode, full_path, CIFS_ACL_DACL);
if (rc == 0) /* force revalidate of the inode */
CIFS_I(direntry->d_inode)->time = 0;
kfree(pacl);
@@ -269,7 +264,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
/* return alt name if available as pseudo attr */
if (ea_name == NULL) {
cFYI(1, "Null xattr names not supported");
- } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
+ } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+ == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto get_ea_exit;
@@ -277,15 +273,15 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
cFYI(1, "attempt to query cifs inode metadata");
/* revalidate/getattr then populate from inode */
} /* BB add else when above is implemented */
- ea_name += 5; /* skip past user. prefix */
+ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
buf_size, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
+ } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto get_ea_exit;
- ea_name += 4; /* skip past os2. prefix */
+ ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
buf_size, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -339,10 +335,10 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
cFYI(1, "Query CIFS ACL not supported yet");
#endif /* CONFIG_CIFS_ACL */
} else if (strncmp(ea_name,
- CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
+ XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
cFYI(1, "Trusted xattr namespace not supported yet");
} else if (strncmp(ea_name,
- CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
+ XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
cFYI(1, "Security xattr namespace not supported yet");
} else
cFYI(1,
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 2bdbcc11b37..854ace71268 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -104,7 +104,7 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
if (attr->va_gid != -1)
inode->i_gid = (gid_t) attr->va_gid;
if (attr->va_nlink != -1)
- inode->i_nlink = attr->va_nlink;
+ set_nlink(inode, attr->va_nlink);
if (attr->va_size != -1)
inode->i_size = attr->va_size;
if (attr->va_size != -1)
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index 44e17e9c21a..cc0ea9fe5ec 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -59,12 +59,11 @@ void coda_sysctl_clean(void);
#define CODA_ALLOC(ptr, cast, size) do { \
if (size < PAGE_SIZE) \
- ptr = kmalloc((unsigned long) size, GFP_KERNEL); \
+ ptr = kzalloc((unsigned long) size, GFP_KERNEL); \
else \
- ptr = (cast)vmalloc((unsigned long) size); \
+ ptr = (cast)vzalloc((unsigned long) size); \
if (!ptr) \
printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
- else memset( ptr, 0, size ); \
} while (0)
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 0239433f50c..28e7e135cfa 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -340,7 +340,7 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
if (!error) {
/* VFS may delete the child */
if (de->d_inode)
- de->d_inode->i_nlink = 0;
+ clear_nlink(de->d_inode);
/* fix the link count of the parent */
coda_dir_drop_nlink(dir);
diff --git a/fs/compat.c b/fs/compat.c
index 58b1da45989..c98787536bb 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -37,7 +37,6 @@
#include <linux/dirent.h>
#include <linux/fsnotify.h>
#include <linux/highuid.h>
-#include <linux/nfsd/syscall.h>
#include <linux/personality.h>
#include <linux/rwsem.h>
#include <linux/tsacct_kern.h>
@@ -247,11 +246,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
__put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
__put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
__put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
- __put_user(0, &ubuf->f_spare[0]) ||
- __put_user(0, &ubuf->f_spare[1]) ||
- __put_user(0, &ubuf->f_spare[2]) ||
- __put_user(0, &ubuf->f_spare[3]) ||
- __put_user(0, &ubuf->f_spare[4]))
+ __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+ __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
return -EFAULT;
return 0;
}
@@ -550,7 +546,7 @@ out:
ssize_t compat_rw_copy_check_uvector(int type,
const struct compat_iovec __user *uvector, unsigned long nr_segs,
unsigned long fast_segs, struct iovec *fast_pointer,
- struct iovec **ret_pointer)
+ struct iovec **ret_pointer, int check_access)
{
compat_ssize_t tot_len;
struct iovec *iov = *ret_pointer = fast_pointer;
@@ -597,7 +593,8 @@ ssize_t compat_rw_copy_check_uvector(int type,
}
if (len < 0) /* size_t not fitting in compat_ssize_t .. */
goto out;
- if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
+ if (check_access &&
+ !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
ret = -EFAULT;
goto out;
}
@@ -1111,7 +1108,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
goto out;
tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
- UIO_FASTIOV, iovstack, &iov);
+ UIO_FASTIOV, iovstack, &iov, 1);
if (tot_len == 0) {
ret = 0;
goto out;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c83f4768eea..9d8715c45f2 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -23,7 +23,8 @@
*
* configfs Copyright (C) 2005 Oracle. All rights reserved.
*
- * Please see Documentation/filesystems/configfs.txt for more information.
+ * Please see Documentation/filesystems/configfs/configfs.txt for more
+ * information.
*/
#undef DEBUG
@@ -291,7 +292,7 @@ int __init configfs_inode_init(void)
return bdi_init(&configfs_backing_dev_info);
}
-void __exit configfs_inode_exit(void)
+void configfs_inode_exit(void)
{
bdi_destroy(&configfs_backing_dev_info);
}
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 76dc4c3e5d5..50cee7f9110 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -23,7 +23,7 @@
*
* configfs Copyright (C) 2005 Oracle. All rights reserved.
*
- * Please see the file Documentation/filesystems/configfs.txt for
+ * Please see the file Documentation/filesystems/configfs/configfs.txt for
* critical information about using the config_item interface.
*/
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index ecc62178bed..276e15cafd5 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -143,28 +143,26 @@ static int __init configfs_init(void)
goto out;
config_kobj = kobject_create_and_add("config", kernel_kobj);
- if (!config_kobj) {
- kmem_cache_destroy(configfs_dir_cachep);
- configfs_dir_cachep = NULL;
- goto out;
- }
+ if (!config_kobj)
+ goto out2;
+
+ err = configfs_inode_init();
+ if (err)
+ goto out3;
err = register_filesystem(&configfs_fs_type);
- if (err) {
- printk(KERN_ERR "configfs: Unable to register filesystem!\n");
- kobject_put(config_kobj);
- kmem_cache_destroy(configfs_dir_cachep);
- configfs_dir_cachep = NULL;
- goto out;
- }
+ if (err)
+ goto out4;
- err = configfs_inode_init();
- if (err) {
- unregister_filesystem(&configfs_fs_type);
- kobject_put(config_kobj);
- kmem_cache_destroy(configfs_dir_cachep);
- configfs_dir_cachep = NULL;
- }
+ return 0;
+out4:
+ printk(KERN_ERR "configfs: Unable to register filesystem!\n");
+ configfs_inode_exit();
+out3:
+ kobject_put(config_kobj);
+out2:
+ kmem_cache_destroy(configfs_dir_cachep);
+ configfs_dir_cachep = NULL;
out:
return err;
}
diff --git a/fs/dcache.c b/fs/dcache.c
index a88948b8bd1..89509b5a090 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -36,6 +36,7 @@
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/prefetch.h>
+#include <linux/ratelimit.h>
#include "internal.h"
/*
@@ -225,7 +226,7 @@ static void dentry_unlink_inode(struct dentry * dentry)
}
/*
- * dentry_lru_(add|del|move_tail) must be called with d_lock held.
+ * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held.
*/
static void dentry_lru_add(struct dentry *dentry)
{
@@ -245,6 +246,9 @@ static void __dentry_lru_del(struct dentry *dentry)
dentry_stat.nr_unused--;
}
+/*
+ * Remove a dentry with references from the LRU.
+ */
static void dentry_lru_del(struct dentry *dentry)
{
if (!list_empty(&dentry->d_lru)) {
@@ -254,6 +258,23 @@ static void dentry_lru_del(struct dentry *dentry)
}
}
+/*
+ * Remove a dentry that is unreferenced and about to be pruned
+ * (unhashed and destroyed) from the LRU, and inform the file system.
+ * This wrapper should be called _prior_ to unhashing a victim dentry.
+ */
+static void dentry_lru_prune(struct dentry *dentry)
+{
+ if (!list_empty(&dentry->d_lru)) {
+ if (dentry->d_flags & DCACHE_OP_PRUNE)
+ dentry->d_op->d_prune(dentry);
+
+ spin_lock(&dcache_lru_lock);
+ __dentry_lru_del(dentry);
+ spin_unlock(&dcache_lru_lock);
+ }
+}
+
static void dentry_lru_move_tail(struct dentry *dentry)
{
spin_lock(&dcache_lru_lock);
@@ -403,8 +424,12 @@ relock:
if (ref)
dentry->d_count--;
- /* if dentry was on the d_lru list delete it from there */
- dentry_lru_del(dentry);
+ /*
+ * if dentry was on the d_lru list delete it from there.
+ * inform the fs via d_prune that this dentry is about to be
+ * unhashed and destroyed.
+ */
+ dentry_lru_prune(dentry);
/* if it was on the hash then remove it */
__d_drop(dentry);
return d_kill(dentry, parent);
@@ -522,9 +547,11 @@ int d_invalidate(struct dentry * dentry)
* would make it unreachable from the root,
* we might still populate it if it was a
* working directory or similar).
+ * We also need to leave mountpoints alone,
+ * directory or not.
*/
- if (dentry->d_count > 1) {
- if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
+ if (dentry->d_count > 1 && dentry->d_inode) {
+ if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
spin_unlock(&dentry->d_lock);
return -EBUSY;
}
@@ -854,8 +881,12 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
do {
struct inode *inode;
- /* detach from the system */
- dentry_lru_del(dentry);
+ /*
+ * remove the dentry from the lru, and inform
+ * the fs that this dentry is about to be
+ * unhashed and destroyed.
+ */
+ dentry_lru_prune(dentry);
__d_shrink(dentry);
if (dentry->d_count != 0) {
@@ -1283,6 +1314,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
dentry->d_flags |= DCACHE_OP_REVALIDATE;
if (op->d_delete)
dentry->d_flags |= DCACHE_OP_DELETE;
+ if (op->d_prune)
+ dentry->d_flags |= DCACHE_OP_PRUNE;
}
EXPORT_SYMBOL(d_set_d_op);
@@ -2351,8 +2384,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
actual = __d_unalias(inode, dentry, alias);
}
write_sequnlock(&rename_lock);
- if (IS_ERR(actual))
+ if (IS_ERR(actual)) {
+ if (PTR_ERR(actual) == -ELOOP)
+ pr_warn_ratelimited(
+ "VFS: Lookup of '%s' in %s %s"
+ " would have caused loop\n",
+ dentry->d_name.name,
+ inode->i_sb->s_type->name,
+ inode->i_sb->s_id);
dput(alias);
+ }
goto out_nolock;
}
}
@@ -2398,16 +2439,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
/**
* prepend_path - Prepend path string to a buffer
* @path: the dentry/vfsmount to report
- * @root: root vfsmnt/dentry (may be modified by this function)
+ * @root: root vfsmnt/dentry
* @buffer: pointer to the end of the buffer
* @buflen: pointer to buffer length
*
* Caller holds the rename_lock.
- *
- * If path is not reachable from the supplied root, then the value of
- * root is changed (without modifying refcounts).
*/
-static int prepend_path(const struct path *path, struct path *root,
+static int prepend_path(const struct path *path,
+ const struct path *root,
char **buffer, int *buflen)
{
struct dentry *dentry = path->dentry;
@@ -2442,10 +2481,10 @@ static int prepend_path(const struct path *path, struct path *root,
dentry = parent;
}
-out:
if (!error && !slash)
error = prepend(buffer, buflen, "/", 1);
+out:
br_read_unlock(vfsmount_lock);
return error;
@@ -2459,15 +2498,17 @@ global_root:
WARN(1, "Root dentry has weird name <%.*s>\n",
(int) dentry->d_name.len, dentry->d_name.name);
}
- root->mnt = vfsmnt;
- root->dentry = dentry;
+ if (!slash)
+ error = prepend(buffer, buflen, "/", 1);
+ if (!error)
+ error = vfsmnt->mnt_ns ? 1 : 2;
goto out;
}
/**
* __d_path - return the path of a dentry
* @path: the dentry/vfsmount to report
- * @root: root vfsmnt/dentry (may be modified by this function)
+ * @root: root vfsmnt/dentry
* @buf: buffer to return value in
* @buflen: buffer length
*
@@ -2478,10 +2519,10 @@ global_root:
*
* "buflen" should be positive.
*
- * If path is not reachable from the supplied root, then the value of
- * root is changed (without modifying refcounts).
+ * If the path is not reachable from the supplied root, return %NULL.
*/
-char *__d_path(const struct path *path, struct path *root,
+char *__d_path(const struct path *path,
+ const struct path *root,
char *buf, int buflen)
{
char *res = buf + buflen;
@@ -2492,7 +2533,28 @@ char *__d_path(const struct path *path, struct path *root,
error = prepend_path(path, root, &res, &buflen);
write_sequnlock(&rename_lock);
- if (error)
+ if (error < 0)
+ return ERR_PTR(error);
+ if (error > 0)
+ return NULL;
+ return res;
+}
+
+char *d_absolute_path(const struct path *path,
+ char *buf, int buflen)
+{
+ struct path root = {};
+ char *res = buf + buflen;
+ int error;
+
+ prepend(&res, &buflen, "\0", 1);
+ write_seqlock(&rename_lock);
+ error = prepend_path(path, &root, &res, &buflen);
+ write_sequnlock(&rename_lock);
+
+ if (error > 1)
+ error = -EINVAL;
+ if (error < 0)
return ERR_PTR(error);
return res;
}
@@ -2500,8 +2562,9 @@ char *__d_path(const struct path *path, struct path *root,
/*
* same as __d_path but appends "(deleted)" for unlinked files.
*/
-static int path_with_deleted(const struct path *path, struct path *root,
- char **buf, int *buflen)
+static int path_with_deleted(const struct path *path,
+ const struct path *root,
+ char **buf, int *buflen)
{
prepend(buf, buflen, "\0", 1);
if (d_unlinked(path->dentry)) {
@@ -2538,7 +2601,6 @@ char *d_path(const struct path *path, char *buf, int buflen)
{
char *res = buf + buflen;
struct path root;
- struct path tmp;
int error;
/*
@@ -2553,9 +2615,8 @@ char *d_path(const struct path *path, char *buf, int buflen)
get_fs_root(current->fs, &root);
write_seqlock(&rename_lock);
- tmp = root;
- error = path_with_deleted(path, &tmp, &res, &buflen);
- if (error)
+ error = path_with_deleted(path, &root, &res, &buflen);
+ if (error < 0)
res = ERR_PTR(error);
write_sequnlock(&rename_lock);
path_put(&root);
@@ -2576,7 +2637,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
{
char *res = buf + buflen;
struct path root;
- struct path tmp;
int error;
if (path->dentry->d_op && path->dentry->d_op->d_dname)
@@ -2584,9 +2644,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
get_fs_root(current->fs, &root);
write_seqlock(&rename_lock);
- tmp = root;
- error = path_with_deleted(path, &tmp, &res, &buflen);
- if (!error && !path_equal(&tmp, &root))
+ error = path_with_deleted(path, &root, &res, &buflen);
+ if (error > 0)
error = prepend_unreachable(&res, &buflen);
write_sequnlock(&rename_lock);
path_put(&root);
@@ -2717,19 +2776,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
write_seqlock(&rename_lock);
if (!d_unlinked(pwd.dentry)) {
unsigned long len;
- struct path tmp = root;
char *cwd = page + PAGE_SIZE;
int buflen = PAGE_SIZE;
prepend(&cwd, &buflen, "\0", 1);
- error = prepend_path(&pwd, &tmp, &cwd, &buflen);
+ error = prepend_path(&pwd, &root, &cwd, &buflen);
write_sequnlock(&rename_lock);
- if (error)
+ if (error < 0)
goto out;
/* Unreachable from current root */
- if (!path_equal(&tmp, &root)) {
+ if (error > 0) {
error = prepend_unreachable(&cwd, &buflen);
if (error)
goto out;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e7a7a2f0732..f3a257d7a98 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -1,5 +1,5 @@
/*
- * file.c - part of debugfs, a tiny little debug file system
+ * inode.c - part of debugfs, a tiny little debug file system
*
* Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
* Copyright (C) 2004 IBM Inc.
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 2f27e578d46..d5d5297efe9 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -307,7 +307,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
s->s_root = d_alloc_root(inode);
if (s->s_root)
@@ -549,7 +549,7 @@ void devpts_pty_kill(struct tty_struct *tty)
dentry = d_find_alias(inode);
- inode->i_nlink--;
+ drop_nlink(inode);
d_delete(dentry);
dput(dentry); /* d_alloc_name() in devpts_pty_new() */
dput(dentry); /* d_find_alias above */
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 44a360ca804..d740ab67ff6 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -39,7 +39,7 @@
/*
* How many user pages to map in one call to get_user_pages(). This determines
- * the size of a structure on the stack.
+ * the size of a structure in the slab cache
*/
#define DIO_PAGES 64
@@ -55,13 +55,10 @@
* blocksize.
*/
-struct dio {
- /* BIO submission state */
+/* dio_state only used in the submission path */
+
+struct dio_submit {
struct bio *bio; /* bio under assembly */
- struct inode *inode;
- int rw;
- loff_t i_size; /* i_size when submitted */
- int flags; /* doesn't change */
unsigned blkbits; /* doesn't change */
unsigned blkfactor; /* When we're using an alignment which
is finer than the filesystem's soft
@@ -76,18 +73,17 @@ struct dio {
sector_t block_in_file; /* Current offset into the underlying
file in dio_block units. */
unsigned blocks_available; /* At block_in_file. changes */
+ int reap_counter; /* rate limit reaping */
sector_t final_block_in_request;/* doesn't change */
unsigned first_block_in_page; /* doesn't change, Used only once */
int boundary; /* prev block is at a boundary */
- int reap_counter; /* rate limit reaping */
get_block_t *get_block; /* block mapping function */
- dio_iodone_t *end_io; /* IO completion function */
dio_submit_t *submit_io; /* IO submition function */
+
loff_t logical_offset_in_bio; /* current first logical block in bio */
sector_t final_block_in_bio; /* current final block in bio + 1 */
sector_t next_block_for_io; /* next block to be put under IO,
in dio_blocks units */
- struct buffer_head map_bh; /* last get_block() result */
/*
* Deferred addition of a page to the dio. These variables are
@@ -100,18 +96,6 @@ struct dio {
sector_t cur_page_block; /* Where it starts */
loff_t cur_page_fs_offset; /* Offset in file */
- /* BIO completion state */
- spinlock_t bio_lock; /* protects BIO fields below */
- unsigned long refcount; /* direct_io_worker() and bios */
- struct bio *bio_list; /* singly linked via bi_private */
- struct task_struct *waiter; /* waiting task (NULL if none) */
-
- /* AIO related stuff */
- struct kiocb *iocb; /* kiocb */
- int is_async; /* is IO async ? */
- int io_error; /* IO error in completion path */
- ssize_t result; /* IO result */
-
/*
* Page fetching state. These variables belong to dio_refill_pages().
*/
@@ -125,7 +109,30 @@ struct dio {
*/
unsigned head; /* next page to process */
unsigned tail; /* last valid page + 1 */
+};
+
+/* dio_state communicated between submission path and end_io */
+struct dio {
+ int flags; /* doesn't change */
+ int rw;
+ struct inode *inode;
+ loff_t i_size; /* i_size when submitted */
+ dio_iodone_t *end_io; /* IO completion function */
+
+ void *private; /* copy from map_bh.b_private */
+
+ /* BIO completion state */
+ spinlock_t bio_lock; /* protects BIO fields below */
int page_errors; /* errno from get_user_pages() */
+ int is_async; /* is IO async ? */
+ int io_error; /* IO error in completion path */
+ unsigned long refcount; /* direct_io_worker() and bios */
+ struct bio *bio_list; /* singly linked via bi_private */
+ struct task_struct *waiter; /* waiting task (NULL if none) */
+
+ /* AIO related stuff */
+ struct kiocb *iocb; /* kiocb */
+ ssize_t result; /* IO result */
/*
* pages[] (and any fields placed after it) are not zeroed out at
@@ -133,7 +140,9 @@ struct dio {
* wish that they not be zeroed.
*/
struct page *pages[DIO_PAGES]; /* page buffer */
-};
+} ____cacheline_aligned_in_smp;
+
+static struct kmem_cache *dio_cache __read_mostly;
static void __inode_dio_wait(struct inode *inode)
{
@@ -182,27 +191,27 @@ EXPORT_SYMBOL_GPL(inode_dio_done);
/*
* How many pages are in the queue?
*/
-static inline unsigned dio_pages_present(struct dio *dio)
+static inline unsigned dio_pages_present(struct dio_submit *sdio)
{
- return dio->tail - dio->head;
+ return sdio->tail - sdio->head;
}
/*
* Go grab and pin some userspace pages. Typically we'll get 64 at a time.
*/
-static int dio_refill_pages(struct dio *dio)
+static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
int ret;
int nr_pages;
- nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
+ nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES);
ret = get_user_pages_fast(
- dio->curr_user_address, /* Where from? */
+ sdio->curr_user_address, /* Where from? */
nr_pages, /* How many pages? */
dio->rw == READ, /* Write to memory? */
&dio->pages[0]); /* Put results here */
- if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
+ if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
struct page *page = ZERO_PAGE(0);
/*
* A memory fault, but the filesystem has some outstanding
@@ -213,17 +222,17 @@ static int dio_refill_pages(struct dio *dio)
dio->page_errors = ret;
page_cache_get(page);
dio->pages[0] = page;
- dio->head = 0;
- dio->tail = 1;
+ sdio->head = 0;
+ sdio->tail = 1;
ret = 0;
goto out;
}
if (ret >= 0) {
- dio->curr_user_address += ret * PAGE_SIZE;
- dio->curr_page += ret;
- dio->head = 0;
- dio->tail = ret;
+ sdio->curr_user_address += ret * PAGE_SIZE;
+ sdio->curr_page += ret;
+ sdio->head = 0;
+ sdio->tail = ret;
ret = 0;
}
out:
@@ -236,17 +245,18 @@ out:
* decent number of pages, less frequently. To provide nicer use of the
* L1 cache.
*/
-static struct page *dio_get_page(struct dio *dio)
+static inline struct page *dio_get_page(struct dio *dio,
+ struct dio_submit *sdio)
{
- if (dio_pages_present(dio) == 0) {
+ if (dio_pages_present(sdio) == 0) {
int ret;
- ret = dio_refill_pages(dio);
+ ret = dio_refill_pages(dio, sdio);
if (ret)
return ERR_PTR(ret);
- BUG_ON(dio_pages_present(dio) == 0);
+ BUG_ON(dio_pages_present(sdio) == 0);
}
- return dio->pages[dio->head++];
+ return dio->pages[sdio->head++];
}
/**
@@ -292,7 +302,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
if (dio->end_io && dio->result) {
dio->end_io(dio->iocb, offset, transferred,
- dio->map_bh.b_private, ret, is_async);
+ dio->private, ret, is_async);
} else {
if (is_async)
aio_complete(dio->iocb, ret, 0);
@@ -323,7 +333,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
if (remaining == 0) {
dio_complete(dio, dio->iocb->ki_pos, 0, true);
- kfree(dio);
+ kmem_cache_free(dio_cache, dio);
}
}
@@ -367,9 +377,10 @@ void dio_end_io(struct bio *bio, int error)
}
EXPORT_SYMBOL_GPL(dio_end_io);
-static void
-dio_bio_alloc(struct dio *dio, struct block_device *bdev,
- sector_t first_sector, int nr_vecs)
+static inline void
+dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
+ struct block_device *bdev,
+ sector_t first_sector, int nr_vecs)
{
struct bio *bio;
@@ -386,8 +397,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
else
bio->bi_end_io = dio_bio_end_io;
- dio->bio = bio;
- dio->logical_offset_in_bio = dio->cur_page_fs_offset;
+ sdio->bio = bio;
+ sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
/*
@@ -397,9 +408,9 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
*
* bios hold a dio reference between submit_bio and ->end_io.
*/
-static void dio_bio_submit(struct dio *dio)
+static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
{
- struct bio *bio = dio->bio;
+ struct bio *bio = sdio->bio;
unsigned long flags;
bio->bi_private = dio;
@@ -411,24 +422,24 @@ static void dio_bio_submit(struct dio *dio)
if (dio->is_async && dio->rw == READ)
bio_set_pages_dirty(bio);
- if (dio->submit_io)
- dio->submit_io(dio->rw, bio, dio->inode,
- dio->logical_offset_in_bio);
+ if (sdio->submit_io)
+ sdio->submit_io(dio->rw, bio, dio->inode,
+ sdio->logical_offset_in_bio);
else
submit_bio(dio->rw, bio);
- dio->bio = NULL;
- dio->boundary = 0;
- dio->logical_offset_in_bio = 0;
+ sdio->bio = NULL;
+ sdio->boundary = 0;
+ sdio->logical_offset_in_bio = 0;
}
/*
* Release any resources in case of a failure
*/
-static void dio_cleanup(struct dio *dio)
+static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
{
- while (dio_pages_present(dio))
- page_cache_release(dio_get_page(dio));
+ while (dio_pages_present(sdio))
+ page_cache_release(dio_get_page(dio, sdio));
}
/*
@@ -518,11 +529,11 @@ static void dio_await_completion(struct dio *dio)
*
* This also helps to limit the peak amount of pinned userspace memory.
*/
-static int dio_bio_reap(struct dio *dio)
+static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
{
int ret = 0;
- if (dio->reap_counter++ >= 64) {
+ if (sdio->reap_counter++ >= 64) {
while (dio->bio_list) {
unsigned long flags;
struct bio *bio;
@@ -536,14 +547,14 @@ static int dio_bio_reap(struct dio *dio)
if (ret == 0)
ret = ret2;
}
- dio->reap_counter = 0;
+ sdio->reap_counter = 0;
}
return ret;
}
/*
* Call into the fs to map some more disk blocks. We record the current number
- * of available blocks at dio->blocks_available. These are in units of the
+ * of available blocks at sdio->blocks_available. These are in units of the
* fs blocksize, (1 << inode->i_blkbits).
*
* The fs is allowed to map lots of blocks at once. If it wants to do that,
@@ -564,10 +575,10 @@ static int dio_bio_reap(struct dio *dio)
* buffer_mapped(). However the direct-io code will only process holes one
* block at a time - it will repeatedly call get_block() as it walks the hole.
*/
-static int get_more_blocks(struct dio *dio)
+static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
+ struct buffer_head *map_bh)
{
int ret;
- struct buffer_head *map_bh = &dio->map_bh;
sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
unsigned long fs_count; /* Number of filesystem-sized blocks */
unsigned long dio_count;/* Number of dio_block-sized blocks */
@@ -580,11 +591,11 @@ static int get_more_blocks(struct dio *dio)
*/
ret = dio->page_errors;
if (ret == 0) {
- BUG_ON(dio->block_in_file >= dio->final_block_in_request);
- fs_startblk = dio->block_in_file >> dio->blkfactor;
- dio_count = dio->final_block_in_request - dio->block_in_file;
- fs_count = dio_count >> dio->blkfactor;
- blkmask = (1 << dio->blkfactor) - 1;
+ BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
+ fs_startblk = sdio->block_in_file >> sdio->blkfactor;
+ dio_count = sdio->final_block_in_request - sdio->block_in_file;
+ fs_count = dio_count >> sdio->blkfactor;
+ blkmask = (1 << sdio->blkfactor) - 1;
if (dio_count & blkmask)
fs_count++;
@@ -604,13 +615,16 @@ static int get_more_blocks(struct dio *dio)
*/
create = dio->rw & WRITE;
if (dio->flags & DIO_SKIP_HOLES) {
- if (dio->block_in_file < (i_size_read(dio->inode) >>
- dio->blkbits))
+ if (sdio->block_in_file < (i_size_read(dio->inode) >>
+ sdio->blkbits))
create = 0;
}
- ret = (*dio->get_block)(dio->inode, fs_startblk,
+ ret = (*sdio->get_block)(dio->inode, fs_startblk,
map_bh, create);
+
+ /* Store for completion */
+ dio->private = map_bh->b_private;
}
return ret;
}
@@ -618,20 +632,21 @@ static int get_more_blocks(struct dio *dio)
/*
* There is no bio. Make one now.
*/
-static int dio_new_bio(struct dio *dio, sector_t start_sector)
+static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
+ sector_t start_sector, struct buffer_head *map_bh)
{
sector_t sector;
int ret, nr_pages;
- ret = dio_bio_reap(dio);
+ ret = dio_bio_reap(dio, sdio);
if (ret)
goto out;
- sector = start_sector << (dio->blkbits - 9);
- nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
+ sector = start_sector << (sdio->blkbits - 9);
+ nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
nr_pages = min(nr_pages, BIO_MAX_PAGES);
BUG_ON(nr_pages <= 0);
- dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
- dio->boundary = 0;
+ dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
+ sdio->boundary = 0;
out:
return ret;
}
@@ -643,21 +658,21 @@ out:
*
* Return zero on success. Non-zero means the caller needs to start a new BIO.
*/
-static int dio_bio_add_page(struct dio *dio)
+static inline int dio_bio_add_page(struct dio_submit *sdio)
{
int ret;
- ret = bio_add_page(dio->bio, dio->cur_page,
- dio->cur_page_len, dio->cur_page_offset);
- if (ret == dio->cur_page_len) {
+ ret = bio_add_page(sdio->bio, sdio->cur_page,
+ sdio->cur_page_len, sdio->cur_page_offset);
+ if (ret == sdio->cur_page_len) {
/*
* Decrement count only, if we are done with this page
*/
- if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE)
- dio->pages_in_io--;
- page_cache_get(dio->cur_page);
- dio->final_block_in_bio = dio->cur_page_block +
- (dio->cur_page_len >> dio->blkbits);
+ if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
+ sdio->pages_in_io--;
+ page_cache_get(sdio->cur_page);
+ sdio->final_block_in_bio = sdio->cur_page_block +
+ (sdio->cur_page_len >> sdio->blkbits);
ret = 0;
} else {
ret = 1;
@@ -675,14 +690,15 @@ static int dio_bio_add_page(struct dio *dio)
* The caller of this function is responsible for removing cur_page from the
* dio, and for dropping the refcount which came from that presence.
*/
-static int dio_send_cur_page(struct dio *dio)
+static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
+ struct buffer_head *map_bh)
{
int ret = 0;
- if (dio->bio) {
- loff_t cur_offset = dio->cur_page_fs_offset;
- loff_t bio_next_offset = dio->logical_offset_in_bio +
- dio->bio->bi_size;
+ if (sdio->bio) {
+ loff_t cur_offset = sdio->cur_page_fs_offset;
+ loff_t bio_next_offset = sdio->logical_offset_in_bio +
+ sdio->bio->bi_size;
/*
* See whether this new request is contiguous with the old.
@@ -698,28 +714,28 @@ static int dio_send_cur_page(struct dio *dio)
* be the next logical offset in the bio, submit the bio we
* have.
*/
- if (dio->final_block_in_bio != dio->cur_page_block ||
+ if (sdio->final_block_in_bio != sdio->cur_page_block ||
cur_offset != bio_next_offset)
- dio_bio_submit(dio);
+ dio_bio_submit(dio, sdio);
/*
* Submit now if the underlying fs is about to perform a
* metadata read
*/
- else if (dio->boundary)
- dio_bio_submit(dio);
+ else if (sdio->boundary)
+ dio_bio_submit(dio, sdio);
}
- if (dio->bio == NULL) {
- ret = dio_new_bio(dio, dio->cur_page_block);
+ if (sdio->bio == NULL) {
+ ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
if (ret)
goto out;
}
- if (dio_bio_add_page(dio) != 0) {
- dio_bio_submit(dio);
- ret = dio_new_bio(dio, dio->cur_page_block);
+ if (dio_bio_add_page(sdio) != 0) {
+ dio_bio_submit(dio, sdio);
+ ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
if (ret == 0) {
- ret = dio_bio_add_page(dio);
+ ret = dio_bio_add_page(sdio);
BUG_ON(ret != 0);
}
}
@@ -744,9 +760,10 @@ out:
* If that doesn't work out then we put the old page into the bio and add this
* page to the dio instead.
*/
-static int
-submit_page_section(struct dio *dio, struct page *page,
- unsigned offset, unsigned len, sector_t blocknr)
+static inline int
+submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
+ unsigned offset, unsigned len, sector_t blocknr,
+ struct buffer_head *map_bh)
{
int ret = 0;
@@ -760,20 +777,20 @@ submit_page_section(struct dio *dio, struct page *page,
/*
* Can we just grow the current page's presence in the dio?
*/
- if ( (dio->cur_page == page) &&
- (dio->cur_page_offset + dio->cur_page_len == offset) &&
- (dio->cur_page_block +
- (dio->cur_page_len >> dio->blkbits) == blocknr)) {
- dio->cur_page_len += len;
+ if (sdio->cur_page == page &&
+ sdio->cur_page_offset + sdio->cur_page_len == offset &&
+ sdio->cur_page_block +
+ (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
+ sdio->cur_page_len += len;
/*
- * If dio->boundary then we want to schedule the IO now to
+ * If sdio->boundary then we want to schedule the IO now to
* avoid metadata seeks.
*/
- if (dio->boundary) {
- ret = dio_send_cur_page(dio);
- page_cache_release(dio->cur_page);
- dio->cur_page = NULL;
+ if (sdio->boundary) {
+ ret = dio_send_cur_page(dio, sdio, map_bh);
+ page_cache_release(sdio->cur_page);
+ sdio->cur_page = NULL;
}
goto out;
}
@@ -781,20 +798,20 @@ submit_page_section(struct dio *dio, struct page *page,
/*
* If there's a deferred page already there then send it.
*/
- if (dio->cur_page) {
- ret = dio_send_cur_page(dio);
- page_cache_release(dio->cur_page);
- dio->cur_page = NULL;
+ if (sdio->cur_page) {
+ ret = dio_send_cur_page(dio, sdio, map_bh);
+ page_cache_release(sdio->cur_page);
+ sdio->cur_page = NULL;
if (ret)
goto out;
}
page_cache_get(page); /* It is in dio */
- dio->cur_page = page;
- dio->cur_page_offset = offset;
- dio->cur_page_len = len;
- dio->cur_page_block = blocknr;
- dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
+ sdio->cur_page = page;
+ sdio->cur_page_offset = offset;
+ sdio->cur_page_len = len;
+ sdio->cur_page_block = blocknr;
+ sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
out:
return ret;
}
@@ -804,16 +821,16 @@ out:
* file blocks. Only called for S_ISREG files - blockdevs do not set
* buffer_new
*/
-static void clean_blockdev_aliases(struct dio *dio)
+static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
{
unsigned i;
unsigned nblocks;
- nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits;
+ nblocks = map_bh->b_size >> dio->inode->i_blkbits;
for (i = 0; i < nblocks; i++) {
- unmap_underlying_metadata(dio->map_bh.b_bdev,
- dio->map_bh.b_blocknr + i);
+ unmap_underlying_metadata(map_bh->b_bdev,
+ map_bh->b_blocknr + i);
}
}
@@ -826,19 +843,20 @@ static void clean_blockdev_aliases(struct dio *dio)
* `end' is zero if we're doing the start of the IO, 1 at the end of the
* IO.
*/
-static void dio_zero_block(struct dio *dio, int end)
+static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
+ int end, struct buffer_head *map_bh)
{
unsigned dio_blocks_per_fs_block;
unsigned this_chunk_blocks; /* In dio_blocks */
unsigned this_chunk_bytes;
struct page *page;
- dio->start_zero_done = 1;
- if (!dio->blkfactor || !buffer_new(&dio->map_bh))
+ sdio->start_zero_done = 1;
+ if (!sdio->blkfactor || !buffer_new(map_bh))
return;
- dio_blocks_per_fs_block = 1 << dio->blkfactor;
- this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);
+ dio_blocks_per_fs_block = 1 << sdio->blkfactor;
+ this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);
if (!this_chunk_blocks)
return;
@@ -850,14 +868,14 @@ static void dio_zero_block(struct dio *dio, int end)
if (end)
this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
- this_chunk_bytes = this_chunk_blocks << dio->blkbits;
+ this_chunk_bytes = this_chunk_blocks << sdio->blkbits;
page = ZERO_PAGE(0);
- if (submit_page_section(dio, page, 0, this_chunk_bytes,
- dio->next_block_for_io))
+ if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes,
+ sdio->next_block_for_io, map_bh))
return;
- dio->next_block_for_io += this_chunk_blocks;
+ sdio->next_block_for_io += this_chunk_blocks;
}
/*
@@ -876,20 +894,20 @@ static void dio_zero_block(struct dio *dio, int end)
* it should set b_size to PAGE_SIZE or more inside get_block(). This gives
* fine alignment but still allows this function to work in PAGE_SIZE units.
*/
-static int do_direct_IO(struct dio *dio)
+static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
+ struct buffer_head *map_bh)
{
- const unsigned blkbits = dio->blkbits;
+ const unsigned blkbits = sdio->blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
struct page *page;
unsigned block_in_page;
- struct buffer_head *map_bh = &dio->map_bh;
int ret = 0;
/* The I/O can start at any block offset within the first page */
- block_in_page = dio->first_block_in_page;
+ block_in_page = sdio->first_block_in_page;
- while (dio->block_in_file < dio->final_block_in_request) {
- page = dio_get_page(dio);
+ while (sdio->block_in_file < sdio->final_block_in_request) {
+ page = dio_get_page(dio, sdio);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
@@ -901,14 +919,14 @@ static int do_direct_IO(struct dio *dio)
unsigned this_chunk_blocks; /* # of blocks */
unsigned u;
- if (dio->blocks_available == 0) {
+ if (sdio->blocks_available == 0) {
/*
* Need to go and map some more disk
*/
unsigned long blkmask;
unsigned long dio_remainder;
- ret = get_more_blocks(dio);
+ ret = get_more_blocks(dio, sdio, map_bh);
if (ret) {
page_cache_release(page);
goto out;
@@ -916,18 +934,18 @@ static int do_direct_IO(struct dio *dio)
if (!buffer_mapped(map_bh))
goto do_holes;
- dio->blocks_available =
- map_bh->b_size >> dio->blkbits;
- dio->next_block_for_io =
- map_bh->b_blocknr << dio->blkfactor;
+ sdio->blocks_available =
+ map_bh->b_size >> sdio->blkbits;
+ sdio->next_block_for_io =
+ map_bh->b_blocknr << sdio->blkfactor;
if (buffer_new(map_bh))
- clean_blockdev_aliases(dio);
+ clean_blockdev_aliases(dio, map_bh);
- if (!dio->blkfactor)
+ if (!sdio->blkfactor)
goto do_holes;
- blkmask = (1 << dio->blkfactor) - 1;
- dio_remainder = (dio->block_in_file & blkmask);
+ blkmask = (1 << sdio->blkfactor) - 1;
+ dio_remainder = (sdio->block_in_file & blkmask);
/*
* If we are at the start of IO and that IO
@@ -941,8 +959,8 @@ static int do_direct_IO(struct dio *dio)
* on-disk
*/
if (!buffer_new(map_bh))
- dio->next_block_for_io += dio_remainder;
- dio->blocks_available -= dio_remainder;
+ sdio->next_block_for_io += dio_remainder;
+ sdio->blocks_available -= dio_remainder;
}
do_holes:
/* Handle holes */
@@ -961,7 +979,7 @@ do_holes:
*/
i_size_aligned = ALIGN(i_size_read(dio->inode),
1 << blkbits);
- if (dio->block_in_file >=
+ if (sdio->block_in_file >=
i_size_aligned >> blkbits) {
/* We hit eof */
page_cache_release(page);
@@ -969,7 +987,7 @@ do_holes:
}
zero_user(page, block_in_page << blkbits,
1 << blkbits);
- dio->block_in_file++;
+ sdio->block_in_file++;
block_in_page++;
goto next_block;
}
@@ -979,38 +997,41 @@ do_holes:
* is finer than the underlying fs, go check to see if
* we must zero out the start of this block.
*/
- if (unlikely(dio->blkfactor && !dio->start_zero_done))
- dio_zero_block(dio, 0);
+ if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
+ dio_zero_block(dio, sdio, 0, map_bh);
/*
* Work out, in this_chunk_blocks, how much disk we
* can add to this page
*/
- this_chunk_blocks = dio->blocks_available;
+ this_chunk_blocks = sdio->blocks_available;
u = (PAGE_SIZE - offset_in_page) >> blkbits;
if (this_chunk_blocks > u)
this_chunk_blocks = u;
- u = dio->final_block_in_request - dio->block_in_file;
+ u = sdio->final_block_in_request - sdio->block_in_file;
if (this_chunk_blocks > u)
this_chunk_blocks = u;
this_chunk_bytes = this_chunk_blocks << blkbits;
BUG_ON(this_chunk_bytes == 0);
- dio->boundary = buffer_boundary(map_bh);
- ret = submit_page_section(dio, page, offset_in_page,
- this_chunk_bytes, dio->next_block_for_io);
+ sdio->boundary = buffer_boundary(map_bh);
+ ret = submit_page_section(dio, sdio, page,
+ offset_in_page,
+ this_chunk_bytes,
+ sdio->next_block_for_io,
+ map_bh);
if (ret) {
page_cache_release(page);
goto out;
}
- dio->next_block_for_io += this_chunk_blocks;
+ sdio->next_block_for_io += this_chunk_blocks;
- dio->block_in_file += this_chunk_blocks;
+ sdio->block_in_file += this_chunk_blocks;
block_in_page += this_chunk_blocks;
- dio->blocks_available -= this_chunk_blocks;
+ sdio->blocks_available -= this_chunk_blocks;
next_block:
- BUG_ON(dio->block_in_file > dio->final_block_in_request);
- if (dio->block_in_file == dio->final_block_in_request)
+ BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
+ if (sdio->block_in_file == sdio->final_block_in_request)
break;
}
@@ -1022,135 +1043,10 @@ out:
return ret;
}
-static ssize_t
-direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs,
- unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
- dio_submit_t submit_io, struct dio *dio)
+static inline int drop_refcount(struct dio *dio)
{
- unsigned long user_addr;
+ int ret2;
unsigned long flags;
- int seg;
- ssize_t ret = 0;
- ssize_t ret2;
- size_t bytes;
-
- dio->inode = inode;
- dio->rw = rw;
- dio->blkbits = blkbits;
- dio->blkfactor = inode->i_blkbits - blkbits;
- dio->block_in_file = offset >> blkbits;
-
- dio->get_block = get_block;
- dio->end_io = end_io;
- dio->submit_io = submit_io;
- dio->final_block_in_bio = -1;
- dio->next_block_for_io = -1;
-
- dio->iocb = iocb;
- dio->i_size = i_size_read(inode);
-
- spin_lock_init(&dio->bio_lock);
- dio->refcount = 1;
-
- /*
- * In case of non-aligned buffers, we may need 2 more
- * pages since we need to zero out first and last block.
- */
- if (unlikely(dio->blkfactor))
- dio->pages_in_io = 2;
-
- for (seg = 0; seg < nr_segs; seg++) {
- user_addr = (unsigned long)iov[seg].iov_base;
- dio->pages_in_io +=
- ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
- - user_addr/PAGE_SIZE);
- }
-
- for (seg = 0; seg < nr_segs; seg++) {
- user_addr = (unsigned long)iov[seg].iov_base;
- dio->size += bytes = iov[seg].iov_len;
-
- /* Index into the first page of the first block */
- dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
- dio->final_block_in_request = dio->block_in_file +
- (bytes >> blkbits);
- /* Page fetching state */
- dio->head = 0;
- dio->tail = 0;
- dio->curr_page = 0;
-
- dio->total_pages = 0;
- if (user_addr & (PAGE_SIZE-1)) {
- dio->total_pages++;
- bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
- }
- dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
- dio->curr_user_address = user_addr;
-
- ret = do_direct_IO(dio);
-
- dio->result += iov[seg].iov_len -
- ((dio->final_block_in_request - dio->block_in_file) <<
- blkbits);
-
- if (ret) {
- dio_cleanup(dio);
- break;
- }
- } /* end iovec loop */
-
- if (ret == -ENOTBLK) {
- /*
- * The remaining part of the request will be
- * be handled by buffered I/O when we return
- */
- ret = 0;
- }
- /*
- * There may be some unwritten disk at the end of a part-written
- * fs-block-sized block. Go zero that now.
- */
- dio_zero_block(dio, 1);
-
- if (dio->cur_page) {
- ret2 = dio_send_cur_page(dio);
- if (ret == 0)
- ret = ret2;
- page_cache_release(dio->cur_page);
- dio->cur_page = NULL;
- }
- if (dio->bio)
- dio_bio_submit(dio);
-
- /*
- * It is possible that, we return short IO due to end of file.
- * In that case, we need to release all the pages we got hold on.
- */
- dio_cleanup(dio);
-
- /*
- * All block lookups have been performed. For READ requests
- * we can let i_mutex go now that its achieved its purpose
- * of protecting us from looking up uninitialized blocks.
- */
- if (rw == READ && (dio->flags & DIO_LOCKING))
- mutex_unlock(&dio->inode->i_mutex);
-
- /*
- * The only time we want to leave bios in flight is when a successful
- * partial aio read or full aio write have been setup. In that case
- * bio completion will call aio_complete. The only time it's safe to
- * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
- * This had *better* be the only place that raises -EIOCBQUEUED.
- */
- BUG_ON(ret == -EIOCBQUEUED);
- if (dio->is_async && ret == 0 && dio->result &&
- ((rw & READ) || (dio->result == dio->size)))
- ret = -EIOCBQUEUED;
-
- if (ret != -EIOCBQUEUED)
- dio_await_completion(dio);
/*
* Sync will always be dropping the final ref and completing the
@@ -1166,14 +1062,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
spin_lock_irqsave(&dio->bio_lock, flags);
ret2 = --dio->refcount;
spin_unlock_irqrestore(&dio->bio_lock, flags);
-
- if (ret2 == 0) {
- ret = dio_complete(dio, offset, ret, false);
- kfree(dio);
- } else
- BUG_ON(ret != -EIOCBQUEUED);
-
- return ret;
+ return ret2;
}
/*
@@ -1195,6 +1084,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
* expected that filesystem provide exclusion between new direct I/O
* and truncates. For DIO_LOCKING filesystems this is done by i_mutex,
* but other filesystems need to take care of this on their own.
+ *
+ * NOTE: if you pass "sdio" to anything by pointer make sure that function
+ * is always inlined. Otherwise gcc is unable to split the structure into
+ * individual fields and will generate much worse code. This is important
+ * for the whole file.
*/
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
@@ -1211,6 +1105,10 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
ssize_t retval = -EINVAL;
loff_t end = offset;
struct dio *dio;
+ struct dio_submit sdio = { 0, };
+ unsigned long user_addr;
+ size_t bytes;
+ struct buffer_head map_bh = { 0, };
if (rw & WRITE)
rw = WRITE_ODIRECT;
@@ -1244,7 +1142,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (rw == READ && end == offset)
return 0;
- dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+ dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
retval = -ENOMEM;
if (!dio)
goto out;
@@ -1268,7 +1166,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
end - 1);
if (retval) {
mutex_unlock(&inode->i_mutex);
- kfree(dio);
+ kmem_cache_free(dio_cache, dio);
goto out;
}
}
@@ -1288,11 +1186,141 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
(end > i_size_read(inode)));
- retval = direct_io_worker(rw, iocb, inode, iov, offset,
- nr_segs, blkbits, get_block, end_io,
- submit_io, dio);
+ retval = 0;
+
+ dio->inode = inode;
+ dio->rw = rw;
+ sdio.blkbits = blkbits;
+ sdio.blkfactor = inode->i_blkbits - blkbits;
+ sdio.block_in_file = offset >> blkbits;
+
+ sdio.get_block = get_block;
+ dio->end_io = end_io;
+ sdio.submit_io = submit_io;
+ sdio.final_block_in_bio = -1;
+ sdio.next_block_for_io = -1;
+
+ dio->iocb = iocb;
+ dio->i_size = i_size_read(inode);
+
+ spin_lock_init(&dio->bio_lock);
+ dio->refcount = 1;
+
+ /*
+ * In case of non-aligned buffers, we may need 2 more
+ * pages since we need to zero out first and last block.
+ */
+ if (unlikely(sdio.blkfactor))
+ sdio.pages_in_io = 2;
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ user_addr = (unsigned long)iov[seg].iov_base;
+ sdio.pages_in_io +=
+ ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
+ PAGE_SIZE - user_addr / PAGE_SIZE);
+ }
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ user_addr = (unsigned long)iov[seg].iov_base;
+ sdio.size += bytes = iov[seg].iov_len;
+
+ /* Index into the first page of the first block */
+ sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
+ sdio.final_block_in_request = sdio.block_in_file +
+ (bytes >> blkbits);
+ /* Page fetching state */
+ sdio.head = 0;
+ sdio.tail = 0;
+ sdio.curr_page = 0;
+
+ sdio.total_pages = 0;
+ if (user_addr & (PAGE_SIZE-1)) {
+ sdio.total_pages++;
+ bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
+ }
+ sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+ sdio.curr_user_address = user_addr;
+
+ retval = do_direct_IO(dio, &sdio, &map_bh);
+
+ dio->result += iov[seg].iov_len -
+ ((sdio.final_block_in_request - sdio.block_in_file) <<
+ blkbits);
+
+ if (retval) {
+ dio_cleanup(dio, &sdio);
+ break;
+ }
+ } /* end iovec loop */
+
+ if (retval == -ENOTBLK) {
+ /*
+ * The remaining part of the request will be
+ * be handled by buffered I/O when we return
+ */
+ retval = 0;
+ }
+ /*
+ * There may be some unwritten disk at the end of a part-written
+ * fs-block-sized block. Go zero that now.
+ */
+ dio_zero_block(dio, &sdio, 1, &map_bh);
+
+ if (sdio.cur_page) {
+ ssize_t ret2;
+
+ ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
+ if (retval == 0)
+ retval = ret2;
+ page_cache_release(sdio.cur_page);
+ sdio.cur_page = NULL;
+ }
+ if (sdio.bio)
+ dio_bio_submit(dio, &sdio);
+
+ /*
+ * It is possible that, we return short IO due to end of file.
+ * In that case, we need to release all the pages we got hold on.
+ */
+ dio_cleanup(dio, &sdio);
+
+ /*
+ * All block lookups have been performed. For READ requests
+ * we can let i_mutex go now that its achieved its purpose
+ * of protecting us from looking up uninitialized blocks.
+ */
+ if (rw == READ && (dio->flags & DIO_LOCKING))
+ mutex_unlock(&dio->inode->i_mutex);
+
+ /*
+ * The only time we want to leave bios in flight is when a successful
+ * partial aio read or full aio write have been setup. In that case
+ * bio completion will call aio_complete. The only time it's safe to
+ * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
+ * This had *better* be the only place that raises -EIOCBQUEUED.
+ */
+ BUG_ON(retval == -EIOCBQUEUED);
+ if (dio->is_async && retval == 0 && dio->result &&
+ ((rw & READ) || (dio->result == sdio.size)))
+ retval = -EIOCBQUEUED;
+
+ if (retval != -EIOCBQUEUED)
+ dio_await_completion(dio);
+
+ if (drop_refcount(dio) == 0) {
+ retval = dio_complete(dio, offset, retval, false);
+ kmem_cache_free(dio_cache, dio);
+ } else
+ BUG_ON(retval != -EIOCBQUEUED);
out:
return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);
+
+static __init int dio_init(void)
+{
+ dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
+ return 0;
+}
+module_init(dio_init)
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 58609bde3b9..2a834255c75 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -967,7 +967,7 @@ static void ecryptfs_set_default_crypt_stat_vals(
/**
* ecryptfs_new_file_context
- * @ecryptfs_dentry: The eCryptfs dentry
+ * @ecryptfs_inode: The eCryptfs inode
*
* If the crypto context for the file has not yet been established,
* this is where we do that. Establishing a new crypto context
@@ -984,13 +984,13 @@ static void ecryptfs_set_default_crypt_stat_vals(
*
* Returns zero on success; non-zero otherwise
*/
-int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry)
+int ecryptfs_new_file_context(struct inode *ecryptfs_inode)
{
struct ecryptfs_crypt_stat *crypt_stat =
- &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
+ &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
&ecryptfs_superblock_to_private(
- ecryptfs_dentry->d_sb)->mount_crypt_stat;
+ ecryptfs_inode->i_sb)->mount_crypt_stat;
int cipher_name_len;
int rc = 0;
@@ -1299,12 +1299,12 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
}
static int
-ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry,
+ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode,
char *virt, size_t virt_len)
{
int rc;
- rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt,
+ rc = ecryptfs_write_lower(ecryptfs_inode, virt,
0, virt_len);
if (rc < 0)
printk(KERN_ERR "%s: Error attempting to write header "
@@ -1338,7 +1338,8 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
/**
* ecryptfs_write_metadata
- * @ecryptfs_dentry: The eCryptfs dentry
+ * @ecryptfs_dentry: The eCryptfs dentry, which should be negative
+ * @ecryptfs_inode: The newly created eCryptfs inode
*
* Write the file headers out. This will likely involve a userspace
* callout, in which the session key is encrypted with one or more
@@ -1348,10 +1349,11 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
*
* Returns zero on success; non-zero on error
*/
-int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
+int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
+ struct inode *ecryptfs_inode)
{
struct ecryptfs_crypt_stat *crypt_stat =
- &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
+ &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
unsigned int order;
char *virt;
size_t virt_len;
@@ -1391,7 +1393,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
size);
else
- rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt,
+ rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt,
virt_len);
if (rc) {
printk(KERN_ERR "%s: Error writing metadata out to lower file; "
@@ -1943,7 +1945,7 @@ static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
/* We could either offset on every reverse map or just pad some 0x00's
* at the front here */
-static const unsigned char filename_rev_map[] = {
+static const unsigned char filename_rev_map[256] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
@@ -1959,7 +1961,7 @@ static const unsigned char filename_rev_map[] = {
0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
- 0x3D, 0x3E, 0x3F
+ 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */
};
/**
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index b36c5572b3f..a9f29b12fbf 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -514,7 +514,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
#define ecryptfs_printk(type, fmt, arg...) \
__ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
-__attribute__ ((format(printf, 1, 2)))
+__printf(1, 2)
void __ecryptfs_printk(const char *fmt, ...);
extern const struct file_operations ecryptfs_main_fops;
@@ -584,9 +584,10 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat);
int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode);
int ecryptfs_encrypt_page(struct page *page);
int ecryptfs_decrypt_page(struct page *page);
-int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry);
+int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
+ struct inode *ecryptfs_inode);
int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
-int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
+int ecryptfs_new_file_context(struct inode *ecryptfs_inode);
void ecryptfs_write_crypt_stat_flags(char *page_virt,
struct ecryptfs_crypt_stat *crypt_stat,
size_t *written);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index c6ac98cf9ba..d3f95f941c4 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -139,6 +139,27 @@ out:
return rc;
}
+static void ecryptfs_vma_close(struct vm_area_struct *vma)
+{
+ filemap_write_and_wait(vma->vm_file->f_mapping);
+}
+
+static const struct vm_operations_struct ecryptfs_file_vm_ops = {
+ .close = ecryptfs_vma_close,
+ .fault = filemap_fault,
+};
+
+static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ int rc;
+
+ rc = generic_file_mmap(file, vma);
+ if (!rc)
+ vma->vm_ops = &ecryptfs_file_vm_ops;
+
+ return rc;
+}
+
struct kmem_cache *ecryptfs_file_info_cache;
/**
@@ -349,7 +370,7 @@ const struct file_operations ecryptfs_main_fops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ecryptfs_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ecryptfs_file_mmap,
.open = ecryptfs_open,
.flush = ecryptfs_flush,
.release = ecryptfs_release,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 11f8582d721..32f90a3ae63 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -172,22 +172,23 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
* it. It will also update the eCryptfs directory inode to mimic the
* stat of the lower directory inode.
*
- * Returns zero on success; non-zero on error condition
+ * Returns the new eCryptfs inode on success; an ERR_PTR on error condition
*/
-static int
+static struct inode *
ecryptfs_do_create(struct inode *directory_inode,
struct dentry *ecryptfs_dentry, int mode)
{
int rc;
struct dentry *lower_dentry;
struct dentry *lower_dir_dentry;
+ struct inode *inode;
lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
lower_dir_dentry = lock_parent(lower_dentry);
if (IS_ERR(lower_dir_dentry)) {
ecryptfs_printk(KERN_ERR, "Error locking directory of "
"dentry\n");
- rc = PTR_ERR(lower_dir_dentry);
+ inode = ERR_CAST(lower_dir_dentry);
goto out;
}
rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode,
@@ -195,20 +196,19 @@ ecryptfs_do_create(struct inode *directory_inode,
if (rc) {
printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
"rc = [%d]\n", __func__, rc);
+ inode = ERR_PTR(rc);
goto out_lock;
}
- rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
- directory_inode->i_sb);
- if (rc) {
- ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
+ inode = __ecryptfs_get_inode(lower_dentry->d_inode,
+ directory_inode->i_sb);
+ if (IS_ERR(inode))
goto out_lock;
- }
fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode);
fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode);
out_lock:
unlock_dir(lower_dir_dentry);
out:
- return rc;
+ return inode;
}
/**
@@ -219,26 +219,26 @@ out:
*
* Returns zero on success
*/
-static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
+static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
+ struct inode *ecryptfs_inode)
{
struct ecryptfs_crypt_stat *crypt_stat =
- &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
+ &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
int rc = 0;
- if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
+ if (S_ISDIR(ecryptfs_inode->i_mode)) {
ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
goto out;
}
ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n");
- rc = ecryptfs_new_file_context(ecryptfs_dentry);
+ rc = ecryptfs_new_file_context(ecryptfs_inode);
if (rc) {
ecryptfs_printk(KERN_ERR, "Error creating new file "
"context; rc = [%d]\n", rc);
goto out;
}
- rc = ecryptfs_get_lower_file(ecryptfs_dentry,
- ecryptfs_dentry->d_inode);
+ rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode);
if (rc) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
@@ -246,10 +246,10 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
ecryptfs_dentry->d_name.name, rc);
goto out;
}
- rc = ecryptfs_write_metadata(ecryptfs_dentry);
+ rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode);
if (rc)
printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc);
- ecryptfs_put_lower_file(ecryptfs_dentry->d_inode);
+ ecryptfs_put_lower_file(ecryptfs_inode);
out:
return rc;
}
@@ -269,18 +269,28 @@ static int
ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
int mode, struct nameidata *nd)
{
+ struct inode *ecryptfs_inode;
int rc;
- /* ecryptfs_do_create() calls ecryptfs_interpose() */
- rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode);
- if (unlikely(rc)) {
+ ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry,
+ mode);
+ if (unlikely(IS_ERR(ecryptfs_inode))) {
ecryptfs_printk(KERN_WARNING, "Failed to create file in"
"lower filesystem\n");
+ rc = PTR_ERR(ecryptfs_inode);
goto out;
}
/* At this point, a file exists on "disk"; we need to make sure
* that this on disk file is prepared to be an ecryptfs file */
- rc = ecryptfs_initialize_file(ecryptfs_dentry);
+ rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode);
+ if (rc) {
+ drop_nlink(ecryptfs_inode);
+ unlock_new_inode(ecryptfs_inode);
+ iput(ecryptfs_inode);
+ goto out;
+ }
+ d_instantiate(ecryptfs_dentry, ecryptfs_inode);
+ unlock_new_inode(ecryptfs_inode);
out:
return rc;
}
@@ -474,8 +484,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
goto out_lock;
fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
- old_dentry->d_inode->i_nlink =
- ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
+ set_nlink(old_dentry->d_inode,
+ ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink);
i_size_write(new_dentry->d_inode, file_size_save);
out_lock:
unlock_dir(lower_dir_dentry);
@@ -499,8 +509,8 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
goto out_unlock;
}
fsstack_copy_attr_times(dir, lower_dir_inode);
- dentry->d_inode->i_nlink =
- ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink;
+ set_nlink(dentry->d_inode,
+ ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink);
dentry->d_inode->i_ctime = dir->i_ctime;
d_drop(dentry);
out_unlock:
@@ -565,7 +575,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
goto out;
fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
- dir->i_nlink = lower_dir_dentry->d_inode->i_nlink;
+ set_nlink(dir, lower_dir_dentry->d_inode->i_nlink);
out:
unlock_dir(lower_dir_dentry);
if (!dentry->d_inode)
@@ -588,7 +598,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
if (!rc && dentry->d_inode)
clear_nlink(dentry->d_inode);
fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
- dir->i_nlink = lower_dir_dentry->d_inode->i_nlink;
+ set_nlink(dir, lower_dir_dentry->d_inode->i_nlink);
unlock_dir(lower_dir_dentry);
if (!rc)
d_drop(dentry);
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 9c13412e6c9..bc84f365d75 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -96,7 +96,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
efs_inode = (struct efs_dinode *) (bh->b_data + offset);
inode->i_mode = be16_to_cpu(efs_inode->di_mode);
- inode->i_nlink = be16_to_cpu(efs_inode->di_nlink);
+ set_nlink(inode, be16_to_cpu(efs_inode->di_nlink));
inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid);
inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid);
inode->i_size = be32_to_cpu(efs_inode->di_size);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index fe047d966dc..828e750af23 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -70,6 +70,15 @@
* simultaneous inserts (A into B and B into A) from racing and
* constructing a cycle without either insert observing that it is
* going to.
+ * It is necessary to acquire multiple "ep->mtx"es at once in the
+ * case when one epoll fd is added to another. In this case, we
+ * always acquire the locks in the order of nesting (i.e. after
+ * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
+ * before e2->mtx). Since we disallow cycles of epoll file
+ * descriptors, this ensures that the mutexes are well-ordered. In
+ * order to communicate this nesting to lockdep, when walking a tree
+ * of epoll file descriptors, we use the current recursion depth as
+ * the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
* mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
@@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
* @ep: Pointer to the epoll private data structure.
* @sproc: Pointer to the scan callback.
* @priv: Private opaque data passed to the @sproc callback.
+ * @depth: The current depth of recursive f_op->poll calls.
*
* Returns: The same integer error code returned by the @sproc callback.
*/
static int ep_scan_ready_list(struct eventpoll *ep,
int (*sproc)(struct eventpoll *,
struct list_head *, void *),
- void *priv)
+ void *priv,
+ int depth)
{
int error, pwake = 0;
unsigned long flags;
@@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
* We need to lock this because we could be hit by
* eventpoll_release_file() and epoll_ctl().
*/
- mutex_lock(&ep->mtx);
+ mutex_lock_nested(&ep->mtx, depth);
/*
* Steal the ready list, and re-init the original one to the
@@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
{
- return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
+ return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
}
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
@@ -700,7 +711,7 @@ static const struct file_operations eventpoll_fops = {
.llseek = noop_llseek,
};
-/* Fast test to see if the file is an evenpoll file */
+/* Fast test to see if the file is an eventpoll file */
static inline int is_file_epoll(struct file *f)
{
return f->f_op == &eventpoll_fops;
@@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file)
ep = epi->ep;
list_del_init(&epi->fllink);
- mutex_lock(&ep->mtx);
+ mutex_lock_nested(&ep->mtx, 0);
ep_remove(ep, epi);
mutex_unlock(&ep->mtx);
}
@@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep,
esed.maxevents = maxevents;
esed.events = events;
- return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
+ return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
}
static inline struct timespec ep_set_mstimeout(long ms)
@@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
struct rb_node *rbp;
struct epitem *epi;
- mutex_lock(&ep->mtx);
+ mutex_lock_nested(&ep->mtx, call_nests + 1);
for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (unlikely(is_file_epoll(epi->ffd.file))) {
@@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
}
- mutex_lock(&ep->mtx);
+ mutex_lock_nested(&ep->mtx, 0);
/*
* Try to lookup the file inside our RB tree, Since we grabbed "mtx"
diff --git a/fs/exec.c b/fs/exec.c
index 25dcbe5fc35..36254645b7c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -841,10 +841,6 @@ static int exec_mmap(struct mm_struct *mm)
tsk->mm = mm;
tsk->active_mm = mm;
activate_mm(active_mm, mm);
- if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
- atomic_dec(&old_mm->oom_disable_count);
- atomic_inc(&tsk->mm->oom_disable_count);
- }
task_unlock(tsk);
arch_pick_mmap_layout(mm);
if (old_mm) {
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index c5a5855a6c4..352ba149d23 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -13,7 +13,8 @@
#
# ore module library
-obj-$(CONFIG_ORE) += ore.o
+libore-y := ore.o ore_raid.o
+obj-$(CONFIG_ORE) += libore.o
exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index 70bae414929..da42f32c49b 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,10 +1,17 @@
+# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
+# for every ORE user we do it like this. Any user should add itself here
+# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
+# selected here, and we default to "ON". So in effect it is like been
+# selected by any of the users.
config ORE
tristate
+ depends on EXOFS_FS || PNFS_OBJLAYOUT
+ select ASYNC_XOR
+ default SCSI_OSD_ULD
config EXOFS_FS
tristate "exofs: OSD based file system support"
depends on SCSI_OSD_ULD
- select ORE
help
EXOFS is a file system that uses an OSD storage device,
as its backing storage.
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index f4e442ec744..51f4b4c40f0 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -53,6 +53,10 @@
/* u64 has problems with printk this will cast it to unsigned long long */
#define _LLU(x) (unsigned long long)(x)
+struct exofs_dev {
+ struct ore_dev ored;
+ unsigned did;
+};
/*
* our extension to the in-memory superblock
*/
@@ -66,13 +70,9 @@ struct exofs_sb_info {
u32 s_next_generation; /* next gen # to use */
atomic_t s_curr_pending; /* number of pending commands */
- struct pnfs_osd_data_map data_map; /* Default raid to use
- * FIXME: Needed ?
- */
struct ore_layout layout; /* Default files layout */
struct ore_comp one_comp; /* id & cred of partition id=0*/
- struct ore_components comps; /* comps for the partition */
- struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
+ struct ore_components oc; /* comps for the partition */
};
/*
@@ -86,7 +86,7 @@ struct exofs_i_info {
uint32_t i_dir_start_lookup; /* which page to start lookup */
uint64_t i_commit_size; /* the object's written length */
struct ore_comp one_comp; /* same component for all devices */
- struct ore_components comps; /* inode view of the device table */
+ struct ore_components oc; /* inode view of the device table */
};
static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations;
* bigger and that the device table repeats twice.
* See: exofs_read_lookup_dev_table()
*/
-static inline void exofs_init_comps(struct ore_components *comps,
+static inline void exofs_init_comps(struct ore_components *oc,
struct ore_comp *one_comp,
struct exofs_sb_info *sbi, osd_id oid)
{
@@ -217,13 +217,15 @@ static inline void exofs_init_comps(struct ore_components *comps,
one_comp->obj.id = oid;
exofs_make_credential(one_comp->cred, &one_comp->obj);
- comps->numdevs = sbi->comps.numdevs;
- comps->single_comp = EC_SINGLE_COMP;
- comps->comps = one_comp;
+ oc->first_dev = 0;
+ oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
+ sbi->layout.group_count;
+ oc->single_comp = EC_SINGLE_COMP;
+ oc->comps = one_comp;
/* Round robin device view of the table */
- first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs;
- comps->ods = sbi->comps.ods + first_dev;
+ first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
+ oc->ods = &sbi->oc.ods[first_dev];
}
#endif
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f39a38fc234..f6dbf7768ce 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,11 +37,7 @@
#define EXOFS_DBGMSG2(M...) do {} while (0)
-enum { BIO_MAX_PAGES_KMALLOC =
- (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
- MAX_PAGES_KMALLOC =
- PAGE_SIZE / sizeof(struct page *),
-};
+enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), };
unsigned exofs_max_io_pages(struct ore_layout *layout,
unsigned expected_pages)
@@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout,
unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
/* TODO: easily support bio chaining */
- pages = min_t(unsigned, pages,
- layout->group_width * BIO_MAX_PAGES_KMALLOC);
+ pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE);
return pages;
}
@@ -68,6 +63,7 @@ struct page_collect {
bool read_4_write; /* This means two things: that the read is sync
* And the pages should not be unlocked.
*/
+ struct page *that_locked_page;
};
static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -86,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
pcol->length = 0;
pcol->pg_first = -1;
pcol->read_4_write = false;
+ pcol->that_locked_page = NULL;
}
static void _pcol_reset(struct page_collect *pcol)
@@ -98,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol)
pcol->length = 0;
pcol->pg_first = -1;
pcol->ios = NULL;
+ pcol->that_locked_page = NULL;
/* this is probably the end of the loop but in writes
* it might not end here. don't be left with nothing
@@ -149,14 +147,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page,
return 0;
}
+enum {PAGE_WAS_NOT_IN_IO = 17};
static int update_read_page(struct page *page, int ret)
{
- if (ret == 0) {
+ switch (ret) {
+ case 0:
/* Everything is OK */
SetPageUptodate(page);
if (PageError(page))
ClearPageError(page);
- } else if (ret == -EFAULT) {
+ break;
+ case -EFAULT:
/* In this case we were trying to read something that wasn't on
* disk yet - return a page full of zeroes. This should be OK,
* because the object should be empty (if there was a write
@@ -167,16 +168,22 @@ static int update_read_page(struct page *page, int ret)
SetPageUptodate(page);
if (PageError(page))
ClearPageError(page);
- ret = 0; /* recovered error */
EXOFS_DBGMSG("recovered read error\n");
- } else /* Error */
+ /* fall through */
+ case PAGE_WAS_NOT_IN_IO:
+ ret = 0; /* recovered error */
+ break;
+ default:
SetPageError(page);
-
+ }
return ret;
}
static void update_write_page(struct page *page, int ret)
{
+ if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
+ return; /* don't pass start don't collect $200 */
+
if (ret) {
mapping_set_error(page->mapping, ret);
SetPageError(page);
@@ -190,15 +197,16 @@ static void update_write_page(struct page *page, int ret)
static int __readpages_done(struct page_collect *pcol)
{
int i;
- u64 resid;
u64 good_bytes;
u64 length = 0;
- int ret = ore_check_io(pcol->ios, &resid);
+ int ret = ore_check_io(pcol->ios, NULL);
- if (likely(!ret))
+ if (likely(!ret)) {
good_bytes = pcol->length;
- else
- good_bytes = pcol->length - resid;
+ ret = PAGE_WAS_NOT_IN_IO;
+ } else {
+ good_bytes = 0;
+ }
EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
" length=0x%lx nr_pages=%u\n",
@@ -259,6 +267,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
}
}
+static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
+ struct page_collect *pcol_src, struct page_collect *pcol)
+{
+ /* length was wrong or offset was not page aligned */
+ BUG_ON(pcol_src->nr_pages < ios->nr_pages);
+
+ if (pcol_src->nr_pages > ios->nr_pages) {
+ struct page **src_page;
+ unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
+ unsigned long len_less = pcol_src->length - ios->length;
+ unsigned i;
+ int ret;
+
+ /* This IO was trimmed */
+ pcol_src->nr_pages = ios->nr_pages;
+ pcol_src->length = ios->length;
+
+ /* Left over pages are passed to the next io */
+ pcol->expected_pages += pages_less;
+ pcol->nr_pages = pages_less;
+ pcol->length = len_less;
+ src_page = pcol_src->pages + pcol_src->nr_pages;
+ pcol->pg_first = (*src_page)->index;
+
+ ret = pcol_try_alloc(pcol);
+ if (unlikely(ret))
+ return ret;
+
+ for (i = 0; i < pages_less; ++i)
+ pcol->pages[i] = *src_page++;
+
+ EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
+ "pages_less=0x%x expected_pages=0x%x "
+ "next_offset=0x%llx next_len=0x%lx\n",
+ pcol_src->nr_pages, pages_less, pcol->expected_pages,
+ pcol->pg_first * PAGE_SIZE, pcol->length);
+ }
+ return 0;
+}
+
static int read_exec(struct page_collect *pcol)
{
struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -270,7 +318,7 @@ static int read_exec(struct page_collect *pcol)
return 0;
if (!pcol->ios) {
- int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true,
+ int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
pcol->pg_first << PAGE_CACHE_SHIFT,
pcol->length, &pcol->ios);
@@ -280,7 +328,6 @@ static int read_exec(struct page_collect *pcol)
ios = pcol->ios;
ios->pages = pcol->pages;
- ios->nr_pages = pcol->nr_pages;
if (pcol->read_4_write) {
ore_read(pcol->ios);
@@ -296,17 +343,23 @@ static int read_exec(struct page_collect *pcol)
*pcol_copy = *pcol;
ios->done = readpages_done;
ios->private = pcol_copy;
+
+ /* pages ownership was passed to pcol_copy */
+ _pcol_reset(pcol);
+
+ ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+ if (unlikely(ret))
+ goto err;
+
+ EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+ pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
ret = ore_read(ios);
if (unlikely(ret))
goto err;
atomic_inc(&pcol->sbi->s_curr_pending);
- EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
- oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
-
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
return 0;
err:
@@ -341,6 +394,8 @@ static int readpage_strip(void *data, struct page *page)
EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
page->index);
+ pcol->that_locked_page = page;
+
if (page->index < end_index)
len = PAGE_CACHE_SIZE;
else if (page->index == end_index)
@@ -429,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
return ret;
}
+ ret = read_exec(&pcol);
+ if (unlikely(ret))
+ return ret;
+
return read_exec(&pcol);
}
@@ -462,17 +521,18 @@ static void writepages_done(struct ore_io_state *ios, void *p)
{
struct page_collect *pcol = p;
int i;
- u64 resid;
u64 good_bytes;
u64 length = 0;
- int ret = ore_check_io(ios, &resid);
+ int ret = ore_check_io(ios, NULL);
atomic_dec(&pcol->sbi->s_curr_pending);
- if (likely(!ret))
+ if (likely(!ret)) {
good_bytes = pcol->length;
- else
- good_bytes = pcol->length - resid;
+ ret = PAGE_WAS_NOT_IN_IO;
+ } else {
+ good_bytes = 0;
+ }
EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
" length=0x%lx nr_pages=%u\n",
@@ -505,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p)
EXOFS_DBGMSG2("writepages_done END\n");
}
+static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
+{
+ struct page_collect *pcol = priv;
+ pgoff_t index = offset / PAGE_SIZE;
+
+ if (!pcol->that_locked_page ||
+ (pcol->that_locked_page->index != index)) {
+ struct page *page = find_get_page(pcol->inode->i_mapping, index);
+
+ if (!page) {
+ page = find_or_create_page(pcol->inode->i_mapping,
+ index, GFP_NOFS);
+ if (unlikely(!page)) {
+ EXOFS_DBGMSG("grab_cache_page Failed "
+ "index=0x%llx\n", _LLU(index));
+ return NULL;
+ }
+ unlock_page(page);
+ }
+ if (PageDirty(page) || PageWriteback(page))
+ *uptodate = true;
+ else
+ *uptodate = PageUptodate(page);
+ EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate);
+ return page;
+ } else {
+ EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n",
+ pcol->that_locked_page->index);
+ *uptodate = true;
+ return pcol->that_locked_page;
+ }
+}
+
+static void __r4w_put_page(void *priv, struct page *page)
+{
+ struct page_collect *pcol = priv;
+
+ if (pcol->that_locked_page != page) {
+ EXOFS_DBGMSG("index=0x%lx\n", page->index);
+ page_cache_release(page);
+ return;
+ }
+ EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index);
+}
+
+static const struct _ore_r4w_op _r4w_op = {
+ .get_page = &__r4w_get_page,
+ .put_page = &__r4w_put_page,
+};
+
static int write_exec(struct page_collect *pcol)
{
struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -516,10 +626,9 @@ static int write_exec(struct page_collect *pcol)
return 0;
BUG_ON(pcol->ios);
- ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false,
+ ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
pcol->pg_first << PAGE_CACHE_SHIFT,
pcol->length, &pcol->ios);
-
if (unlikely(ret))
goto err;
@@ -534,10 +643,20 @@ static int write_exec(struct page_collect *pcol)
ios = pcol->ios;
ios->pages = pcol_copy->pages;
- ios->nr_pages = pcol_copy->nr_pages;
ios->done = writepages_done;
+ ios->r4w = &_r4w_op;
ios->private = pcol_copy;
+ /* pages ownership was passed to pcol_copy */
+ _pcol_reset(pcol);
+
+ ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+ if (unlikely(ret))
+ goto err;
+
+ EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+ pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
ret = ore_write(ios);
if (unlikely(ret)) {
EXOFS_ERR("write_exec: ore_write() Failed\n");
@@ -545,11 +664,6 @@ static int write_exec(struct page_collect *pcol)
}
atomic_inc(&pcol->sbi->s_curr_pending);
- EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
- pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
- pcol->length);
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
return 0;
err:
@@ -689,14 +803,33 @@ static int exofs_writepages(struct address_space *mapping,
_pcol_init(&pcol, expected_pages, mapping->host);
ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
- if (ret) {
+ if (unlikely(ret)) {
EXOFS_ERR("write_cache_pages => %d\n", ret);
return ret;
}
- return write_exec(&pcol);
+ ret = write_exec(&pcol);
+ if (unlikely(ret))
+ return ret;
+
+ if (wbc->sync_mode == WB_SYNC_ALL) {
+ return write_exec(&pcol); /* pump the last reminder */
+ } else if (pcol.nr_pages) {
+ /* not SYNC let the reminder join the next writeout */
+ unsigned i;
+
+ for (i = 0; i < pcol.nr_pages; i++) {
+ struct page *page = pcol.pages[i];
+
+ end_page_writeback(page);
+ set_page_dirty(page);
+ unlock_page(page);
+ }
+ }
+ return 0;
}
+/*
static int exofs_writepage(struct page *page, struct writeback_control *wbc)
{
struct page_collect pcol;
@@ -712,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
return write_exec(&pcol);
}
-
+*/
/* i_mutex held using inode->i_size directly */
static void _write_failed(struct inode *inode, loff_t to)
{
@@ -818,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset)
const struct address_space_operations exofs_aops = {
.readpage = exofs_readpage,
.readpages = exofs_readpages,
- .writepage = exofs_writepage,
+ .writepage = NULL,
.writepages = exofs_writepages,
.write_begin = exofs_write_begin_export,
.write_end = exofs_write_end,
@@ -860,7 +993,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize);
+ ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
if (likely(!ret))
truncate_setsize(inode, newsize);
@@ -927,14 +1060,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
struct exofs_on_disk_inode_layout *layout;
int ret;
- ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
return ret;
}
- attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
- attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
+ attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
+ attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
ios->in_attr = attrs;
ios->in_attr_len = ARRAY_SIZE(attrs);
@@ -1018,7 +1151,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
return inode;
oi = exofs_i(inode);
__oi_init(oi);
- exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
+ exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
exofs_oi_objno(oi));
/* read the inode from the osd */
@@ -1032,7 +1165,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
inode->i_mode = le16_to_cpu(fcb.i_mode);
inode->i_uid = le32_to_cpu(fcb.i_uid);
inode->i_gid = le32_to_cpu(fcb.i_gid);
- inode->i_nlink = le16_to_cpu(fcb.i_links_count);
+ set_nlink(inode, le16_to_cpu(fcb.i_links_count));
inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
@@ -1172,13 +1305,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
spin_unlock(&sbi->s_next_gen_lock);
insert_inode_hash(inode);
- exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
+ exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
exofs_oi_objno(oi));
exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
mark_inode_dirty(inode);
- ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
return ERR_PTR(ret);
@@ -1267,7 +1400,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
} else
memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
- ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
goto free_args;
@@ -1350,7 +1483,7 @@ void exofs_evict_inode(struct inode *inode)
/* ignore the error, attempt a remove anyway */
/* Now Remove the OSD objects */
- ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
return;
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 25305af8819..d271ad83720 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -23,77 +23,289 @@
*/
#include <linux/slab.h>
+#include <linux/module.h>
#include <asm/div64.h>
+#include <linux/lcm.h>
-#include <scsi/osd_ore.h>
+#include "ore_raid.h"
-#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
+MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
+MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
+MODULE_LICENSE("GPL");
+
+/* ore_verify_layout does a couple of things:
+ * 1. Given a minimum number of needed parameters fixes up the rest of the
+ * members to be operatonals for the ore. The needed parameters are those
+ * that are defined by the pnfs-objects layout STD.
+ * 2. Check to see if the current ore code actually supports these parameters
+ * for example stripe_unit must be a multple of the system PAGE_SIZE,
+ * and etc...
+ * 3. Cache some havily used calculations that will be needed by users.
+ */
+
+enum { BIO_MAX_PAGES_KMALLOC =
+ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
-#ifdef CONFIG_EXOFS_DEBUG
-#define ORE_DBGMSG(fmt, a...) \
- printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define ORE_DBGMSG(fmt, a...) \
- do { if (0) printk(fmt, ##a); } while (0)
-#endif
+int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
+{
+ u64 stripe_length;
+
+ switch (layout->raid_algorithm) {
+ case PNFS_OSD_RAID_0:
+ layout->parity = 0;
+ break;
+ case PNFS_OSD_RAID_5:
+ layout->parity = 1;
+ break;
+ case PNFS_OSD_RAID_PQ:
+ case PNFS_OSD_RAID_4:
+ default:
+ ORE_ERR("Only RAID_0/5 for now\n");
+ return -EINVAL;
+ }
+ if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
+ ORE_ERR("Stripe Unit(0x%llx)"
+ " must be Multples of PAGE_SIZE(0x%lx)\n",
+ _LLU(layout->stripe_unit), PAGE_SIZE);
+ return -EINVAL;
+ }
+ if (layout->group_width) {
+ if (!layout->group_depth) {
+ ORE_ERR("group_depth == 0 && group_width != 0\n");
+ return -EINVAL;
+ }
+ if (total_comps < (layout->group_width * layout->mirrors_p1)) {
+ ORE_ERR("Data Map wrong, "
+ "numdevs=%d < group_width=%d * mirrors=%d\n",
+ total_comps, layout->group_width,
+ layout->mirrors_p1);
+ return -EINVAL;
+ }
+ layout->group_count = total_comps / layout->mirrors_p1 /
+ layout->group_width;
+ } else {
+ if (layout->group_depth) {
+ printk(KERN_NOTICE "Warning: group_depth ignored "
+ "group_width == 0 && group_depth == %lld\n",
+ _LLU(layout->group_depth));
+ }
+ layout->group_width = total_comps / layout->mirrors_p1;
+ layout->group_depth = -1;
+ layout->group_count = 1;
+ }
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
+ stripe_length = (u64)layout->group_width * layout->stripe_unit;
+ if (stripe_length >= (1ULL << 32)) {
+ ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
+ _LLU(stripe_length));
+ return -EINVAL;
+ }
-#define ORE_DBGMSG2(M...) do {} while (0)
-/* #define ORE_DBGMSG2 ORE_DBGMSG */
+ layout->max_io_length =
+ (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
+ layout->group_width;
+ if (layout->parity) {
+ unsigned stripe_length =
+ (layout->group_width - layout->parity) *
+ layout->stripe_unit;
-MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
-MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
-MODULE_LICENSE("GPL");
+ layout->max_io_length /= stripe_length;
+ layout->max_io_length *= stripe_length;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ore_verify_layout);
static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
{
- return ios->comps->comps[index & ios->comps->single_comp].cred;
+ return ios->oc->comps[index & ios->oc->single_comp].cred;
}
static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
{
- return &ios->comps->comps[index & ios->comps->single_comp].obj;
+ return &ios->oc->comps[index & ios->oc->single_comp].obj;
}
static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
{
- return ios->comps->ods[index];
+ ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
+ ios->oc->first_dev, ios->oc->numdevs, index,
+ ios->oc->ods);
+
+ return ore_comp_dev(ios->oc, index);
}
-int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
+int _ore_get_io_state(struct ore_layout *layout,
+ struct ore_components *oc, unsigned numdevs,
+ unsigned sgs_per_dev, unsigned num_par_pages,
+ struct ore_io_state **pios)
+{
+ struct ore_io_state *ios;
+ struct page **pages;
+ struct osd_sg_entry *sgilist;
+ struct __alloc_all_io_state {
+ struct ore_io_state ios;
+ struct ore_per_dev_state per_dev[numdevs];
+ union {
+ struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+ struct page *pages[num_par_pages];
+ };
+ } *_aios;
+
+ if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
+ _aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
+ if (unlikely(!_aios)) {
+ ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
+ sizeof(*_aios));
+ *pios = NULL;
+ return -ENOMEM;
+ }
+ pages = num_par_pages ? _aios->pages : NULL;
+ sgilist = sgs_per_dev ? _aios->sglist : NULL;
+ ios = &_aios->ios;
+ } else {
+ struct __alloc_small_io_state {
+ struct ore_io_state ios;
+ struct ore_per_dev_state per_dev[numdevs];
+ } *_aio_small;
+ union __extra_part {
+ struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+ struct page *pages[num_par_pages];
+ } *extra_part;
+
+ _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
+ if (unlikely(!_aio_small)) {
+ ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
+ sizeof(*_aio_small));
+ *pios = NULL;
+ return -ENOMEM;
+ }
+ extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
+ if (unlikely(!extra_part)) {
+ ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
+ sizeof(*extra_part));
+ kfree(_aio_small);
+ *pios = NULL;
+ return -ENOMEM;
+ }
+
+ pages = num_par_pages ? extra_part->pages : NULL;
+ sgilist = sgs_per_dev ? extra_part->sglist : NULL;
+ /* In this case the per_dev[0].sgilist holds the pointer to
+ * be freed
+ */
+ ios = &_aio_small->ios;
+ ios->extra_part_alloc = true;
+ }
+
+ if (pages) {
+ ios->parity_pages = pages;
+ ios->max_par_pages = num_par_pages;
+ }
+ if (sgilist) {
+ unsigned d;
+
+ for (d = 0; d < numdevs; ++d) {
+ ios->per_dev[d].sglist = sgilist;
+ sgilist += sgs_per_dev;
+ }
+ ios->sgs_per_dev = sgs_per_dev;
+ }
+
+ ios->layout = layout;
+ ios->oc = oc;
+ *pios = ios;
+ return 0;
+}
+
+/* Allocate an io_state for only a single group of devices
+ *
+ * If a user needs to call ore_read/write() this version must be used becase it
+ * allocates extra stuff for striping and raid.
+ * The ore might decide to only IO less then @length bytes do to alignmets
+ * and constrains as follows:
+ * - The IO cannot cross group boundary.
+ * - In raid5/6 The end of the IO must align at end of a stripe eg.
+ * (@offset + @length) % strip_size == 0. Or the complete range is within a
+ * single stripe.
+ * - Memory condition only permitted a shorter IO. (A user can use @length=~0
+ * And check the returned ios->length for max_io_size.)
+ *
+ * The caller must check returned ios->length (and/or ios->nr_pages) and
+ * re-issue these pages that fall outside of ios->length
+ */
+int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
bool is_reading, u64 offset, u64 length,
struct ore_io_state **pios)
{
struct ore_io_state *ios;
+ unsigned numdevs = layout->group_width * layout->mirrors_p1;
+ unsigned sgs_per_dev = 0, max_par_pages = 0;
+ int ret;
- /*TODO: Maybe use kmem_cach per sbi of size
- * exofs_io_state_size(layout->s_numdevs)
- */
- ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL);
- if (unlikely(!ios)) {
- ORE_DBGMSG("Failed kzalloc bytes=%d\n",
- ore_io_state_size(comps->numdevs));
- *pios = NULL;
- return -ENOMEM;
+ if (layout->parity && length) {
+ unsigned data_devs = layout->group_width - layout->parity;
+ unsigned stripe_size = layout->stripe_unit * data_devs;
+ unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+ u32 remainder;
+ u64 num_stripes;
+ u64 num_raid_units;
+
+ num_stripes = div_u64_rem(length, stripe_size, &remainder);
+ if (remainder)
+ ++num_stripes;
+
+ num_raid_units = num_stripes * layout->parity;
+
+ if (is_reading) {
+ /* For reads add per_dev sglist array */
+ /* TODO: Raid 6 we need twice more. Actually:
+ * num_stripes / LCMdP(W,P);
+ * if (W%P != 0) num_stripes *= parity;
+ */
+
+ /* first/last seg is split */
+ num_raid_units += layout->group_width;
+ sgs_per_dev = div_u64(num_raid_units, data_devs);
+ } else {
+ /* For Writes add parity pages array. */
+ max_par_pages = num_raid_units * pages_in_unit *
+ sizeof(struct page *);
+ }
}
- ios->layout = layout;
- ios->comps = comps;
- ios->offset = offset;
- ios->length = length;
+ ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
+ pios);
+ if (unlikely(ret))
+ return ret;
+
+ ios = *pios;
ios->reading = is_reading;
+ ios->offset = offset;
+
+ if (length) {
+ ore_calc_stripe_info(layout, offset, length, &ios->si);
+ ios->length = ios->si.length;
+ ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+ if (layout->parity)
+ _ore_post_alloc_raid_stuff(ios);
+ }
- *pios = ios;
return 0;
}
EXPORT_SYMBOL(ore_get_rw_state);
-int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps,
- struct ore_io_state **ios)
+/* Allocate an io_state for all the devices in the comps array
+ *
+ * This version of io_state allocation is used mostly by create/remove
+ * and trunc where we currently need all the devices. The only wastful
+ * bit is the read/write_attributes with no IO. Those sites should
+ * be converted to use ore_get_rw_state() with length=0
+ */
+int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
+ struct ore_io_state **pios)
{
- return ore_get_rw_state(layout, comps, true, 0, 0, ios);
+ return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
}
EXPORT_SYMBOL(ore_get_io_state);
@@ -111,6 +323,7 @@ void ore_put_io_state(struct ore_io_state *ios)
bio_put(per_dev->bio);
}
+ _ore_free_raid_stuff(ios);
kfree(ios);
}
}
@@ -138,7 +351,7 @@ static void _done_io(struct osd_request *or, void *p)
kref_put(&ios->kref, _last_io);
}
-static int ore_io_execute(struct ore_io_state *ios)
+int ore_io_execute(struct ore_io_state *ios)
{
DECLARE_COMPLETION_ONSTACK(wait);
bool sync = (ios->done == NULL);
@@ -198,7 +411,7 @@ static void _clear_bio(struct bio *bio)
}
}
-int ore_check_io(struct ore_io_state *ios, u64 *resid)
+int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
{
enum osd_err_priority acumulated_osd_err = 0;
int acumulated_lin_err = 0;
@@ -206,7 +419,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)
for (i = 0; i < ios->numdevs; i++) {
struct osd_sense_info osi;
- struct osd_request *or = ios->per_dev[i].or;
+ struct ore_per_dev_state *per_dev = &ios->per_dev[i];
+ struct osd_request *or = per_dev->or;
int ret;
if (unlikely(!or))
@@ -218,29 +432,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)
if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
/* start read offset passed endof file */
- _clear_bio(ios->per_dev[i].bio);
+ _clear_bio(per_dev->bio);
ORE_DBGMSG("start read offset passed end of file "
"offset=0x%llx, length=0x%llx\n",
- _LLU(ios->per_dev[i].offset),
- _LLU(ios->per_dev[i].length));
+ _LLU(per_dev->offset),
+ _LLU(per_dev->length));
continue; /* we recovered */
}
+ if (on_dev_error) {
+ u64 residual = ios->reading ?
+ or->in.residual : or->out.residual;
+ u64 offset = (ios->offset + ios->length) - residual;
+ struct ore_dev *od = ios->oc->ods[
+ per_dev->dev - ios->oc->first_dev];
+
+ on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri,
+ offset, residual);
+ }
if (osi.osd_err_pri >= acumulated_osd_err) {
acumulated_osd_err = osi.osd_err_pri;
acumulated_lin_err = ret;
}
}
- /* TODO: raid specific residual calculations */
- if (resid) {
- if (likely(!acumulated_lin_err))
- *resid = 0;
- else
- *resid = ios->length;
- }
-
return acumulated_lin_err;
}
EXPORT_SYMBOL(ore_check_io);
@@ -248,61 +464,65 @@ EXPORT_SYMBOL(ore_check_io);
/*
* L - logical offset into the file
*
- * U - The number of bytes in a stripe within a group
+ * D - number of Data devices
+ * D = group_width - parity
*
- * U = stripe_unit * group_width
+ * U - The number of bytes in a stripe within a group
+ * U = stripe_unit * D
*
* T - The number of bytes striped within a group of component objects
* (before advancing to the next group)
- *
- * T = stripe_unit * group_width * group_depth
+ * T = U * group_depth
*
* S - The number of bytes striped across all component objects
* before the pattern repeats
+ * S = T * group_count
*
- * S = stripe_unit * group_width * group_depth * group_count
- *
- * M - The "major" (i.e., across all components) stripe number
- *
+ * M - The "major" (i.e., across all components) cycle number
* M = L / S
*
- * G - Counts the groups from the beginning of the major stripe
- *
+ * G - Counts the groups from the beginning of the major cycle
* G = (L - (M * S)) / T [or (L % S) / T]
*
* H - The byte offset within the group
- *
* H = (L - (M * S)) % T [or (L % S) % T]
*
* N - The "minor" (i.e., across the group) stripe number
- *
* N = H / U
*
* C - The component index coresponding to L
*
- * C = (H - (N * U)) / stripe_unit + G * group_width
- * [or (L % U) / stripe_unit + G * group_width]
+ * C = (H - (N * U)) / stripe_unit + G * D
+ * [or (L % U) / stripe_unit + G * D]
*
* O - The component offset coresponding to L
- *
* O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
+ *
+ * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
+ * divide by parity
+ * LCMdP = lcm(group_width, parity) / parity
+ *
+ * R - The parity Rotation stripe
+ * (Note parity cycle always starts at a group's boundary)
+ * R = N % LCMdP
+ *
+ * I = the first parity device index
+ * I = (group_width + group_width - R*parity - parity) % group_width
+ *
+ * Craid - The component index Rotated
+ * Craid = (group_width + C - R*parity) % group_width
+ * (We add the group_width to avoid negative numbers modulo math)
*/
-struct _striping_info {
- u64 obj_offset;
- u64 group_length;
- u64 M; /* for truncate */
- unsigned dev;
- unsigned unit_off;
-};
-
-static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
- struct _striping_info *si)
+void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+ u64 length, struct ore_striping_info *si)
{
u32 stripe_unit = layout->stripe_unit;
u32 group_width = layout->group_width;
u64 group_depth = layout->group_depth;
+ u32 parity = layout->parity;
- u32 U = stripe_unit * group_width;
+ u32 D = group_width - parity;
+ u32 U = D * stripe_unit;
u64 T = U * group_depth;
u64 S = T * layout->group_count;
u64 M = div64_u64(file_offset, S);
@@ -318,39 +538,65 @@ static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
u32 N = div_u64(H, U);
/* "H - (N * U)" is just "H % U" so it's bound to u32 */
- si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
- si->dev *= layout->mirrors_p1;
+ u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
div_u64_rem(file_offset, stripe_unit, &si->unit_off);
si->obj_offset = si->unit_off + (N * stripe_unit) +
(M * group_depth * stripe_unit);
- si->group_length = T - H;
+ if (parity) {
+ u32 LCMdP = lcm(group_width, parity) / parity;
+ /* R = N % LCMdP; */
+ u32 RxP = (N % LCMdP) * parity;
+ u32 first_dev = C - C % group_width;
+
+ si->par_dev = (group_width + group_width - parity - RxP) %
+ group_width + first_dev;
+ si->dev = (group_width + C - RxP) % group_width + first_dev;
+ si->bytes_in_stripe = U;
+ si->first_stripe_start = M * S + G * T + N * U;
+ } else {
+ /* Make the math correct see _prepare_one_group */
+ si->par_dev = group_width;
+ si->dev = C;
+ }
+
+ si->dev *= layout->mirrors_p1;
+ si->par_dev *= layout->mirrors_p1;
+ si->offset = file_offset;
+ si->length = T - H;
+ if (si->length > length)
+ si->length = length;
si->M = M;
}
+EXPORT_SYMBOL(ore_calc_stripe_info);
-static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
- unsigned pgbase, struct ore_per_dev_state *per_dev,
- int cur_len)
+int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
+ unsigned pgbase, struct page **pages,
+ struct ore_per_dev_state *per_dev, int cur_len)
{
unsigned pg = *cur_pg;
struct request_queue *q =
osd_request_queue(_ios_od(ios, per_dev->dev));
-
- per_dev->length += cur_len;
+ unsigned len = cur_len;
+ int ret;
if (per_dev->bio == NULL) {
unsigned pages_in_stripe = ios->layout->group_width *
(ios->layout->stripe_unit / PAGE_SIZE);
- unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
- ios->layout->group_width;
+ unsigned nr_pages = ios->nr_pages * ios->layout->group_width /
+ (ios->layout->group_width -
+ ios->layout->parity);
+ unsigned bio_size = (nr_pages + pages_in_stripe) /
+ ios->layout->group_width;
per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
if (unlikely(!per_dev->bio)) {
ORE_DBGMSG("Failed to allocate BIO size=%u\n",
bio_size);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
}
@@ -358,64 +604,90 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
unsigned added_len;
- BUG_ON(ios->nr_pages <= pg);
cur_len -= pglen;
- added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
+ added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
pglen, pgbase);
- if (unlikely(pglen != added_len))
- return -ENOMEM;
+ if (unlikely(pglen != added_len)) {
+ ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n",
+ per_dev->bio->bi_vcnt);
+ ret = -ENOMEM;
+ goto out;
+ }
+ _add_stripe_page(ios->sp2d, &ios->si, pages[pg]);
+
pgbase = 0;
++pg;
}
BUG_ON(cur_len);
+ per_dev->length += len;
*cur_pg = pg;
- return 0;
+ ret = 0;
+out: /* we fail the complete unit on an error eg don't advance
+ * per_dev->length and cur_pg. This means that we might have a bigger
+ * bio than the CDB requested length (per_dev->length). That's fine
+ * only the oposite is fatal.
+ */
+ return ret;
}
-static int _prepare_one_group(struct ore_io_state *ios, u64 length,
- struct _striping_info *si)
+static int _prepare_for_striping(struct ore_io_state *ios)
{
+ struct ore_striping_info *si = &ios->si;
unsigned stripe_unit = ios->layout->stripe_unit;
unsigned mirrors_p1 = ios->layout->mirrors_p1;
- unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
+ unsigned group_width = ios->layout->group_width;
+ unsigned devs_in_group = group_width * mirrors_p1;
unsigned dev = si->dev;
unsigned first_dev = dev - (dev % devs_in_group);
- unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+ unsigned dev_order;
unsigned cur_pg = ios->pages_consumed;
+ u64 length = ios->length;
int ret = 0;
+ if (!ios->pages) {
+ ios->numdevs = ios->layout->mirrors_p1;
+ return 0;
+ }
+
+ BUG_ON(length > si->length);
+
+ dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
+ si->cur_comp = dev_order;
+ si->cur_pg = si->unit_off / PAGE_SIZE;
+
while (length) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[dev];
+ unsigned comp = dev - first_dev;
+ struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
unsigned cur_len, page_off = 0;
if (!per_dev->length) {
per_dev->dev = dev;
- if (dev < si->dev) {
- per_dev->offset = si->obj_offset + stripe_unit -
- si->unit_off;
- cur_len = stripe_unit;
- } else if (dev == si->dev) {
+ if (dev == si->dev) {
+ WARN_ON(dev == si->par_dev);
per_dev->offset = si->obj_offset;
cur_len = stripe_unit - si->unit_off;
page_off = si->unit_off & ~PAGE_MASK;
BUG_ON(page_off && (page_off != ios->pgbase));
- } else { /* dev > si->dev */
- per_dev->offset = si->obj_offset - si->unit_off;
+ } else {
+ if (si->cur_comp > dev_order)
+ per_dev->offset =
+ si->obj_offset - si->unit_off;
+ else /* si->cur_comp < dev_order */
+ per_dev->offset =
+ si->obj_offset + stripe_unit -
+ si->unit_off;
cur_len = stripe_unit;
}
-
- if (max_comp < dev)
- max_comp = dev;
} else {
cur_len = stripe_unit;
}
if (cur_len >= length)
cur_len = length;
- ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
- cur_len);
+ ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
+ per_dev, cur_len);
if (unlikely(ret))
goto out;
@@ -423,60 +695,60 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
dev = (dev % devs_in_group) + first_dev;
length -= cur_len;
- }
-out:
- ios->numdevs = max_comp + mirrors_p1;
- ios->pages_consumed = cur_pg;
- return ret;
-}
-
-static int _prepare_for_striping(struct ore_io_state *ios)
-{
- u64 length = ios->length;
- u64 offset = ios->offset;
- struct _striping_info si;
- int ret = 0;
- if (!ios->pages) {
- if (ios->kern_buff) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[0];
+ si->cur_comp = (si->cur_comp + 1) % group_width;
+ if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
+ if (!length && ios->sp2d) {
+ /* If we are writing and this is the very last
+ * stripe. then operate on parity dev.
+ */
+ dev = si->par_dev;
+ }
+ if (ios->sp2d)
+ /* In writes cur_len just means if it's the
+ * last one. See _ore_add_parity_unit.
+ */
+ cur_len = length;
+ per_dev = &ios->per_dev[dev - first_dev];
+ if (!per_dev->length) {
+ /* Only/always the parity unit of the first
+ * stripe will be empty. So this is a chance to
+ * initialize the per_dev info.
+ */
+ per_dev->dev = dev;
+ per_dev->offset = si->obj_offset - si->unit_off;
+ }
- _calc_stripe_info(ios->layout, ios->offset, &si);
- per_dev->offset = si.obj_offset;
- per_dev->dev = si.dev;
+ ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
+ if (unlikely(ret))
+ goto out;
- /* no cross device without page array */
- BUG_ON((ios->layout->group_width > 1) &&
- (si.unit_off + ios->length >
- ios->layout->stripe_unit));
+ /* Rotate next par_dev backwards with wraping */
+ si->par_dev = (devs_in_group + si->par_dev -
+ ios->layout->parity * mirrors_p1) %
+ devs_in_group + first_dev;
+ /* Next stripe, start fresh */
+ si->cur_comp = 0;
+ si->cur_pg = 0;
}
- ios->numdevs = ios->layout->mirrors_p1;
- return 0;
- }
-
- while (length) {
- _calc_stripe_info(ios->layout, offset, &si);
-
- if (length < si.group_length)
- si.group_length = length;
-
- ret = _prepare_one_group(ios, si.group_length, &si);
- if (unlikely(ret))
- goto out;
-
- offset += si.group_length;
- length -= si.group_length;
}
-
out:
- return ret;
+ ios->numdevs = devs_in_group;
+ ios->pages_consumed = cur_pg;
+ if (unlikely(ret)) {
+ if (length == ios->length)
+ return ret;
+ else
+ ios->length -= length;
+ }
+ return 0;
}
int ore_create(struct ore_io_state *ios)
{
int i, ret;
- for (i = 0; i < ios->comps->numdevs; i++) {
+ for (i = 0; i < ios->oc->numdevs; i++) {
struct osd_request *or;
or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -501,7 +773,7 @@ int ore_remove(struct ore_io_state *ios)
{
int i, ret;
- for (i = 0; i < ios->comps->numdevs; i++) {
+ for (i = 0; i < ios->oc->numdevs; i++) {
struct osd_request *or;
or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -543,7 +815,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
goto out;
}
per_dev->or = or;
- per_dev->offset = master_dev->offset;
if (ios->pages) {
struct bio *bio;
@@ -562,6 +833,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
__bio_clone(bio, master_dev->bio);
bio->bi_bdev = NULL;
bio->bi_next = NULL;
+ per_dev->offset = master_dev->offset;
per_dev->length = master_dev->length;
per_dev->bio = bio;
per_dev->dev = dev;
@@ -579,7 +851,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
_LLU(per_dev->offset),
_LLU(per_dev->length), dev);
} else if (ios->kern_buff) {
- ret = osd_req_write_kern(or, _ios_obj(ios, dev),
+ per_dev->offset = ios->si.obj_offset;
+ per_dev->dev = ios->si.dev + dev;
+
+ /* no cross device without page array */
+ BUG_ON((ios->layout->group_width > 1) &&
+ (ios->si.unit_off + ios->length >
+ ios->layout->stripe_unit));
+
+ ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev),
per_dev->offset,
ios->kern_buff, ios->length);
if (unlikely(ret))
@@ -588,7 +868,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
"length=0x%llx dev=%d\n",
_LLU(_ios_obj(ios, dev)->id),
_LLU(per_dev->offset),
- _LLU(ios->length), dev);
+ _LLU(ios->length), per_dev->dev);
} else {
osd_req_set_attributes(or, _ios_obj(ios, dev));
ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
@@ -614,6 +894,14 @@ int ore_write(struct ore_io_state *ios)
int i;
int ret;
+ if (unlikely(ios->sp2d && !ios->r4w)) {
+ /* A library is attempting a RAID-write without providing
+ * a pages lock interface.
+ */
+ WARN_ON_ONCE(1);
+ return -ENOTSUPP;
+ }
+
ret = _prepare_for_striping(ios);
if (unlikely(ret))
return ret;
@@ -629,7 +917,7 @@ int ore_write(struct ore_io_state *ios)
}
EXPORT_SYMBOL(ore_write);
-static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
{
struct osd_request *or;
struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
@@ -648,22 +936,27 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
per_dev->or = or;
if (ios->pages) {
- osd_req_read(or, obj, per_dev->offset,
- per_dev->bio, per_dev->length);
+ if (per_dev->cur_sg) {
+ /* finalize the last sg_entry */
+ _ore_add_sg_seg(per_dev, 0, false);
+ if (unlikely(!per_dev->cur_sg))
+ return 0; /* Skip parity only device */
+
+ osd_req_read_sg(or, obj, per_dev->bio,
+ per_dev->sglist, per_dev->cur_sg);
+ } else {
+ /* The no raid case */
+ osd_req_read(or, obj, per_dev->offset,
+ per_dev->bio, per_dev->length);
+ }
+
ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
- " dev=%d\n", _LLU(obj->id),
+ " dev=%d sg_len=%d\n", _LLU(obj->id),
_LLU(per_dev->offset), _LLU(per_dev->length),
- first_dev);
- } else if (ios->kern_buff) {
- int ret = osd_req_read_kern(or, obj, per_dev->offset,
- ios->kern_buff, ios->length);
- ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%d ret=>%d\n",
- _LLU(obj->id), _LLU(per_dev->offset),
- _LLU(ios->length), first_dev, ret);
- if (unlikely(ret))
- return ret;
+ first_dev, per_dev->cur_sg);
} else {
+ BUG_ON(ios->kern_buff);
+
osd_req_get_attributes(or, obj);
ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
_LLU(obj->id),
@@ -688,7 +981,7 @@ int ore_read(struct ore_io_state *ios)
return ret;
for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- ret = _read_mirror(ios, i);
+ ret = _ore_read_mirror(ios, i);
if (unlikely(ret))
return ret;
}
@@ -744,31 +1037,29 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
}
struct _trunc_info {
- struct _striping_info si;
+ struct ore_striping_info si;
u64 prev_group_obj_off;
u64 next_group_obj_off;
unsigned first_group_dev;
unsigned nex_group_dev;
- unsigned max_devs;
};
-void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
- struct _trunc_info *ti)
+static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
+ struct _trunc_info *ti)
{
unsigned stripe_unit = layout->stripe_unit;
- _calc_stripe_info(layout, file_offset, &ti->si);
+ ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
ti->prev_group_obj_off = ti->si.M * stripe_unit;
ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
ti->nex_group_dev = ti->first_group_dev + layout->group_width;
- ti->max_devs = layout->group_width * layout->group_count;
}
-int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
+int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
u64 size)
{
struct ore_io_state *ios;
@@ -779,22 +1070,22 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
struct _trunc_info ti;
int i, ret;
- ret = ore_get_io_state(layout, comps, &ios);
+ ret = ore_get_io_state(layout, oc, &ios);
if (unlikely(ret))
return ret;
_calc_trunk_info(ios->layout, size, &ti);
- size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs),
+ size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
GFP_KERNEL);
if (unlikely(!size_attrs)) {
ret = -ENOMEM;
goto out;
}
- ios->numdevs = ios->comps->numdevs;
+ ios->numdevs = ios->oc->numdevs;
- for (i = 0; i < ti.max_devs; ++i) {
+ for (i = 0; i < ios->numdevs; ++i) {
struct exofs_trunc_attr *size_attr = &size_attrs[i];
u64 obj_size;
@@ -815,7 +1106,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
size_attr->attr.val_ptr = &size_attr->newsize;
ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
- _LLU(comps->comps->obj.id), _LLU(obj_size), i);
+ _LLU(oc->comps->obj.id), _LLU(obj_size), i);
ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
&size_attr->attr);
if (unlikely(ret))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
new file mode 100644
index 00000000000..29c47e5c4a8
--- /dev/null
+++ b/fs/exofs/ore_raid.c
@@ -0,0 +1,660 @@
+/*
+ * Copyright (C) 2011
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ * "Free Software Foundation <info@fsf.org>"
+ */
+
+#include <linux/gfp.h>
+#include <linux/async_tx.h>
+
+#include "ore_raid.h"
+
+#undef ORE_DBGMSG2
+#define ORE_DBGMSG2 ORE_DBGMSG
+
+struct page *_raid_page_alloc(void)
+{
+ return alloc_page(GFP_KERNEL);
+}
+
+void _raid_page_free(struct page *p)
+{
+ __free_page(p);
+}
+
+/* This struct is forward declare in ore_io_state, but is private to here.
+ * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
+ *
+ * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
+ * Ascending page index access is sp2d(p-minor, c-major). But storage is
+ * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor
+ * API.
+ */
+struct __stripe_pages_2d {
+ /* Cache some hot path repeated calculations */
+ unsigned parity;
+ unsigned data_devs;
+ unsigned pages_in_unit;
+
+ bool needed ;
+
+ /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
+ struct __1_page_stripe {
+ bool alloc;
+ unsigned write_count;
+ struct async_submit_ctl submit;
+ struct dma_async_tx_descriptor *tx;
+
+ /* The size of this array is data_devs + parity */
+ struct page **pages;
+ struct page **scribble;
+ /* bool array, size of this array is data_devs */
+ char *page_is_read;
+ } _1p_stripes[];
+};
+
+/* This can get bigger then a page. So support multiple page allocations
+ * _sp2d_free should be called even if _sp2d_alloc fails (by returning
+ * none-zero).
+ */
+static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
+ unsigned parity, struct __stripe_pages_2d **psp2d)
+{
+ struct __stripe_pages_2d *sp2d;
+ unsigned data_devs = group_width - parity;
+ struct _alloc_all_bytes {
+ struct __alloc_stripe_pages_2d {
+ struct __stripe_pages_2d sp2d;
+ struct __1_page_stripe _1p_stripes[pages_in_unit];
+ } __asp2d;
+ struct __alloc_1p_arrays {
+ struct page *pages[group_width];
+ struct page *scribble[group_width];
+ char page_is_read[data_devs];
+ } __a1pa[pages_in_unit];
+ } *_aab;
+ struct __alloc_1p_arrays *__a1pa;
+ struct __alloc_1p_arrays *__a1pa_end;
+ const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]);
+ unsigned num_a1pa, alloc_size, i;
+
+ /* FIXME: check these numbers in ore_verify_layout */
+ BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE);
+ BUG_ON(sizeof__a1pa > PAGE_SIZE);
+
+ if (sizeof(*_aab) > PAGE_SIZE) {
+ num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa;
+ alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa;
+ } else {
+ num_a1pa = pages_in_unit;
+ alloc_size = sizeof(*_aab);
+ }
+
+ _aab = kzalloc(alloc_size, GFP_KERNEL);
+ if (unlikely(!_aab)) {
+ ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
+ return -ENOMEM;
+ }
+
+ sp2d = &_aab->__asp2d.sp2d;
+ *psp2d = sp2d; /* From here Just call _sp2d_free */
+
+ __a1pa = _aab->__a1pa;
+ __a1pa_end = __a1pa + num_a1pa;
+
+ for (i = 0; i < pages_in_unit; ++i) {
+ if (unlikely(__a1pa >= __a1pa_end)) {
+ num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
+ pages_in_unit - i);
+
+ __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL);
+ if (unlikely(!__a1pa)) {
+ ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
+ num_a1pa);
+ return -ENOMEM;
+ }
+ __a1pa_end = __a1pa + num_a1pa;
+ /* First *pages is marked for kfree of the buffer */
+ sp2d->_1p_stripes[i].alloc = true;
+ }
+
+ sp2d->_1p_stripes[i].pages = __a1pa->pages;
+ sp2d->_1p_stripes[i].scribble = __a1pa->scribble ;
+ sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read;
+ ++__a1pa;
+ }
+
+ sp2d->parity = parity;
+ sp2d->data_devs = data_devs;
+ sp2d->pages_in_unit = pages_in_unit;
+ return 0;
+}
+
+static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
+ const struct _ore_r4w_op *r4w, void *priv)
+{
+ unsigned data_devs = sp2d->data_devs;
+ unsigned group_width = data_devs + sp2d->parity;
+ unsigned p;
+
+ if (!sp2d->needed)
+ return;
+
+ for (p = 0; p < sp2d->pages_in_unit; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if (_1ps->write_count < group_width) {
+ unsigned c;
+
+ for (c = 0; c < data_devs; c++)
+ if (_1ps->page_is_read[c]) {
+ struct page *page = _1ps->pages[c];
+
+ r4w->put_page(priv, page);
+ _1ps->page_is_read[c] = false;
+ }
+ }
+
+ memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
+ _1ps->write_count = 0;
+ _1ps->tx = NULL;
+ }
+
+ sp2d->needed = false;
+}
+
+static void _sp2d_free(struct __stripe_pages_2d *sp2d)
+{
+ unsigned i;
+
+ if (!sp2d)
+ return;
+
+ for (i = 0; i < sp2d->pages_in_unit; ++i) {
+ if (sp2d->_1p_stripes[i].alloc)
+ kfree(sp2d->_1p_stripes[i].pages);
+ }
+
+ kfree(sp2d);
+}
+
+static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
+{
+ unsigned p;
+
+ for (p = 0; p < sp2d->pages_in_unit; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if (_1ps->write_count)
+ return p;
+ }
+
+ return ~0;
+}
+
+static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
+{
+ unsigned p;
+
+ for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if (_1ps->write_count)
+ return p;
+ }
+
+ return ~0;
+}
+
+static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
+{
+ unsigned p;
+ for (p = 0; p < sp2d->pages_in_unit; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if (!_1ps->write_count)
+ continue;
+
+ init_async_submit(&_1ps->submit,
+ ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
+ NULL,
+ NULL, NULL,
+ (addr_conv_t *)_1ps->scribble);
+
+ /* TODO: raid6 */
+ _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages,
+ 0, sp2d->data_devs, PAGE_SIZE,
+ &_1ps->submit);
+ }
+
+ for (p = 0; p < sp2d->pages_in_unit; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+ /* NOTE: We wait for HW synchronously (I don't have such HW
+ * to test with.) Is parallelism needed with today's multi
+ * cores?
+ */
+ async_tx_issue_pending(_1ps->tx);
+ }
+}
+
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+ struct ore_striping_info *si, struct page *page)
+{
+ struct __1_page_stripe *_1ps;
+
+ sp2d->needed = true;
+
+ _1ps = &sp2d->_1p_stripes[si->cur_pg];
+ _1ps->pages[si->cur_comp] = page;
+ ++_1ps->write_count;
+
+ si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit;
+ /* si->cur_comp is advanced outside at main loop */
+}
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+ bool not_last)
+{
+ struct osd_sg_entry *sge;
+
+ ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
+ "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
+ per_dev->dev, cur_len, not_last, per_dev->cur_sg,
+ _LLU(per_dev->offset), per_dev->length,
+ per_dev->last_sgs_total);
+
+ if (!per_dev->cur_sg) {
+ sge = per_dev->sglist;
+
+ /* First time we prepare two entries */
+ if (per_dev->length) {
+ ++per_dev->cur_sg;
+ sge->offset = per_dev->offset;
+ sge->len = per_dev->length;
+ } else {
+ /* Here the parity is the first unit of this object.
+ * This happens every time we reach a parity device on
+ * the same stripe as the per_dev->offset. We need to
+ * just skip this unit.
+ */
+ per_dev->offset += cur_len;
+ return;
+ }
+ } else {
+ /* finalize the last one */
+ sge = &per_dev->sglist[per_dev->cur_sg - 1];
+ sge->len = per_dev->length - per_dev->last_sgs_total;
+ }
+
+ if (not_last) {
+ /* Partly prepare the next one */
+ struct osd_sg_entry *next_sge = sge + 1;
+
+ ++per_dev->cur_sg;
+ next_sge->offset = sge->offset + sge->len + cur_len;
+ /* Save cur len so we know how mutch was added next time */
+ per_dev->last_sgs_total = per_dev->length;
+ next_sge->len = 0;
+ } else if (!sge->len) {
+ /* Optimize for when the last unit is a parity */
+ --per_dev->cur_sg;
+ }
+}
+
+static int _alloc_read_4_write(struct ore_io_state *ios)
+{
+ struct ore_layout *layout = ios->layout;
+ int ret;
+ /* We want to only read those pages not in cache so worst case
+ * is a stripe populated with every other page
+ */
+ unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2;
+
+ ret = _ore_get_io_state(layout, ios->oc,
+ layout->group_width * layout->mirrors_p1,
+ sgs_per_dev, 0, &ios->ios_read_4_write);
+ return ret;
+}
+
+/* @si contains info of the to-be-inserted page. Update of @si should be
+ * maintained by caller. Specificaly si->dev, si->obj_offset, ...
+ */
+static int _add_to_read_4_write(struct ore_io_state *ios,
+ struct ore_striping_info *si, struct page *page)
+{
+ struct request_queue *q;
+ struct ore_per_dev_state *per_dev;
+ struct ore_io_state *read_ios;
+ unsigned first_dev = si->dev - (si->dev %
+ (ios->layout->group_width * ios->layout->mirrors_p1));
+ unsigned comp = si->dev - first_dev;
+ unsigned added_len;
+
+ if (!ios->ios_read_4_write) {
+ int ret = _alloc_read_4_write(ios);
+
+ if (unlikely(ret))
+ return ret;
+ }
+
+ read_ios = ios->ios_read_4_write;
+ read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1;
+
+ per_dev = &read_ios->per_dev[comp];
+ if (!per_dev->length) {
+ per_dev->bio = bio_kmalloc(GFP_KERNEL,
+ ios->sp2d->pages_in_unit);
+ if (unlikely(!per_dev->bio)) {
+ ORE_DBGMSG("Failed to allocate BIO size=%u\n",
+ ios->sp2d->pages_in_unit);
+ return -ENOMEM;
+ }
+ per_dev->offset = si->obj_offset;
+ per_dev->dev = si->dev;
+ } else if (si->obj_offset != (per_dev->offset + per_dev->length)) {
+ u64 gap = si->obj_offset - (per_dev->offset + per_dev->length);
+
+ _ore_add_sg_seg(per_dev, gap, true);
+ }
+ q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
+ added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0);
+ if (unlikely(added_len != PAGE_SIZE)) {
+ ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
+ per_dev->bio->bi_vcnt);
+ return -ENOMEM;
+ }
+
+ per_dev->length += PAGE_SIZE;
+ return 0;
+}
+
+static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
+{
+ struct bio_vec *bv;
+ unsigned i, d;
+
+ /* loop on all devices all pages */
+ for (d = 0; d < ios->numdevs; d++) {
+ struct bio *bio = ios->per_dev[d].bio;
+
+ if (!bio)
+ continue;
+
+ __bio_for_each_segment(bv, bio, i, 0) {
+ struct page *page = bv->bv_page;
+
+ SetPageUptodate(page);
+ if (PageError(page))
+ ClearPageError(page);
+ }
+ }
+}
+
+/* read_4_write is hacked to read the start of the first stripe and/or
+ * the end of the last stripe. If needed, with an sg-gap at each device/page.
+ * It is assumed to be called after the to_be_written pages of the first stripe
+ * are populating ios->sp2d[][]
+ *
+ * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations
+ * These pages are held at sp2d[p].pages[c] but with
+ * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are
+ * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is
+ * @uptodate=true, so we don't need to read it, only unlock, after IO.
+ *
+ * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then
+ * to-be-written count, we should consider the xor-in-place mode.
+ * need_to_read_pages_count is the actual number of pages not present in cache.
+ * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
+ * approximation? In this mode the read pages are put in the empty places of
+ * ios->sp2d[p][*], xor is calculated the same way. These pages are
+ * allocated/freed and don't go through cache
+ */
+static int _read_4_write(struct ore_io_state *ios)
+{
+ struct ore_io_state *ios_read;
+ struct ore_striping_info read_si;
+ struct __stripe_pages_2d *sp2d = ios->sp2d;
+ u64 offset = ios->si.first_stripe_start;
+ u64 last_stripe_end;
+ unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
+ unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+ int ret;
+
+ if (offset == ios->offset) /* Go to start collect $200 */
+ goto read_last_stripe;
+
+ min_p = _sp2d_min_pg(sp2d);
+ max_p = _sp2d_max_pg(sp2d);
+
+ for (c = 0; ; c++) {
+ ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+ read_si.obj_offset += min_p * PAGE_SIZE;
+ offset += min_p * PAGE_SIZE;
+ for (p = min_p; p <= max_p; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+ struct page **pp = &_1ps->pages[c];
+ bool uptodate;
+
+ if (*pp)
+ /* to-be-written pages start here */
+ goto read_last_stripe;
+
+ *pp = ios->r4w->get_page(ios->private, offset,
+ &uptodate);
+ if (unlikely(!*pp))
+ return -ENOMEM;
+
+ if (!uptodate)
+ _add_to_read_4_write(ios, &read_si, *pp);
+
+ /* Mark read-pages to be cache_released */
+ _1ps->page_is_read[c] = true;
+ read_si.obj_offset += PAGE_SIZE;
+ offset += PAGE_SIZE;
+ }
+ offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
+ }
+
+read_last_stripe:
+ offset = ios->offset + (ios->length + PAGE_SIZE - 1) /
+ PAGE_SIZE * PAGE_SIZE;
+ last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
+ * bytes_in_stripe;
+ if (offset == last_stripe_end) /* Optimize for the aligned case */
+ goto read_it;
+
+ ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+ p = read_si.unit_off / PAGE_SIZE;
+ c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
+ ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
+
+ BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end);
+ /* unaligned IO must be within a single stripe */
+
+ if (min_p == sp2d->pages_in_unit) {
+ /* Didn't do it yet */
+ min_p = _sp2d_min_pg(sp2d);
+ max_p = _sp2d_max_pg(sp2d);
+ }
+
+ while (offset < last_stripe_end) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if ((min_p <= p) && (p <= max_p)) {
+ struct page *page;
+ bool uptodate;
+
+ BUG_ON(_1ps->pages[c]);
+ page = ios->r4w->get_page(ios->private, offset,
+ &uptodate);
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ _1ps->pages[c] = page;
+ /* Mark read-pages to be cache_released */
+ _1ps->page_is_read[c] = true;
+ if (!uptodate)
+ _add_to_read_4_write(ios, &read_si, page);
+ }
+
+ offset += PAGE_SIZE;
+ if (p == (sp2d->pages_in_unit - 1)) {
+ ++c;
+ p = 0;
+ ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+ } else {
+ read_si.obj_offset += PAGE_SIZE;
+ ++p;
+ }
+ }
+
+read_it:
+ ios_read = ios->ios_read_4_write;
+ if (!ios_read)
+ return 0;
+
+ /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
+ * to check for per_dev->bio
+ */
+ ios_read->pages = ios->pages;
+
+ /* Now read these devices */
+ for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
+ ret = _ore_read_mirror(ios_read, i);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ ret = ore_io_execute(ios_read); /* Synchronus execution */
+ if (unlikely(ret)) {
+ ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
+ return ret;
+ }
+
+ _mark_read4write_pages_uptodate(ios_read, ret);
+ return 0;
+}
+
+/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */
+int _ore_add_parity_unit(struct ore_io_state *ios,
+ struct ore_striping_info *si,
+ struct ore_per_dev_state *per_dev,
+ unsigned cur_len)
+{
+ if (ios->reading) {
+ BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev);
+ _ore_add_sg_seg(per_dev, cur_len, true);
+ } else {
+ struct __stripe_pages_2d *sp2d = ios->sp2d;
+ struct page **pages = ios->parity_pages + ios->cur_par_page;
+ unsigned num_pages;
+ unsigned array_start = 0;
+ unsigned i;
+ int ret;
+
+ si->cur_pg = _sp2d_min_pg(sp2d);
+ num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
+
+ if (!cur_len) /* If last stripe operate on parity comp */
+ si->cur_comp = sp2d->data_devs;
+
+ if (!per_dev->length) {
+ per_dev->offset += si->cur_pg * PAGE_SIZE;
+ /* If first stripe, Read in all read4write pages
+ * (if needed) before we calculate the first parity.
+ */
+ _read_4_write(ios);
+ }
+
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = _raid_page_alloc();
+ if (unlikely(!pages[i]))
+ return -ENOMEM;
+
+ ++(ios->cur_par_page);
+ }
+
+ BUG_ON(si->cur_comp != sp2d->data_devs);
+ BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
+
+ ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
+ per_dev, num_pages * PAGE_SIZE);
+ if (unlikely(ret))
+ return ret;
+
+ /* TODO: raid6 if (last_parity_dev) */
+ _gen_xor_unit(sp2d);
+ _sp2d_reset(sp2d, ios->r4w, ios->private);
+ }
+ return 0;
+}
+
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
+{
+ struct ore_layout *layout = ios->layout;
+
+ if (ios->parity_pages) {
+ unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+ unsigned stripe_size = ios->si.bytes_in_stripe;
+ u64 last_stripe, first_stripe;
+
+ if (_sp2d_alloc(pages_in_unit, layout->group_width,
+ layout->parity, &ios->sp2d)) {
+ return -ENOMEM;
+ }
+
+ BUG_ON(ios->offset % PAGE_SIZE);
+
+ /* Round io down to last full strip */
+ first_stripe = div_u64(ios->offset, stripe_size);
+ last_stripe = div_u64(ios->offset + ios->length, stripe_size);
+
+ /* If an IO spans more then a single stripe it must end at
+ * a stripe boundary. The reminder at the end is pushed into the
+ * next IO.
+ */
+ if (last_stripe != first_stripe) {
+ ios->length = last_stripe * stripe_size - ios->offset;
+
+ BUG_ON(!ios->length);
+ ios->nr_pages = (ios->length + PAGE_SIZE - 1) /
+ PAGE_SIZE;
+ ios->si.length = ios->length; /*make it consistent */
+ }
+ }
+ return 0;
+}
+
+void _ore_free_raid_stuff(struct ore_io_state *ios)
+{
+ if (ios->sp2d) { /* writing and raid */
+ unsigned i;
+
+ for (i = 0; i < ios->cur_par_page; i++) {
+ struct page *page = ios->parity_pages[i];
+
+ if (page)
+ _raid_page_free(page);
+ }
+ if (ios->extra_part_alloc)
+ kfree(ios->parity_pages);
+ /* If IO returned an error pages might need unlocking */
+ _sp2d_reset(ios->sp2d, ios->r4w, ios->private);
+ _sp2d_free(ios->sp2d);
+ } else {
+ /* Will only be set if raid reading && sglist is big */
+ if (ios->extra_part_alloc)
+ kfree(ios->per_dev[0].sglist);
+ }
+ if (ios->ios_read_4_write)
+ ore_put_io_state(ios->ios_read_4_write);
+}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
new file mode 100644
index 00000000000..2ffd2c3c6e4
--- /dev/null
+++ b/fs/exofs/ore_raid.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) from 2011
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ * "Free Software Foundation <info@fsf.org>"
+ */
+
+#include <scsi/osd_ore.h>
+
+#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
+
+#ifdef CONFIG_EXOFS_DEBUG
+#define ORE_DBGMSG(fmt, a...) \
+ printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define ORE_DBGMSG(fmt, a...) \
+ do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+/* u64 has problems with printk this will cast it to unsigned long long */
+#define _LLU(x) (unsigned long long)(x)
+
+#define ORE_DBGMSG2(M...) do {} while (0)
+/* #define ORE_DBGMSG2 ORE_DBGMSG */
+
+/* Calculate the component order in a stripe. eg the logical data unit
+ * address within the stripe of @dev given the @par_dev of this stripe.
+ */
+static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
+ unsigned par_dev, unsigned dev)
+{
+ unsigned first_dev = dev - dev % devs_in_group;
+
+ dev -= first_dev;
+ par_dev -= first_dev;
+
+ if (devs_in_group == par_dev) /* The raid 0 case */
+ return dev / mirrors_p1;
+ /* raid4/5/6 case */
+ return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
+ mirrors_p1;
+}
+
+/* ios_raid.c stuff needed by ios.c */
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
+void _ore_free_raid_stuff(struct ore_io_state *ios);
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+ bool not_last);
+int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
+ struct ore_per_dev_state *per_dev, unsigned cur_len);
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+ struct ore_striping_info *si, struct page *page);
+static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
+ struct ore_striping_info *si, struct page *page)
+{
+ if (!sp2d) /* Inline the fast path */
+ return; /* Hay no raid stuff */
+ _ore_add_stripe_page(sp2d, si, page);
+}
+
+/* ios.c stuff needed by ios_raid.c */
+int _ore_get_io_state(struct ore_layout *layout,
+ struct ore_components *oc, unsigned numdevs,
+ unsigned sgs_per_dev, unsigned num_par_pages,
+ struct ore_io_state **pios);
+int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
+ unsigned pgbase, struct page **pages,
+ struct ore_per_dev_state *per_dev, int cur_len);
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
+int ore_io_execute(struct ore_io_state *ios);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 274894053b0..e6085ec192d 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -35,6 +35,7 @@
#include <linux/parser.h>
#include <linux/vfs.h>
#include <linux/random.h>
+#include <linux/module.h>
#include <linux/exportfs.h>
#include <linux/slab.h>
@@ -266,7 +267,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi)
struct ore_io_state *ios;
int ret;
- ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
return ret;
@@ -321,7 +322,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
struct ore_io_state *ios;
int ret;
- ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
return ret;
@@ -355,12 +356,12 @@ static const struct export_operations exofs_export_ops;
/*
* Write the superblock to the OSD
*/
-int exofs_sync_fs(struct super_block *sb, int wait)
+static int exofs_sync_fs(struct super_block *sb, int wait)
{
struct exofs_sb_info *sbi;
struct exofs_fscb *fscb;
struct ore_comp one_comp;
- struct ore_components comps;
+ struct ore_components oc;
struct ore_io_state *ios;
int ret = -ENOMEM;
@@ -378,9 +379,9 @@ int exofs_sync_fs(struct super_block *sb, int wait)
* the writeable info is set in exofs_sbi_write_stats() above.
*/
- exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID);
+ exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID);
- ret = ore_get_io_state(&sbi->layout, &comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oc, &ios);
if (unlikely(ret))
goto out;
@@ -429,19 +430,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
msg, dev_path ?: "", odi->osdname, _LLU(pid));
}
-void exofs_free_sbi(struct exofs_sb_info *sbi)
+static void exofs_free_sbi(struct exofs_sb_info *sbi)
{
- while (sbi->comps.numdevs) {
- int i = --sbi->comps.numdevs;
- struct osd_dev *od = sbi->comps.ods[i];
+ unsigned numdevs = sbi->oc.numdevs;
+
+ while (numdevs) {
+ unsigned i = --numdevs;
+ struct osd_dev *od = ore_comp_dev(&sbi->oc, i);
if (od) {
- sbi->comps.ods[i] = NULL;
+ ore_comp_set_dev(&sbi->oc, i, NULL);
osduld_put_device(od);
}
}
- if (sbi->comps.ods != sbi->_min_one_dev)
- kfree(sbi->comps.ods);
+ kfree(sbi->oc.ods);
kfree(sbi);
}
@@ -468,7 +470,7 @@ static void exofs_put_super(struct super_block *sb)
msecs_to_jiffies(100));
}
- _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0],
+ _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
sbi->one_comp.obj.partition);
bdi_destroy(&sbi->bdi);
@@ -479,76 +481,20 @@ static void exofs_put_super(struct super_block *sb)
static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
struct exofs_device_table *dt)
{
- u64 stripe_length;
+ int ret;
- sbi->data_map.odm_num_comps =
- le32_to_cpu(dt->dt_data_map.cb_num_comps);
- sbi->data_map.odm_stripe_unit =
+ sbi->layout.stripe_unit =
le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
- sbi->data_map.odm_group_width =
+ sbi->layout.group_width =
le32_to_cpu(dt->dt_data_map.cb_group_width);
- sbi->data_map.odm_group_depth =
+ sbi->layout.group_depth =
le32_to_cpu(dt->dt_data_map.cb_group_depth);
- sbi->data_map.odm_mirror_cnt =
- le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
- sbi->data_map.odm_raid_algorithm =
+ sbi->layout.mirrors_p1 =
+ le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
+ sbi->layout.raid_algorithm =
le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
-/* FIXME: Only raid0 for now. if not so, do not mount */
- if (sbi->data_map.odm_num_comps != numdevs) {
- EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
- sbi->data_map.odm_num_comps, numdevs);
- return -EINVAL;
- }
- if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
- EXOFS_ERR("Only RAID_0 for now\n");
- return -EINVAL;
- }
- if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
- EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
- numdevs, sbi->data_map.odm_mirror_cnt);
- return -EINVAL;
- }
-
- if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
- EXOFS_ERR("Stripe Unit(0x%llx)"
- " must be Multples of PAGE_SIZE(0x%lx)\n",
- _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
- return -EINVAL;
- }
-
- sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
- sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
-
- if (sbi->data_map.odm_group_width) {
- sbi->layout.group_width = sbi->data_map.odm_group_width;
- sbi->layout.group_depth = sbi->data_map.odm_group_depth;
- if (!sbi->layout.group_depth) {
- EXOFS_ERR("group_depth == 0 && group_width != 0\n");
- return -EINVAL;
- }
- sbi->layout.group_count = sbi->data_map.odm_num_comps /
- sbi->layout.mirrors_p1 /
- sbi->data_map.odm_group_width;
- } else {
- if (sbi->data_map.odm_group_depth) {
- printk(KERN_NOTICE "Warning: group_depth ignored "
- "group_width == 0 && group_depth == %d\n",
- sbi->data_map.odm_group_depth);
- sbi->data_map.odm_group_depth = 0;
- }
- sbi->layout.group_width = sbi->data_map.odm_num_comps /
- sbi->layout.mirrors_p1;
- sbi->layout.group_depth = -1;
- sbi->layout.group_count = 1;
- }
-
- stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
- if (stripe_length >= (1ULL << 32)) {
- EXOFS_ERR("Total Stripe length(0x%llx)"
- " >= 32bit is not supported\n", _LLU(stripe_length));
- return -EINVAL;
- }
+ ret = ore_verify_layout(numdevs, &sbi->layout);
EXOFS_DBGMSG("exofs: layout: "
"num_comps=%u stripe_unit=0x%x group_width=%u "
@@ -558,8 +504,8 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
sbi->layout.group_width,
_LLU(sbi->layout.group_depth),
sbi->layout.mirrors_p1,
- sbi->data_map.odm_raid_algorithm);
- return 0;
+ sbi->layout.raid_algorithm);
+ return ret;
}
static unsigned __ra_pages(struct ore_layout *layout)
@@ -605,12 +551,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
return !(odi->systemid_len || odi->osdname_len);
}
+int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
+ struct exofs_dev **peds)
+{
+ struct __alloc_ore_devs_and_exofs_devs {
+ /* Twice bigger table: See exofs_init_comps() and comment at
+ * exofs_read_lookup_dev_table()
+ */
+ struct ore_dev *oreds[numdevs * 2 - 1];
+ struct exofs_dev eds[numdevs];
+ } *aoded;
+ struct exofs_dev *eds;
+ unsigned i;
+
+ aoded = kzalloc(sizeof(*aoded), GFP_KERNEL);
+ if (unlikely(!aoded)) {
+ EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
+ numdevs);
+ return -ENOMEM;
+ }
+
+ sbi->oc.ods = aoded->oreds;
+ *peds = eds = aoded->eds;
+ for (i = 0; i < numdevs; ++i)
+ aoded->oreds[i] = &eds[i].ored;
+ return 0;
+}
+
static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
struct osd_dev *fscb_od,
unsigned table_count)
{
struct ore_comp comp;
struct exofs_device_table *dt;
+ struct exofs_dev *eds;
unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
sizeof(*dt);
unsigned numdevs, i;
@@ -623,7 +597,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
return -ENOMEM;
}
- sbi->comps.numdevs = 0;
+ sbi->oc.numdevs = 0;
comp.obj.partition = sbi->one_comp.obj.partition;
comp.obj.id = EXOFS_DEVTABLE_ID;
@@ -647,20 +621,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
if (unlikely(ret))
goto out;
- if (likely(numdevs > 1)) {
- unsigned size = numdevs * sizeof(sbi->comps.ods[0]);
-
- /* Twice bigger table: See exofs_init_comps() and below
- * comment
- */
- sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL);
- if (unlikely(!sbi->comps.ods)) {
- EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
- numdevs);
- ret = -ENOMEM;
- goto out;
- }
- }
+ ret = __alloc_dev_table(sbi, numdevs, &eds);
+ if (unlikely(ret))
+ goto out;
+ /* exofs round-robins the device table view according to inode
+ * number. We hold a: twice bigger table hence inodes can point
+ * to any device and have a sequential view of the table
+ * starting at this device. See exofs_init_comps()
+ */
+ memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
+ (numdevs - 1) * sizeof(sbi->oc.ods[0]));
for (i = 0; i < numdevs; i++) {
struct exofs_fscb fscb;
@@ -676,13 +646,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
i, odi.osdname);
+ /* the exofs id is currently the table index */
+ eds[i].did = i;
+
/* On all devices the device table is identical. The user can
* specify any one of the participating devices on the command
* line. We always keep them in device-table order.
*/
if (fscb_od && osduld_device_same(fscb_od, &odi)) {
- sbi->comps.ods[i] = fscb_od;
- ++sbi->comps.numdevs;
+ eds[i].ored.od = fscb_od;
+ ++sbi->oc.numdevs;
fscb_od = NULL;
continue;
}
@@ -695,8 +668,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
goto out;
}
- sbi->comps.ods[i] = od;
- ++sbi->comps.numdevs;
+ eds[i].ored.od = od;
+ ++sbi->oc.numdevs;
/* Read the fscb of the other devices to make sure the FS
* partition is there.
@@ -718,21 +691,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
out:
kfree(dt);
- if (likely(!ret)) {
- unsigned numdevs = sbi->comps.numdevs;
-
- if (unlikely(fscb_od)) {
+ if (unlikely(fscb_od && !ret)) {
EXOFS_ERR("ERROR: Bad device-table container device not present\n");
osduld_put_device(fscb_od);
return -EINVAL;
- }
- /* exofs round-robins the device table view according to inode
- * number. We hold a: twice bigger table hence inodes can point
- * to any device and have a sequential view of the table
- * starting at this device. See exofs_init_comps()
- */
- for (i = 0; i < numdevs - 1; ++i)
- sbi->comps.ods[i + numdevs] = sbi->comps.ods[i];
}
return ret;
}
@@ -783,10 +745,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
sbi->one_comp.obj.partition = opts->pid;
sbi->one_comp.obj.id = 0;
exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
- sbi->comps.numdevs = 1;
- sbi->comps.single_comp = EC_SINGLE_COMP;
- sbi->comps.comps = &sbi->one_comp;
- sbi->comps.ods = sbi->_min_one_dev;
+ sbi->oc.numdevs = 1;
+ sbi->oc.single_comp = EC_SINGLE_COMP;
+ sbi->oc.comps = &sbi->one_comp;
/* fill in some other data by hand */
memset(sb->s_id, 0, sizeof(sb->s_id));
@@ -835,7 +796,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
if (unlikely(ret))
goto free_sbi;
} else {
- sbi->comps.ods[0] = od;
+ struct exofs_dev *eds;
+
+ ret = __alloc_dev_table(sbi, 1, &eds);
+ if (unlikely(ret))
+ goto free_sbi;
+
+ ore_comp_set_dev(&sbi->oc, 0, od);
}
__sbi_read_stats(sbi);
@@ -875,7 +842,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
}
- _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0],
+ _exofs_print_device("Mounting", opts->dev_name,
+ ore_comp_dev(&sbi->oc, 0),
sbi->one_comp.obj.partition);
return 0;
@@ -924,7 +892,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
uint64_t used = ULLONG_MAX;
int ret;
- ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
if (ret) {
EXOFS_DBGMSG("ore_get_io_state failed.\n");
return ret;
@@ -981,7 +949,7 @@ static const struct super_operations exofs_sops = {
* EXPORT OPERATIONS
*****************************************************************************/
-struct dentry *exofs_get_parent(struct dentry *child)
+static struct dentry *exofs_get_parent(struct dentry *child)
{
unsigned long ino = exofs_parent_ino(child);
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 8f44cef1b3e..a8cbe1bc6ad 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -421,7 +421,7 @@ static inline int rsv_is_empty(struct ext2_reserve_window *rsv)
void ext2_init_block_alloc_info(struct inode *inode)
{
struct ext2_inode_info *ei = EXT2_I(inode);
- struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info;
+ struct ext2_block_alloc_info *block_i;
struct super_block *sb = inode->i_sb;
block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index af9fc89b1b2..9a4e5e206d0 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -135,10 +135,10 @@ extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long);
struct dentry *ext2_get_parent(struct dentry *child);
/* super.c */
-extern void ext2_error (struct super_block *, const char *, const char *, ...)
- __attribute__ ((format (printf, 3, 4)));
-extern void ext2_msg(struct super_block *, const char *, const char *, ...)
- __attribute__ ((format (printf, 3, 4)));
+extern __printf(3, 4)
+void ext2_error(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ext2_msg(struct super_block *, const char *, const char *, ...);
extern void ext2_update_dynamic_rev (struct super_block *sb);
extern void ext2_write_super (struct super_block *);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ee9ed31948e..c4e81dfb74b 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -601,7 +601,7 @@ fail_free_drop:
fail_drop:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
- inode->i_nlink = 0;
+ clear_nlink(inode);
unlock_new_inode(inode);
iput(inode);
return ERR_PTR(err);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index a8a58f63f07..91a6945af6d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1321,7 +1321,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
- inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le32_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1dd62ed35b8..bd8ac164a3b 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -327,10 +327,10 @@ static struct inode *ext2_nfs_get_inode(struct super_block *sb,
if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count))
return ERR_PTR(-ESTALE);
- /* iget isn't really right if the inode is currently unallocated!!
- * ext2_read_inode currently does appropriate checks, but
- * it might be "neater" to call ext2_get_inode first and check
- * if the inode is valid.....
+ /*
+ * ext2_iget isn't quite right if the inode is currently unallocated!
+ * However ext2_iget currently does appropriate checks to handle stale
+ * inodes so everything is OK.
*/
inode = ext2_iget(sb, ino);
if (IS_ERR(inode))
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 5d979b4347b..c922adc8ef4 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -46,28 +46,30 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
value, size, flags);
}
-int
-ext2_init_security(struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *name;
+ const struct xattr *xattr;
+ int err = 0;
- err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, 0);
+ if (err < 0)
+ break;
}
- err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY,
- name, value, len, 0);
- kfree(name);
- kfree(value);
return err;
}
+int
+ext2_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &ext2_initxattrs, NULL);
+}
+
const struct xattr_handler ext2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext2_xattr_security_list,
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 6386d76f44a..a2038928f9a 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -427,7 +427,7 @@ static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
void ext3_init_block_alloc_info(struct inode *inode)
{
struct ext3_inode_info *ei = EXT3_I(inode);
- struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
+ struct ext3_block_alloc_info *block_i;
struct super_block *sb = inode->i_sb;
block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
@@ -1440,14 +1440,14 @@ out:
*
* Check if filesystem has at least 1 free block available for allocation.
*/
-static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
+static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation)
{
ext3_fsblk_t free_blocks, root_blocks;
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
- sbi->s_resuid != current_fsuid() &&
+ !use_reservation && sbi->s_resuid != current_fsuid() &&
(sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
return 0;
}
@@ -1468,7 +1468,7 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
*/
int ext3_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3)
+ if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1546,7 +1546,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
my_rsv = &block_i->rsv_window_node;
- if (!ext3_has_free_blocks(sbi)) {
+ if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) {
*errp = -ENOSPC;
goto out;
}
@@ -1924,9 +1924,10 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
* reaches any used block. Then issue a TRIM command on this extent and free
* the extent in the block bitmap. This is done until whole group is scanned.
*/
-ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
- ext3_grpblk_t start, ext3_grpblk_t max,
- ext3_grpblk_t minblocks)
+static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
+ unsigned int group,
+ ext3_grpblk_t start, ext3_grpblk_t max,
+ ext3_grpblk_t minblocks)
{
handle_t *handle;
ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d494c554c6e..1860ed35632 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -61,13 +61,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
- /*
- * Taking the mutex here just to keep consistent with how fsync was
- * called previously, however it looks like we don't need to take
- * i_mutex at all.
- */
- mutex_lock(&inode->i_mutex);
-
J_ASSERT(ext3_journal_current_handle() == NULL);
/*
@@ -85,7 +78,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* safe in-journal, which is all fsync() needs to ensure.
*/
if (ext3_should_journal_data(inode)) {
- mutex_unlock(&inode->i_mutex);
ret = ext3_force_commit(inode->i_sb);
goto out;
}
@@ -108,8 +100,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
if (needs_barrier)
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
-
- mutex_unlock(&inode->i_mutex);
out:
trace_ext3_sync_file_exit(inode, ret);
return ret;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index bf09cbf938c..5c866e06e7a 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -178,42 +178,6 @@ error_return:
}
/*
- * There are two policies for allocating an inode. If the new inode is
- * a directory, then a forward search is made for a block group with both
- * free space and a low directory-to-inode ratio; if that fails, then of
- * the groups with above-average free space, that group with the fewest
- * directories already is chosen.
- *
- * For other inodes, search forward from the parent directory\'s block
- * group to find a free inode.
- */
-static int find_group_dir(struct super_block *sb, struct inode *parent)
-{
- int ngroups = EXT3_SB(sb)->s_groups_count;
- unsigned int freei, avefreei;
- struct ext3_group_desc *desc, *best_desc = NULL;
- int group, best_group = -1;
-
- freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter);
- avefreei = freei / ngroups;
-
- for (group = 0; group < ngroups; group++) {
- desc = ext3_get_group_desc (sb, group, NULL);
- if (!desc || !desc->bg_free_inodes_count)
- continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
- continue;
- if (!best_desc ||
- (le16_to_cpu(desc->bg_free_blocks_count) >
- le16_to_cpu(best_desc->bg_free_blocks_count))) {
- best_group = group;
- best_desc = desc;
- }
- }
- return best_group;
-}
-
-/*
* Orlov's allocator for directories.
*
* We always try to spread first-level directories.
@@ -436,12 +400,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
sbi = EXT3_SB(sb);
es = sbi->s_es;
- if (S_ISDIR(mode)) {
- if (test_opt (sb, OLDALLOC))
- group = find_group_dir(sb, dir);
- else
- group = find_group_orlov(sb, dir);
- } else
+ if (S_ISDIR(mode))
+ group = find_group_orlov(sb, dir);
+ else
group = find_group_other(sb, dir);
err = -ENOSPC;
@@ -621,7 +582,7 @@ fail_free_drop:
fail_drop:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
- inode->i_nlink = 0;
+ clear_nlink(inode);
unlock_new_inode(inode);
iput(inode);
brelse(bitmap_bh);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 12661e1deed..85fe655fe3e 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2899,7 +2899,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
- inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le32_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index c7f43944f16..ba1b54e23ca 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -150,30 +150,6 @@ setversion_out:
mnt_drop_write(filp->f_path.mnt);
return err;
}
-#ifdef CONFIG_JBD_DEBUG
- case EXT3_IOC_WAIT_FOR_READONLY:
- /*
- * This is racy - by the time we're woken up and running,
- * the superblock could be released. And the module could
- * have been unloaded. So sue me.
- *
- * Returns 1 if it slept, else zero.
- */
- {
- struct super_block *sb = inode->i_sb;
- DECLARE_WAITQUEUE(wait, current);
- int ret = 0;
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
- if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) {
- schedule();
- ret = 1;
- }
- remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
- return ret;
- }
-#endif
case EXT3_IOC_GETRSVSZ:
if (test_opt(inode->i_sb, RESERVATION)
&& S_ISREG(inode->i_mode)
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 0629e09f651..642dc6d66df 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1821,7 +1821,7 @@ retry:
de->name_len = 2;
strcpy (de->name, "..");
ext3_set_de_type(dir->i_sb, de, S_IFDIR);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, dir_block);
if (err)
@@ -1833,7 +1833,7 @@ retry:
if (err) {
out_clear_inode:
- inode->i_nlink = 0;
+ clear_nlink(inode);
unlock_new_inode(inode);
ext3_mark_inode_dirty(handle, inode);
iput (inode);
@@ -2170,7 +2170,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
ext3_warning (inode->i_sb, "ext3_unlink",
"Deleting nonexistent file (%lu), %d",
inode->i_ino, inode->i_nlink);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
retval = ext3_delete_entry(handle, dir, de, bh);
if (retval)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 7beb69ae001..922d289aeeb 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -652,8 +652,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",nouid32");
if (test_opt(sb, DEBUG))
seq_puts(seq, ",debug");
- if (test_opt(sb, OLDALLOC))
- seq_puts(seq, ",oldalloc");
#ifdef CONFIG_EXT3_FS_XATTR
if (test_opt(sb, XATTR_USER))
seq_puts(seq, ",user_xattr");
@@ -1049,10 +1047,12 @@ static int parse_options (char *options, struct super_block *sb,
set_opt (sbi->s_mount_opt, DEBUG);
break;
case Opt_oldalloc:
- set_opt (sbi->s_mount_opt, OLDALLOC);
+ ext3_msg(sb, KERN_WARNING,
+ "Ignoring deprecated oldalloc option");
break;
case Opt_orlov:
- clear_opt (sbi->s_mount_opt, OLDALLOC);
+ ext3_msg(sb, KERN_WARNING,
+ "Ignoring deprecated orlov option");
break;
#ifdef CONFIG_EXT3_FS_XATTR
case Opt_user_xattr:
@@ -2669,13 +2669,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
/*
* If we have an unprocessed orphan list hanging
* around from a previously readonly bdev mount,
- * require a full umount/remount for now.
+ * require a full umount & mount for now.
*/
if (es->s_last_orphan) {
ext3_msg(sb, KERN_WARNING, "warning: couldn't "
"remount RDWR because of unprocessed "
"orphan inode list. Please "
- "umount/remount instead.");
+ "umount & mount instead.");
err = -EINVAL;
goto restore_opts;
}
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index b8d9f83aa5c..3c218b8a51d 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -48,28 +48,32 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-int
-ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *name;
+ const struct xattr *xattr;
+ handle_t *handle = fs_info;
+ int err = 0;
- err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ext3_xattr_set_handle(handle, inode,
+ EXT3_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, 0);
+ if (err < 0)
+ break;
}
- err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY,
- name, value, len, 0);
- kfree(name);
- kfree(value);
return err;
}
+int
+ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &ext3_initxattrs, handle);
+}
+
const struct xattr_handler ext3_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext3_xattr_security_list,
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f8224adf496..12ccacda44e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -28,7 +28,8 @@
*/
/*
- * Calculate the block group number and offset, given a block number
+ * Calculate the block group number and offset into the block/cluster
+ * allocation bitmap, given a block number
*/
void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
@@ -37,7 +38,8 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
ext4_grpblk_t offset;
blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
- offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
+ offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
+ EXT4_SB(sb)->s_cluster_bits;
if (offsetp)
*offsetp = offset;
if (blockgrpp)
@@ -55,130 +57,169 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
return 0;
}
-static int ext4_group_used_meta_blocks(struct super_block *sb,
- ext4_group_t block_group,
- struct ext4_group_desc *gdp)
+/* Return the number of clusters used for file system metadata; this
+ * represents the overhead needed by the file system.
+ */
+unsigned ext4_num_overhead_clusters(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
{
- ext4_fsblk_t tmp;
+ unsigned num_clusters;
+ int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
+ ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
+ ext4_fsblk_t itbl_blk;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- /* block bitmap, inode bitmap, and inode table blocks */
- int used_blocks = sbi->s_itb_per_group + 2;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
- if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
- block_group))
- used_blocks--;
-
- if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp),
- block_group))
- used_blocks--;
-
- tmp = ext4_inode_table(sb, gdp);
- for (; tmp < ext4_inode_table(sb, gdp) +
- sbi->s_itb_per_group; tmp++) {
- if (!ext4_block_in_group(sb, tmp, block_group))
- used_blocks -= 1;
+ /* This is the number of clusters used by the superblock,
+ * block group descriptors, and reserved block group
+ * descriptor blocks */
+ num_clusters = ext4_num_base_meta_clusters(sb, block_group);
+
+ /*
+ * For the allocation bitmaps and inode table, we first need
+ * to check to see if the block is in the block group. If it
+ * is, then check to see if the cluster is already accounted
+ * for in the clusters used for the base metadata cluster, or
+ * if we can increment the base metadata cluster to include
+ * that block. Otherwise, we will have to track the cluster
+ * used for the allocation bitmap or inode table explicitly.
+ * Normally all of these blocks are contiguous, so the special
+ * case handling shouldn't be necessary except for *very*
+ * unusual file system layouts.
+ */
+ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
+ block_cluster = EXT4_B2C(sbi, (start -
+ ext4_block_bitmap(sb, gdp)));
+ if (block_cluster < num_clusters)
+ block_cluster = -1;
+ else if (block_cluster == num_clusters) {
+ num_clusters++;
+ block_cluster = -1;
}
}
- return used_blocks;
-}
-/* Initializes an uninitialized block bitmap if given, and returns the
- * number of blocks free in the group. */
-unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
- ext4_group_t block_group, struct ext4_group_desc *gdp)
-{
- int bit, bit_max;
- ext4_group_t ngroups = ext4_get_groups_count(sb);
- unsigned free_blocks, group_blocks;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (bh) {
- J_ASSERT_BH(bh, buffer_locked(bh));
-
- /* If checksum is bad mark all blocks used to prevent allocation
- * essentially implementing a per-group read-only flag. */
- if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
- ext4_error(sb, "Checksum bad for group %u",
- block_group);
- ext4_free_blks_set(sb, gdp, 0);
- ext4_free_inodes_set(sb, gdp, 0);
- ext4_itable_unused_set(sb, gdp, 0);
- memset(bh->b_data, 0xff, sb->s_blocksize);
- return 0;
+ if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
+ inode_cluster = EXT4_B2C(sbi,
+ start - ext4_inode_bitmap(sb, gdp));
+ if (inode_cluster < num_clusters)
+ inode_cluster = -1;
+ else if (inode_cluster == num_clusters) {
+ num_clusters++;
+ inode_cluster = -1;
}
- memset(bh->b_data, 0, sb->s_blocksize);
}
- /* Check for superblock and gdt backups in this group */
- bit_max = ext4_bg_has_super(sb, block_group);
-
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
- block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
- sbi->s_desc_per_block) {
- if (bit_max) {
- bit_max += ext4_bg_num_gdb(sb, block_group);
- bit_max +=
- le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+ itbl_blk = ext4_inode_table(sb, gdp);
+ for (i = 0; i < sbi->s_itb_per_group; i++) {
+ if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
+ c = EXT4_B2C(sbi, start - itbl_blk + i);
+ if ((c < num_clusters) || (c == inode_cluster) ||
+ (c == block_cluster) || (c == itbl_cluster))
+ continue;
+ if (c == num_clusters) {
+ num_clusters++;
+ continue;
+ }
+ num_clusters++;
+ itbl_cluster = c;
}
- } else { /* For META_BG_BLOCK_GROUPS */
- bit_max += ext4_bg_num_gdb(sb, block_group);
}
- if (block_group == ngroups - 1) {
+ if (block_cluster != -1)
+ num_clusters++;
+ if (inode_cluster != -1)
+ num_clusters++;
+
+ return num_clusters;
+}
+
+static unsigned int num_clusters_in_group(struct super_block *sb,
+ ext4_group_t block_group)
+{
+ unsigned int blocks;
+
+ if (block_group == ext4_get_groups_count(sb) - 1) {
/*
- * Even though mke2fs always initialize first and last group
- * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
- * to make sure we calculate the right free blocks
+ * Even though mke2fs always initializes the first and
+ * last group, just in case some other tool was used,
+ * we need to make sure we calculate the right free
+ * blocks.
*/
- group_blocks = ext4_blocks_count(sbi->s_es) -
- ext4_group_first_block_no(sb, ngroups - 1);
- } else {
- group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
- }
+ blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
+ ext4_group_first_block_no(sb, block_group);
+ } else
+ blocks = EXT4_BLOCKS_PER_GROUP(sb);
+ return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
+}
- free_blocks = group_blocks - bit_max;
+/* Initializes an uninitialized block bitmap */
+void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
+{
+ unsigned int bit, bit_max;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t start, tmp;
+ int flex_bg = 0;
+
+ J_ASSERT_BH(bh, buffer_locked(bh));
+
+ /* If checksum is bad mark all blocks used to prevent allocation
+ * essentially implementing a per-group read-only flag. */
+ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+ ext4_error(sb, "Checksum bad for group %u", block_group);
+ ext4_free_group_clusters_set(sb, gdp, 0);
+ ext4_free_inodes_set(sb, gdp, 0);
+ ext4_itable_unused_set(sb, gdp, 0);
+ memset(bh->b_data, 0xff, sb->s_blocksize);
+ return;
+ }
+ memset(bh->b_data, 0, sb->s_blocksize);
- if (bh) {
- ext4_fsblk_t start, tmp;
- int flex_bg = 0;
+ bit_max = ext4_num_base_meta_clusters(sb, block_group);
+ for (bit = 0; bit < bit_max; bit++)
+ ext4_set_bit(bit, bh->b_data);
- for (bit = 0; bit < bit_max; bit++)
- ext4_set_bit(bit, bh->b_data);
+ start = ext4_group_first_block_no(sb, block_group);
- start = ext4_group_first_block_no(sb, block_group);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ flex_bg = 1;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_FLEX_BG))
- flex_bg = 1;
+ /* Set bits for block and inode bitmaps, and inode table */
+ tmp = ext4_block_bitmap(sb, gdp);
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
- /* Set bits for block and inode bitmaps, and inode table */
- tmp = ext4_block_bitmap(sb, gdp);
- if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
- ext4_set_bit(tmp - start, bh->b_data);
+ tmp = ext4_inode_bitmap(sb, gdp);
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
- tmp = ext4_inode_bitmap(sb, gdp);
+ tmp = ext4_inode_table(sb, gdp);
+ for (; tmp < ext4_inode_table(sb, gdp) +
+ sbi->s_itb_per_group; tmp++) {
if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
- ext4_set_bit(tmp - start, bh->b_data);
-
- tmp = ext4_inode_table(sb, gdp);
- for (; tmp < ext4_inode_table(sb, gdp) +
- sbi->s_itb_per_group; tmp++) {
- if (!flex_bg ||
- ext4_block_in_group(sb, tmp, block_group))
- ext4_set_bit(tmp - start, bh->b_data);
- }
- /*
- * Also if the number of blocks within the group is
- * less than the blocksize * 8 ( which is the size
- * of bitmap ), set rest of the block bitmap to 1
- */
- ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
- bh->b_data);
+ ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
}
- return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
+
+ /*
+ * Also if the number of blocks within the group is less than
+ * the blocksize * 8 ( which is the size of bitmap ), set rest
+ * of the block bitmap to 1
+ */
+ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
+ sb->s_blocksize * 8, bh->b_data);
}
+/* Return the number of free blocks in a block group. It is used when
+ * the block bitmap is uninitialized, so we can't just count the bits
+ * in the bitmap. */
+unsigned ext4_free_clusters_after_init(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
+{
+ return num_clusters_in_group(sb, block_group) -
+ ext4_num_overhead_clusters(sb, block_group, gdp);
+}
/*
* The free blocks are managed by bitmaps. A file system contains several
@@ -362,53 +403,54 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
}
/**
- * ext4_has_free_blocks()
+ * ext4_has_free_clusters()
* @sbi: in-core super block structure.
- * @nblocks: number of needed blocks
+ * @nclusters: number of needed blocks
+ * @flags: flags from ext4_mb_new_blocks()
*
- * Check if filesystem has nblocks free & available for allocation.
+ * Check if filesystem has nclusters free & available for allocation.
* On success return 1, return 0 on failure.
*/
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
- s64 nblocks, unsigned int flags)
+static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
+ s64 nclusters, unsigned int flags)
{
- s64 free_blocks, dirty_blocks, root_blocks;
- struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
- struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
-
- free_blocks = percpu_counter_read_positive(fbc);
- dirty_blocks = percpu_counter_read_positive(dbc);
- root_blocks = ext4_r_blocks_count(sbi->s_es);
-
- if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
- EXT4_FREEBLOCKS_WATERMARK) {
- free_blocks = percpu_counter_sum_positive(fbc);
- dirty_blocks = percpu_counter_sum_positive(dbc);
+ s64 free_clusters, dirty_clusters, root_clusters;
+ struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
+ struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
+
+ free_clusters = percpu_counter_read_positive(fcc);
+ dirty_clusters = percpu_counter_read_positive(dcc);
+ root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es));
+
+ if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+ EXT4_FREECLUSTERS_WATERMARK) {
+ free_clusters = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc));
+ dirty_clusters = percpu_counter_sum_positive(dcc);
}
- /* Check whether we have space after
- * accounting for current dirty blocks & root reserved blocks.
+ /* Check whether we have space after accounting for current
+ * dirty clusters & root reserved clusters.
*/
- if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks))
+ if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
return 1;
- /* Hm, nope. Are (enough) root reserved blocks available? */
+ /* Hm, nope. Are (enough) root reserved clusters available? */
if (sbi->s_resuid == current_fsuid() ||
((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
capable(CAP_SYS_RESOURCE) ||
(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
- if (free_blocks >= (nblocks + dirty_blocks))
+ if (free_clusters >= (nclusters + dirty_clusters))
return 1;
}
return 0;
}
-int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
- s64 nblocks, unsigned int flags)
+int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+ s64 nclusters, unsigned int flags)
{
- if (ext4_has_free_blocks(sbi, nblocks, flags)) {
- percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
+ if (ext4_has_free_clusters(sbi, nclusters, flags)) {
+ percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
return 0;
} else
return -ENOSPC;
@@ -428,7 +470,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
+ if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
(*retries)++ > 3 ||
!EXT4_SB(sb)->s_journal)
return 0;
@@ -444,7 +486,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
- * @count: pointer to total number of blocks needed
+ * @count: pointer to total number of clusters needed
* @errp: error code
*
* Return 1st allocated block number on success, *count stores total account
@@ -476,18 +518,19 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- dquot_alloc_block_nofail(inode, ar.len);
+ dquot_alloc_block_nofail(inode,
+ EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
}
return ret;
}
/**
- * ext4_count_free_blocks() -- count filesystem free blocks
+ * ext4_count_free_clusters() -- count filesystem free clusters
* @sb: superblock
*
- * Adds up the number of free blocks from each block group.
+ * Adds up the number of free clusters from each block group.
*/
-ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
+ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
{
ext4_fsblk_t desc_count;
struct ext4_group_desc *gdp;
@@ -508,7 +551,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += ext4_free_blks_count(sb, gdp);
+ desc_count += ext4_free_group_clusters(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
@@ -516,12 +559,13 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
x = ext4_count_free(bitmap_bh, sb->s_blocksize);
printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
- i, ext4_free_blks_count(sb, gdp), x);
+ i, ext4_free_group_clusters(sb, gdp), x);
bitmap_count += x;
}
brelse(bitmap_bh);
- printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
- ", computed = %llu, %llu\n", ext4_free_blocks_count(es),
+ printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
+ ", computed = %llu, %llu\n",
+ EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
desc_count, bitmap_count);
return bitmap_count;
#else
@@ -530,7 +574,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += ext4_free_blks_count(sb, gdp);
+ desc_count += ext4_free_group_clusters(sb, gdp);
}
return desc_count;
@@ -620,6 +664,31 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
}
+/*
+ * This function returns the number of file system metadata clusters at
+ * the beginning of a block group, including the reserved gdt blocks.
+ */
+unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+ ext4_group_t block_group)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned num;
+
+ /* Check for superblock and gdt backups in this group */
+ num = ext4_bg_has_super(sb, block_group);
+
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+ block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
+ sbi->s_desc_per_block) {
+ if (num) {
+ num += ext4_bg_num_gdb(sb, block_group);
+ num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+ }
+ } else { /* For META_BG_BLOCK_GROUPS */
+ num += ext4_bg_num_gdb(sb, block_group);
+ }
+ return EXT4_NUM_B2C(sbi, num);
+}
/**
* ext4_inode_to_goal_block - return a hint for block allocation
* @inode: inode for block allocation
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b7d7bd0f066..5b0e26a1272 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -144,9 +144,17 @@ struct ext4_allocation_request {
#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
#define EXT4_MAP_UNINIT (1 << BH_Uninit)
+/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
+ * ext4_map_blocks wants to know whether or not the underlying cluster has
+ * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
+ * the requested mapping was from previously mapped (or delayed allocated)
+ * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
+ * should never appear on buffer_head's state flags.
+ */
+#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
- EXT4_MAP_UNINIT)
+ EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
@@ -239,8 +247,11 @@ struct ext4_io_submit {
# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
#endif
#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
+#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \
+ EXT4_SB(s)->s_cluster_bits)
#ifdef __KERNEL__
# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
+# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits)
#else
# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10)
#endif
@@ -258,6 +269,14 @@ struct ext4_io_submit {
#endif
#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits)))
+/* Translate a block number to a cluster number */
+#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
+/* Translate a cluster number to a block number */
+#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits)
+/* Translate # of blks to # of clusters */
+#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \
+ (sbi)->s_cluster_bits)
+
/*
* Structure of a blocks group descriptor
*/
@@ -289,7 +308,7 @@ struct ext4_group_desc
struct flex_groups {
atomic_t free_inodes;
- atomic_t free_blocks;
+ atomic_t free_clusters;
atomic_t used_dirs;
};
@@ -306,6 +325,7 @@ struct flex_groups {
#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size)
#ifdef __KERNEL__
# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group)
+# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group)
# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block)
# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group)
# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits)
@@ -358,8 +378,7 @@ struct flex_groups {
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
- EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
- EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
@@ -520,6 +539,8 @@ struct ext4_new_group_data {
#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020
/* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
+ /* Request will not result in inode size update (user for fallocate) */
+#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
/*
* Flags used by ext4_free_blocks
@@ -528,6 +549,13 @@ struct ext4_new_group_data {
#define EXT4_FREE_BLOCKS_FORGET 0x0002
#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
+#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
+#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
+
+/*
+ * Flags used by ext4_discard_partial_page_buffers
+ */
+#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001
/*
* ioctl commands
@@ -538,9 +566,6 @@ struct ext4_new_group_data {
#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
-#ifdef CONFIG_JBD2_DEBUG
-#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
-#endif
#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
@@ -563,9 +588,6 @@ struct ext4_new_group_data {
#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
-#ifdef CONFIG_JBD2_DEBUG
-#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
-#endif
#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
#endif
@@ -837,6 +859,7 @@ struct ext4_inode_info {
ext4_group_t i_last_alloc_group;
/* allocation reservation info for delalloc */
+ /* In case of bigalloc, these refer to clusters rather than blocks */
unsigned int i_reserved_data_blocks;
unsigned int i_reserved_meta_blocks;
unsigned int i_allocated_meta_blocks;
@@ -886,7 +909,6 @@ struct ext4_inode_info {
/*
* Mount flags
*/
-#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
@@ -918,6 +940,9 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
+#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly
+ specified delalloc */
+
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
@@ -968,9 +993,9 @@ struct ext4_super_block {
/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
__le32 s_first_data_block; /* First Data Block */
__le32 s_log_block_size; /* Block size */
- __le32 s_obso_log_frag_size; /* Obsoleted fragment size */
+ __le32 s_log_cluster_size; /* Allocation cluster size */
/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
- __le32 s_obso_frags_per_group; /* Obsoleted fragments per group */
+ __le32 s_clusters_per_group; /* # Clusters per group */
__le32 s_inodes_per_group; /* # Inodes per group */
__le32 s_mtime; /* Mount time */
/*30*/ __le32 s_wtime; /* Write time */
@@ -1066,7 +1091,10 @@ struct ext4_super_block {
__u8 s_last_error_func[32]; /* function where the error happened */
#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
__u8 s_mount_opts[64];
- __le32 s_reserved[112]; /* Padding to the end of the block */
+ __le32 s_usr_quota_inum; /* inode for tracking user quota */
+ __le32 s_grp_quota_inum; /* inode for tracking group quota */
+ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
+ __le32 s_reserved[109]; /* Padding to the end of the block */
};
#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
@@ -1086,6 +1114,7 @@ struct ext4_sb_info {
unsigned long s_desc_size; /* Size of a group descriptor in bytes */
unsigned long s_inodes_per_block;/* Number of inodes per block */
unsigned long s_blocks_per_group;/* Number of blocks in a group */
+ unsigned long s_clusters_per_group; /* Number of clusters in a group */
unsigned long s_inodes_per_group;/* Number of inodes in a group */
unsigned long s_itb_per_group; /* Number of inode table blocks per group */
unsigned long s_gdb_count; /* Number of group descriptor blocks */
@@ -1094,6 +1123,8 @@ struct ext4_sb_info {
ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
unsigned long s_overhead_last; /* Last calculated overhead */
unsigned long s_blocks_last; /* Last seen block count */
+ unsigned int s_cluster_ratio; /* Number of blocks per cluster */
+ unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
struct buffer_head * s_sbh; /* Buffer containing the super block */
struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
@@ -1117,10 +1148,10 @@ struct ext4_sb_info {
u32 s_hash_seed[4];
int s_def_hash_version;
int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
- struct percpu_counter s_freeblocks_counter;
+ struct percpu_counter s_freeclusters_counter;
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
- struct percpu_counter s_dirtyblocks_counter;
+ struct percpu_counter s_dirtyclusters_counter;
struct blockgroup_lock *s_blockgroup_lock;
struct proc_dir_entry *s_proc;
struct kobject s_kobj;
@@ -1136,10 +1167,6 @@ struct ext4_sb_info {
u32 s_max_batch_time;
u32 s_min_batch_time;
struct block_device *journal_bdev;
-#ifdef CONFIG_JBD2_DEBUG
- struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
- wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
-#endif
#ifdef CONFIG_QUOTA
char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
int s_jquota_fmt; /* Format of quota to use */
@@ -1248,6 +1275,15 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
}
+static inline void ext4_set_io_unwritten_flag(struct inode *inode,
+ struct ext4_io_end *io_end)
+{
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ io_end->flag |= EXT4_IO_END_UNWRITTEN;
+ atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+ }
+}
+
/*
* Inode dynamic state flags
*/
@@ -1360,6 +1396,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
+#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1402,7 +1439,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE)
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)
/*
* Default values for user and/or group using reserved blocks
@@ -1735,9 +1773,9 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
unsigned int flags,
unsigned long *count,
int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
- s64 nblocks, unsigned int flags);
-extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
+extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+ s64 nclusters, unsigned int flags);
+extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
extern void ext4_check_blocks_bitmap(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
ext4_group_t block_group,
@@ -1745,12 +1783,18 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
ext4_group_t block_group);
-extern unsigned ext4_init_block_bitmap(struct super_block *sb,
- struct buffer_head *bh,
- ext4_group_t group,
- struct ext4_group_desc *desc);
-#define ext4_free_blocks_after_init(sb, group, desc) \
- ext4_init_block_bitmap(sb, NULL, group, desc)
+extern void ext4_init_block_bitmap(struct super_block *sb,
+ struct buffer_head *bh,
+ ext4_group_t group,
+ struct ext4_group_desc *desc);
+extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp);
+extern unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+ ext4_group_t block_group);
+extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
/* dir.c */
@@ -1776,7 +1820,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct
/* ialloc.c */
extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
- const struct qstr *qstr, __u32 goal);
+ const struct qstr *qstr, __u32 goal,
+ uid_t *owner);
extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
@@ -1839,6 +1884,12 @@ extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
extern int ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_discard_partial_page_buffers(handle_t *handle,
+ struct address_space *mapping, loff_t from,
+ loff_t length, int flags);
+extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+ struct inode *inode, struct page *page, loff_t from,
+ loff_t length, int flags);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1878,40 +1929,40 @@ extern int ext4_group_extend(struct super_block *sb,
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
extern void *ext4_kvzalloc(size_t size, gfp_t flags);
extern void ext4_kvfree(void *ptr);
-extern void __ext4_error(struct super_block *, const char *, unsigned int,
- const char *, ...)
- __attribute__ ((format (printf, 4, 5)));
+extern __printf(4, 5)
+void __ext4_error(struct super_block *, const char *, unsigned int,
+ const char *, ...);
#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
__LINE__, ## message)
-extern void ext4_error_inode(struct inode *, const char *, unsigned int,
- ext4_fsblk_t, const char *, ...)
- __attribute__ ((format (printf, 5, 6)));
-extern void ext4_error_file(struct file *, const char *, unsigned int,
- ext4_fsblk_t, const char *, ...)
- __attribute__ ((format (printf, 5, 6)));
+extern __printf(5, 6)
+void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+ const char *, ...);
+extern __printf(5, 6)
+void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+ const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
unsigned int, int);
-extern void __ext4_abort(struct super_block *, const char *, unsigned int,
- const char *, ...)
- __attribute__ ((format (printf, 4, 5)));
+extern __printf(4, 5)
+void __ext4_abort(struct super_block *, const char *, unsigned int,
+ const char *, ...);
#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
__LINE__, ## message)
-extern void __ext4_warning(struct super_block *, const char *, unsigned int,
- const char *, ...)
- __attribute__ ((format (printf, 4, 5)));
+extern __printf(4, 5)
+void __ext4_warning(struct super_block *, const char *, unsigned int,
+ const char *, ...);
#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
__LINE__, ## message)
-extern void ext4_msg(struct super_block *, const char *, const char *, ...)
- __attribute__ ((format (printf, 3, 4)));
+extern __printf(3, 4)
+void ext4_msg(struct super_block *, const char *, const char *, ...);
extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
const char *, unsigned int, const char *);
#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \
__LINE__, msg)
-extern void __ext4_grp_locked_error(const char *, unsigned int, \
- struct super_block *, ext4_group_t, \
- unsigned long, ext4_fsblk_t, \
- const char *, ...)
- __attribute__ ((format (printf, 7, 8)));
+extern __printf(7, 8)
+void __ext4_grp_locked_error(const char *, unsigned int,
+ struct super_block *, ext4_group_t,
+ unsigned long, ext4_fsblk_t,
+ const char *, ...);
#define ext4_grp_locked_error(sb, grp, message...) \
__ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
extern void ext4_update_dynamic_rev(struct super_block *sb);
@@ -1927,8 +1978,8 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
struct ext4_group_desc *bg);
-extern __u32 ext4_free_blks_count(struct super_block *sb,
- struct ext4_group_desc *bg);
+extern __u32 ext4_free_group_clusters(struct super_block *sb,
+ struct ext4_group_desc *bg);
extern __u32 ext4_free_inodes_count(struct super_block *sb,
struct ext4_group_desc *bg);
extern __u32 ext4_used_dirs_count(struct super_block *sb,
@@ -1941,8 +1992,9 @@ extern void ext4_inode_bitmap_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_table_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
-extern void ext4_free_blks_set(struct super_block *sb,
- struct ext4_group_desc *bg, __u32 count);
+extern void ext4_free_group_clusters_set(struct super_block *sb,
+ struct ext4_group_desc *bg,
+ __u32 count);
extern void ext4_free_inodes_set(struct super_block *sb,
struct ext4_group_desc *bg, __u32 count);
extern void ext4_used_dirs_set(struct super_block *sb,
@@ -2051,13 +2103,13 @@ do { \
} while (0)
#ifdef CONFIG_SMP
-/* Each CPU can accumulate percpu_counter_batch blocks in their local
- * counters. So we need to make sure we have free blocks more
+/* Each CPU can accumulate percpu_counter_batch clusters in their local
+ * counters. So we need to make sure we have free clusters more
* than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times.
*/
-#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
+#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
#else
-#define EXT4_FREEBLOCKS_WATERMARK 0
+#define EXT4_FREECLUSTERS_WATERMARK 0
#endif
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
@@ -2243,10 +2295,19 @@ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
enum ext4_state_bits {
BH_Uninit /* blocks are allocated but uninitialized on disk */
= BH_JBDPrivateStart,
+ BH_AllocFromCluster, /* allocated blocks were part of already
+ * allocated cluster. Note that this flag will
+ * never, ever appear in a buffer_head's state
+ * flag. See EXT4_MAP_FROM_CLUSTER to see where
+ * this is used. */
+ BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This
+ * flag is set when ext4_map_blocks is called on a
+ * delayed allocated block to get its real mapping. */
};
BUFFER_FNS(Uninit, uninit)
TAS_BUFFER_FNS(Uninit, uninit)
+BUFFER_FNS(Da_Mapped, da_mapped)
/*
* Add new method to test wether block and inode bitmaps are properly
@@ -2282,4 +2343,6 @@ extern void ext4_resize_end(struct super_block *sb);
#endif /* __KERNEL__ */
+#include "ext4_extents.h"
+
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 095c36f3b61..a52db3a69a3 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -290,5 +290,7 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
struct ext4_ext_path *);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
+ int search_hint_reverse);
#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index f5240aa1560..aca17901758 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -109,9 +109,11 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, line, __func__,
- bh, handle, err);
+ if (err) {
+ /* Errors can only happen if there is a bug */
+ handle->h_err = err;
+ __ext4_journal_stop(where, line, handle);
+ }
} else {
if (inode)
mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 57cf568a98a..607b1557d29 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -42,7 +42,6 @@
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
-#include "ext4_extents.h"
#include <trace/events/ext4.h>
@@ -96,13 +95,17 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
* - ENOMEM
* - EIO
*/
-static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
+#define ext4_ext_dirty(handle, inode, path) \
+ __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+static int __ext4_ext_dirty(const char *where, unsigned int line,
+ handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
{
int err;
if (path->p_bh) {
/* path points to block */
- err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
+ err = __ext4_handle_dirty_metadata(where, line, handle,
+ inode, path->p_bh);
} else {
/* path points to leaf/index in inode body */
err = ext4_mark_inode_dirty(handle, inode);
@@ -114,11 +117,9 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t block)
{
- int depth;
-
if (path) {
+ int depth = path->p_depth;
struct ext4_extent *ex;
- depth = path->p_depth;
/*
* Try to predict block placement assuming that we are
@@ -180,12 +181,10 @@ static inline int ext4_ext_space_block(struct inode *inode, int check)
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent);
- if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 6)
- size = 6;
+ if (!check && size > 6)
+ size = 6;
#endif
- }
return size;
}
@@ -195,12 +194,10 @@ static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent_idx);
- if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 5)
- size = 5;
+ if (!check && size > 5)
+ size = 5;
#endif
- }
return size;
}
@@ -211,12 +208,10 @@ static inline int ext4_ext_space_root(struct inode *inode, int check)
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent);
- if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 3)
- size = 3;
+ if (!check && size > 3)
+ size = 3;
#endif
- }
return size;
}
@@ -227,12 +222,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent_idx);
- if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 4)
- size = 4;
+ if (!check && size > 4)
+ size = 4;
#endif
- }
return size;
}
@@ -244,7 +237,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- int idxs, num = 0;
+ int idxs;
idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent_idx));
@@ -259,6 +252,8 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
*/
if (ei->i_da_metadata_calc_len &&
ei->i_da_metadata_calc_last_lblock+1 == lblock) {
+ int num = 0;
+
if ((ei->i_da_metadata_calc_len % idxs) == 0)
num++;
if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
@@ -321,8 +316,6 @@ static int ext4_valid_extent_entries(struct inode *inode,
struct ext4_extent_header *eh,
int depth)
{
- struct ext4_extent *ext;
- struct ext4_extent_idx *ext_idx;
unsigned short entries;
if (eh->eh_entries == 0)
return 1;
@@ -331,7 +324,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
if (depth == 0) {
/* leaf entries */
- ext = EXT_FIRST_EXTENT(eh);
+ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
@@ -339,7 +332,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
entries--;
}
} else {
- ext_idx = EXT_FIRST_INDEX(eh);
+ struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
while (entries) {
if (!ext4_valid_extent_idx(inode, ext_idx))
return 0;
@@ -751,31 +744,30 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
return -EIO;
}
- len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
/* insert after */
- if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
- len = (len - 1) * sizeof(struct ext4_extent_idx);
- len = len < 0 ? 0 : len;
- ext_debug("insert new index %d after: %llu. "
- "move %d from 0x%p to 0x%p\n",
- logical, ptr, len,
- (curp->p_idx + 1), (curp->p_idx + 2));
- memmove(curp->p_idx + 2, curp->p_idx + 1, len);
- }
+ ext_debug("insert new index %d after: %llu\n", logical, ptr);
ix = curp->p_idx + 1;
} else {
/* insert before */
- len = len * sizeof(struct ext4_extent_idx);
- len = len < 0 ? 0 : len;
- ext_debug("insert new index %d before: %llu. "
- "move %d from 0x%p to 0x%p\n",
- logical, ptr, len,
- curp->p_idx, (curp->p_idx + 1));
- memmove(curp->p_idx + 1, curp->p_idx, len);
+ ext_debug("insert new index %d before: %llu\n", logical, ptr);
ix = curp->p_idx;
}
+ len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
+ BUG_ON(len < 0);
+ if (len > 0) {
+ ext_debug("insert new index %d: "
+ "move %d indices from 0x%p to 0x%p\n",
+ logical, len, ix, ix + 1);
+ memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
+ }
+
+ if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
+ EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
+ return -EIO;
+ }
+
ix->ei_block = cpu_to_le32(logical);
ext4_idx_store_pblock(ix, ptr);
le16_add_cpu(&curp->p_hdr->eh_entries, 1);
@@ -1042,16 +1034,14 @@ cleanup:
*/
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
unsigned int flags,
- struct ext4_ext_path *path,
struct ext4_extent *newext)
{
- struct ext4_ext_path *curp = path;
struct ext4_extent_header *neh;
struct buffer_head *bh;
ext4_fsblk_t newblock;
int err = 0;
- newblock = ext4_ext_new_meta_block(handle, inode, path,
+ newblock = ext4_ext_new_meta_block(handle, inode, NULL,
newext, &err, flags);
if (newblock == 0)
return err;
@@ -1071,7 +1061,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
}
/* move top-level index/leaf into new block */
- memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
+ memmove(bh->b_data, EXT4_I(inode)->i_data,
+ sizeof(EXT4_I(inode)->i_data));
/* set size of new block */
neh = ext_block_hdr(bh);
@@ -1089,32 +1080,23 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
if (err)
goto out;
- /* create index in new top-level index: num,max,pointer */
- err = ext4_ext_get_access(handle, inode, curp);
- if (err)
- goto out;
-
- curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
- curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
- curp->p_hdr->eh_entries = cpu_to_le16(1);
- curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
-
- if (path[0].p_hdr->eh_depth)
- curp->p_idx->ei_block =
- EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
- else
- curp->p_idx->ei_block =
- EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
- ext4_idx_store_pblock(curp->p_idx, newblock);
-
+ /* Update top-level index: num,max,pointer */
neh = ext_inode_hdr(inode);
+ neh->eh_entries = cpu_to_le16(1);
+ ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
+ if (neh->eh_depth == 0) {
+ /* Root extent block becomes index block */
+ neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
+ EXT_FIRST_INDEX(neh)->ei_block =
+ EXT_FIRST_EXTENT(neh)->ee_block;
+ }
ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
- neh->eh_depth = cpu_to_le16(path->p_depth + 1);
- err = ext4_ext_dirty(handle, inode, curp);
+ neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);
+ ext4_mark_inode_dirty(handle, inode);
out:
brelse(bh);
@@ -1162,8 +1144,7 @@ repeat:
err = PTR_ERR(path);
} else {
/* tree is full, time to grow in depth */
- err = ext4_ext_grow_indepth(handle, inode, flags,
- path, newext);
+ err = ext4_ext_grow_indepth(handle, inode, flags, newext);
if (err)
goto out;
@@ -1235,9 +1216,9 @@ static int ext4_ext_search_left(struct inode *inode,
if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
EXT4_ERROR_INODE(inode,
"ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
- ix != NULL ? ix->ei_block : 0,
+ ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
- EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
+ le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
depth);
return -EIO;
}
@@ -1260,13 +1241,14 @@ static int ext4_ext_search_left(struct inode *inode,
/*
* search the closest allocated block to the right for *logical
* and returns it at @logical + it's physical address at @phys
- * if *logical is the smallest allocated block, the function
+ * if *logical is the largest allocated block, the function
* returns 0 at @phys
* return value contains 0 (success) or error code
*/
static int ext4_ext_search_right(struct inode *inode,
struct ext4_ext_path *path,
- ext4_lblk_t *logical, ext4_fsblk_t *phys)
+ ext4_lblk_t *logical, ext4_fsblk_t *phys,
+ struct ext4_extent **ret_ex)
{
struct buffer_head *bh = NULL;
struct ext4_extent_header *eh;
@@ -1308,9 +1290,7 @@ static int ext4_ext_search_right(struct inode *inode,
return -EIO;
}
}
- *logical = le32_to_cpu(ex->ee_block);
- *phys = ext4_ext_pblock(ex);
- return 0;
+ goto found_extent;
}
if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
@@ -1323,9 +1303,7 @@ static int ext4_ext_search_right(struct inode *inode,
if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
/* next allocated block in this leaf */
ex++;
- *logical = le32_to_cpu(ex->ee_block);
- *phys = ext4_ext_pblock(ex);
- return 0;
+ goto found_extent;
}
/* go up and search for index to the right */
@@ -1368,9 +1346,12 @@ got_index:
return -EIO;
}
ex = EXT_FIRST_EXTENT(eh);
+found_extent:
*logical = le32_to_cpu(ex->ee_block);
*phys = ext4_ext_pblock(ex);
- put_bh(bh);
+ *ret_ex = ex;
+ if (bh)
+ put_bh(bh);
return 0;
}
@@ -1395,7 +1376,8 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
while (depth >= 0) {
if (depth == path->p_depth) {
/* leaf */
- if (path[depth].p_ext !=
+ if (path[depth].p_ext &&
+ path[depth].p_ext !=
EXT_LAST_EXTENT(path[depth].p_hdr))
return le32_to_cpu(path[depth].p_ext[1].ee_block);
} else {
@@ -1623,7 +1605,8 @@ static int ext4_ext_try_to_merge(struct inode *inode,
* such that there will be no overlap, and then returns 1.
* If there is no overlap found, it returns 0.
*/
-static unsigned int ext4_ext_check_overlap(struct inode *inode,
+static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
+ struct inode *inode,
struct ext4_extent *newext,
struct ext4_ext_path *path)
{
@@ -1637,6 +1620,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
if (!path[depth].p_ext)
goto out;
b2 = le32_to_cpu(path[depth].p_ext->ee_block);
+ b2 &= ~(sbi->s_cluster_ratio - 1);
/*
* get the next allocated block if the extent in the path
@@ -1646,6 +1630,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
b2 = ext4_ext_next_allocated_block(path);
if (b2 == EXT_MAX_BLOCKS)
goto out;
+ b2 &= ~(sbi->s_cluster_ratio - 1);
}
/* check for wrap through zero on extent logical start block*/
@@ -1697,7 +1682,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
/* try to insert block into found extent and return */
if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
&& ext4_can_extents_be_merged(inode, ex, newext)) {
- ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
+ ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
@@ -1735,7 +1720,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
next = ext4_ext_next_leaf_block(path);
if (next != EXT_MAX_BLOCKS) {
- ext_debug("next leaf block - %d\n", next);
+ ext_debug("next leaf block - %u\n", next);
BUG_ON(npath != NULL);
npath = ext4_ext_find_extent(inode, next, NULL);
if (IS_ERR(npath))
@@ -1773,46 +1758,51 @@ has_space:
if (!nearex) {
/* there is no extent in this leaf, create first one */
- ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
+ ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext));
- path[depth].p_ext = EXT_FIRST_EXTENT(eh);
- } else if (le32_to_cpu(newext->ee_block)
+ nearex = EXT_FIRST_EXTENT(eh);
+ } else {
+ if (le32_to_cpu(newext->ee_block)
> le32_to_cpu(nearex->ee_block)) {
-/* BUG_ON(newext->ee_block == nearex->ee_block); */
- if (nearex != EXT_LAST_EXTENT(eh)) {
- len = EXT_MAX_EXTENT(eh) - nearex;
- len = (len - 1) * sizeof(struct ext4_extent);
- len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
- "move %d from 0x%p to 0x%p\n",
+ /* Insert after */
+ ext_debug("insert %u:%llu:[%d]%d before: "
+ "nearest %p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
- nearex, len, nearex + 1, nearex + 2);
- memmove(nearex + 2, nearex + 1, len);
+ nearex);
+ nearex++;
+ } else {
+ /* Insert before */
+ BUG_ON(newext->ee_block == nearex->ee_block);
+ ext_debug("insert %u:%llu:[%d]%d after: "
+ "nearest %p\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
+ ext4_ext_get_actual_len(newext),
+ nearex);
+ }
+ len = EXT_LAST_EXTENT(eh) - nearex + 1;
+ if (len > 0) {
+ ext_debug("insert %u:%llu:[%d]%d: "
+ "move %d extents from 0x%p to 0x%p\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
+ ext4_ext_get_actual_len(newext),
+ len, nearex, nearex + 1);
+ memmove(nearex + 1, nearex,
+ len * sizeof(struct ext4_extent));
}
- path[depth].p_ext = nearex + 1;
- } else {
- BUG_ON(newext->ee_block == nearex->ee_block);
- len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
- len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
- "move %d from 0x%p to 0x%p\n",
- le32_to_cpu(newext->ee_block),
- ext4_ext_pblock(newext),
- ext4_ext_is_uninitialized(newext),
- ext4_ext_get_actual_len(newext),
- nearex, len, nearex, nearex + 1);
- memmove(nearex + 1, nearex, len);
- path[depth].p_ext = nearex;
}
le16_add_cpu(&eh->eh_entries, 1);
- nearex = path[depth].p_ext;
+ path[depth].p_ext = nearex;
nearex->ee_block = newext->ee_block;
ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
nearex->ee_len = newext->ee_len;
@@ -1962,6 +1952,7 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
struct ext4_ext_cache *cex;
BUG_ON(len == 0);
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ trace_ext4_ext_put_in_cache(inode, block, len, start);
cex = &EXT4_I(inode)->i_cached_extent;
cex->ec_block = block;
cex->ec_len = len;
@@ -2063,6 +2054,7 @@ errout:
sbi->extent_cache_misses++;
else
sbi->extent_cache_hits++;
+ trace_ext4_ext_in_cache(inode, block, ret);
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return ret;
}
@@ -2130,6 +2122,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
if (err)
return err;
ext_debug("index is empty, remove it, free block %llu\n", leaf);
+ trace_ext4_ext_rm_idx(inode, leaf);
+
ext4_free_blocks(handle, inode, NULL, leaf, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
return err;
@@ -2158,7 +2152,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
* need to account for leaf block credit
*
* bitmaps and block group descriptor blocks
- * and other metadat blocks still need to be
+ * and other metadata blocks still need to be
* accounted.
*/
/* 1 bitmap, 1 block group descriptor */
@@ -2195,14 +2189,40 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
}
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
- struct ext4_extent *ex,
- ext4_lblk_t from, ext4_lblk_t to)
+ struct ext4_extent *ex,
+ ext4_fsblk_t *partial_cluster,
+ ext4_lblk_t from, ext4_lblk_t to)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
+ ext4_fsblk_t pblk;
int flags = EXT4_FREE_BLOCKS_FORGET;
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
flags |= EXT4_FREE_BLOCKS_METADATA;
+ /*
+ * For bigalloc file systems, we never free a partial cluster
+ * at the beginning of the extent. Instead, we make a note
+ * that we tried freeing the cluster, and check to see if we
+ * need to free it on a subsequent call to ext4_remove_blocks,
+ * or at the end of the ext4_truncate() operation.
+ */
+ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
+
+ trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
+ /*
+ * If we have a partial cluster, and it's different from the
+ * cluster of the last block, we need to explicitly free the
+ * partial cluster here.
+ */
+ pblk = ext4_ext_pblock(ex) + ee_len - 1;
+ if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(sbi, *partial_cluster),
+ sbi->s_cluster_ratio, flags);
+ *partial_cluster = 0;
+ }
+
#ifdef EXTENTS_STATS
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2222,12 +2242,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
&& to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
ext4_lblk_t num;
- ext4_fsblk_t start;
num = le32_to_cpu(ex->ee_block) + ee_len - from;
- start = ext4_ext_pblock(ex) + ee_len - num;
- ext_debug("free last %u blocks starting %llu\n", num, start);
- ext4_free_blocks(handle, inode, NULL, start, num, flags);
+ pblk = ext4_ext_pblock(ex) + ee_len - num;
+ ext_debug("free last %u blocks starting %llu\n", num, pblk);
+ ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
+ /*
+ * If the block range to be freed didn't start at the
+ * beginning of a cluster, and we removed the entire
+ * extent, save the partial cluster here, since we
+ * might need to delete if we determine that the
+ * truncate operation has removed all of the blocks in
+ * the cluster.
+ */
+ if (pblk & (sbi->s_cluster_ratio - 1) &&
+ (ee_len == num))
+ *partial_cluster = EXT4_B2C(sbi, pblk);
+ else
+ *partial_cluster = 0;
} else if (from == le32_to_cpu(ex->ee_block)
&& to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* head removal */
@@ -2238,7 +2270,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
start = ext4_ext_pblock(ex);
ext_debug("free first %u blocks starting %llu\n", num, start);
- ext4_free_blocks(handle, inode, 0, start, num, flags);
+ ext4_free_blocks(handle, inode, NULL, start, num, flags);
} else {
printk(KERN_INFO "strange request: removal(2) "
@@ -2262,19 +2294,19 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
*/
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path, ext4_lblk_t start,
- ext4_lblk_t end)
+ struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+ ext4_lblk_t start, ext4_lblk_t end)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int err = 0, correct_index = 0;
int depth = ext_depth(inode), credits;
struct ext4_extent_header *eh;
- ext4_lblk_t a, b, block;
+ ext4_lblk_t a, b;
unsigned num;
ext4_lblk_t ex_ee_block;
unsigned short ex_ee_len;
unsigned uninitialized = 0;
struct ext4_extent *ex;
- struct ext4_map_blocks map;
/* the header must be checked already in ext4_ext_remove_space() */
ext_debug("truncate since %u in leaf\n", start);
@@ -2291,6 +2323,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
+ trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
+
while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
@@ -2315,86 +2349,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
continue;
- } else if (a != ex_ee_block &&
- b != ex_ee_block + ex_ee_len - 1) {
- /*
- * If this is a truncate, then this condition should
- * never happen because at least one of the end points
- * needs to be on the edge of the extent.
- */
- if (end == EXT_MAX_BLOCKS - 1) {
- ext_debug(" bad truncate %u:%u\n",
- start, end);
- block = 0;
- num = 0;
- err = -EIO;
- goto out;
- }
- /*
- * else this is a hole punch, so the extent needs to
- * be split since neither edge of the hole is on the
- * extent edge
- */
- else{
- map.m_pblk = ext4_ext_pblock(ex);
- map.m_lblk = ex_ee_block;
- map.m_len = b - ex_ee_block;
-
- err = ext4_split_extent(handle,
- inode, path, &map, 0,
- EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
- EXT4_GET_BLOCKS_PRE_IO);
-
- if (err < 0)
- goto out;
-
- ex_ee_len = ext4_ext_get_actual_len(ex);
-
- b = ex_ee_block+ex_ee_len - 1 < end ?
- ex_ee_block+ex_ee_len - 1 : end;
-
- /* Then remove tail of this extent */
- block = ex_ee_block;
- num = a - block;
- }
+ } else if (b != ex_ee_block + ex_ee_len - 1) {
+ EXT4_ERROR_INODE(inode," bad truncate %u:%u\n",
+ start, end);
+ err = -EIO;
+ goto out;
} else if (a != ex_ee_block) {
/* remove tail of the extent */
- block = ex_ee_block;
- num = a - block;
- } else if (b != ex_ee_block + ex_ee_len - 1) {
- /* remove head of the extent */
- block = b;
- num = ex_ee_block + ex_ee_len - b;
-
- /*
- * If this is a truncate, this condition
- * should never happen
- */
- if (end == EXT_MAX_BLOCKS - 1) {
- ext_debug(" bad truncate %u:%u\n",
- start, end);
- err = -EIO;
- goto out;
- }
+ num = a - ex_ee_block;
} else {
/* remove whole extent: excellent! */
- block = ex_ee_block;
num = 0;
- if (a != ex_ee_block) {
- ext_debug(" bad truncate %u:%u\n",
- start, end);
- err = -EIO;
- goto out;
- }
-
- if (b != ex_ee_block + ex_ee_len - 1) {
- ext_debug(" bad truncate %u:%u\n",
- start, end);
- err = -EIO;
- goto out;
- }
}
-
/*
* 3 for leaf, sb, and inode plus 2 (bmap and group
* descriptor) for each block group; assume two block
@@ -2416,23 +2382,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (err)
goto out;
- err = ext4_remove_blocks(handle, inode, ex, a, b);
+ err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
+ a, b);
if (err)
goto out;
- if (num == 0) {
+ if (num == 0)
/* this extent is removed; mark slot entirely unused */
ext4_ext_store_pblock(ex, 0);
- } else if (block != ex_ee_block) {
- /*
- * If this was a head removal, then we need to update
- * the physical block since it is now at a different
- * location
- */
- ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
- }
- ex->ee_block = cpu_to_le32(block);
ex->ee_len = cpu_to_le16(num);
/*
* Do not mark uninitialized if all the blocks in the
@@ -2440,11 +2398,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
*/
if (uninitialized && num)
ext4_ext_mark_uninitialized(ex);
-
- err = ext4_ext_dirty(handle, inode, path + depth);
- if (err)
- goto out;
-
/*
* If the extent was completely released,
* we need to remove it from the leaf
@@ -2464,9 +2417,14 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
sizeof(struct ext4_extent));
}
le16_add_cpu(&eh->eh_entries, -1);
- }
+ } else
+ *partial_cluster = 0;
- ext_debug("new extent: %u:%u:%llu\n", block, num,
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
ext4_ext_pblock(ex));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2476,6 +2434,25 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (correct_index && eh->eh_entries)
err = ext4_ext_correct_indexes(handle, inode, path);
+ /*
+ * If there is still a entry in the leaf node, check to see if
+ * it references the partial cluster. This is the only place
+ * where it could; if it doesn't, we can free the cluster.
+ */
+ if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+ (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
+ *partial_cluster)) {
+ int flags = EXT4_FREE_BLOCKS_FORGET;
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ flags |= EXT4_FREE_BLOCKS_METADATA;
+
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(sbi, *partial_cluster),
+ sbi->s_cluster_ratio, flags);
+ *partial_cluster = 0;
+ }
+
/* if this leaf is free, then we should
* remove it from index block above */
if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
@@ -2511,6 +2488,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
struct super_block *sb = inode->i_sb;
int depth = ext_depth(inode);
struct ext4_ext_path *path;
+ ext4_fsblk_t partial_cluster = 0;
handle_t *handle;
int i, err;
@@ -2524,6 +2502,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
again:
ext4_ext_invalidate_cache(inode);
+ trace_ext4_ext_remove_space(inode, start, depth);
+
/*
* We start scanning from right side, freeing all the blocks
* after i_size and walking into the tree depth-wise.
@@ -2546,7 +2526,8 @@ again:
if (i == depth) {
/* this is leaf block */
err = ext4_ext_rm_leaf(handle, inode, path,
- start, EXT_MAX_BLOCKS - 1);
+ &partial_cluster, start,
+ EXT_MAX_BLOCKS - 1);
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
path[i].p_bh = NULL;
@@ -2618,6 +2599,24 @@ again:
}
}
+ trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
+ path->p_hdr->eh_entries);
+
+ /* If we still have something in the partial cluster and we have removed
+ * even the first extent, then we should free the blocks in the partial
+ * cluster as well. */
+ if (partial_cluster && path->p_hdr->eh_entries == 0) {
+ int flags = EXT4_FREE_BLOCKS_FORGET;
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ flags |= EXT4_FREE_BLOCKS_METADATA;
+
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(EXT4_SB(sb), partial_cluster),
+ EXT4_SB(sb)->s_cluster_ratio, flags);
+ partial_cluster = 0;
+ }
+
/* TODO: flexible tree reduction should be here */
if (path->p_hdr->eh_entries == 0) {
/*
@@ -2909,17 +2908,29 @@ out:
* a> There is no split required: Entire extent should be initialized
* b> Splits in two extents: Write is happening at either end of the extent
* c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * Pre-conditions:
+ * - The extent pointed to by 'path' is uninitialized.
+ * - The extent pointed to by 'path' contains a superset
+ * of the logical span [map->m_lblk, map->m_lblk + map->m_len).
+ *
+ * Post-conditions on success:
+ * - the returned value is the number of blocks beyond map->l_lblk
+ * that are allocated and initialized.
+ * It is guaranteed to be >= map->m_len.
*/
static int ext4_ext_convert_to_initialized(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path)
{
+ struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
struct ext4_extent zero_ex;
struct ext4_extent *ex;
ext4_lblk_t ee_block, eof_block;
- unsigned int allocated, ee_len, depth;
+ unsigned int ee_len, depth;
+ int allocated;
int err = 0;
int split_flag = 0;
@@ -2933,11 +2944,92 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
eof_block = map->m_lblk + map->m_len;
depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (map->m_lblk - ee_block);
+ trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
+
+ /* Pre-conditions */
+ BUG_ON(!ext4_ext_is_uninitialized(ex));
+ BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
+
+ /*
+ * Attempt to transfer newly initialized blocks from the currently
+ * uninitialized extent to its left neighbor. This is much cheaper
+ * than an insertion followed by a merge as those involve costly
+ * memmove() calls. This is the common case in steady state for
+ * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
+ * writes.
+ *
+ * Limitations of the current logic:
+ * - L1: we only deal with writes at the start of the extent.
+ * The approach could be extended to writes at the end
+ * of the extent but this scenario was deemed less common.
+ * - L2: we do not deal with writes covering the whole extent.
+ * This would require removing the extent if the transfer
+ * is possible.
+ * - L3: we only attempt to merge with an extent stored in the
+ * same extent tree node.
+ */
+ if ((map->m_lblk == ee_block) && /*L1*/
+ (map->m_len < ee_len) && /*L2*/
+ (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/
+ struct ext4_extent *prev_ex;
+ ext4_lblk_t prev_lblk;
+ ext4_fsblk_t prev_pblk, ee_pblk;
+ unsigned int prev_len, write_len;
+
+ prev_ex = ex - 1;
+ prev_lblk = le32_to_cpu(prev_ex->ee_block);
+ prev_len = ext4_ext_get_actual_len(prev_ex);
+ prev_pblk = ext4_ext_pblock(prev_ex);
+ ee_pblk = ext4_ext_pblock(ex);
+ write_len = map->m_len;
+
+ /*
+ * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+ * upon those conditions:
+ * - C1: prev_ex is initialized,
+ * - C2: prev_ex is logically abutting ex,
+ * - C3: prev_ex is physically abutting ex,
+ * - C4: prev_ex can receive the additional blocks without
+ * overflowing the (initialized) length limit.
+ */
+ if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/
+ ((prev_lblk + prev_len) == ee_block) && /*C2*/
+ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/
+ (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ trace_ext4_ext_convert_to_initialized_fastpath(inode,
+ map, ex, prev_ex);
+
+ /* Shift the start of ex by 'write_len' blocks */
+ ex->ee_block = cpu_to_le32(ee_block + write_len);
+ ext4_ext_store_pblock(ex, ee_pblk + write_len);
+ ex->ee_len = cpu_to_le16(ee_len - write_len);
+ ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+ /* Extend prev_ex by 'write_len' blocks */
+ prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+
+ /* Mark the block containing both extents as dirty */
+ ext4_ext_dirty(handle, inode, path + depth);
+
+ /* Update path to point to the right extent */
+ path[depth].p_ext = prev_ex;
+
+ /* Result: number of initialized blocks past m_lblk */
+ allocated = write_len;
+ goto out;
+ }
+ }
+
WARN_ON(map->m_lblk < ee_block);
/*
* It is safe to convert extent to initialized via explicit
@@ -3165,6 +3257,192 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
return ext4_mark_inode_dirty(handle, inode);
}
+/**
+ * ext4_find_delalloc_range: find delayed allocated block in the given range.
+ *
+ * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
+ * whether there are any buffers marked for delayed allocation. It returns '1'
+ * on the first delalloc'ed buffer head found. If no buffer head in the given
+ * range is marked for delalloc, it returns 0.
+ * lblk_start should always be <= lblk_end.
+ * search_hint_reverse is to indicate that searching in reverse from lblk_end to
+ * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
+ * block sooner). This is useful when blocks are truncated sequentially from
+ * lblk_start towards lblk_end.
+ */
+static int ext4_find_delalloc_range(struct inode *inode,
+ ext4_lblk_t lblk_start,
+ ext4_lblk_t lblk_end,
+ int search_hint_reverse)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct buffer_head *head, *bh = NULL;
+ struct page *page;
+ ext4_lblk_t i, pg_lblk;
+ pgoff_t index;
+
+ /* reverse search wont work if fs block size is less than page size */
+ if (inode->i_blkbits < PAGE_CACHE_SHIFT)
+ search_hint_reverse = 0;
+
+ if (search_hint_reverse)
+ i = lblk_end;
+ else
+ i = lblk_start;
+
+ index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ while ((i >= lblk_start) && (i <= lblk_end)) {
+ page = find_get_page(mapping, index);
+ if (!page)
+ goto nextpage;
+
+ if (!page_has_buffers(page))
+ goto nextpage;
+
+ head = page_buffers(page);
+ if (!head)
+ goto nextpage;
+
+ bh = head;
+ pg_lblk = index << (PAGE_CACHE_SHIFT -
+ inode->i_blkbits);
+ do {
+ if (unlikely(pg_lblk < lblk_start)) {
+ /*
+ * This is possible when fs block size is less
+ * than page size and our cluster starts/ends in
+ * middle of the page. So we need to skip the
+ * initial few blocks till we reach the 'lblk'
+ */
+ pg_lblk++;
+ continue;
+ }
+
+ /* Check if the buffer is delayed allocated and that it
+ * is not yet mapped. (when da-buffers are mapped during
+ * their writeout, their da_mapped bit is set.)
+ */
+ if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
+ page_cache_release(page);
+ trace_ext4_find_delalloc_range(inode,
+ lblk_start, lblk_end,
+ search_hint_reverse,
+ 1, i);
+ return 1;
+ }
+ if (search_hint_reverse)
+ i--;
+ else
+ i++;
+ } while ((i >= lblk_start) && (i <= lblk_end) &&
+ ((bh = bh->b_this_page) != head));
+nextpage:
+ if (page)
+ page_cache_release(page);
+ /*
+ * Move to next page. 'i' will be the first lblk in the next
+ * page.
+ */
+ if (search_hint_reverse)
+ index--;
+ else
+ index++;
+ i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ }
+
+ trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
+ search_hint_reverse, 0, 0);
+ return 0;
+}
+
+int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
+ int search_hint_reverse)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t lblk_start, lblk_end;
+ lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
+ lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
+
+ return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
+ search_hint_reverse);
+}
+
+/**
+ * Determines how many complete clusters (out of those specified by the 'map')
+ * are under delalloc and were reserved quota for.
+ * This function is called when we are writing out the blocks that were
+ * originally written with their allocation delayed, but then the space was
+ * allocated using fallocate() before the delayed allocation could be resolved.
+ * The cases to look for are:
+ * ('=' indicated delayed allocated blocks
+ * '-' indicates non-delayed allocated blocks)
+ * (a) partial clusters towards beginning and/or end outside of allocated range
+ * are not delalloc'ed.
+ * Ex:
+ * |----c---=|====c====|====c====|===-c----|
+ * |++++++ allocated ++++++|
+ * ==> 4 complete clusters in above example
+ *
+ * (b) partial cluster (outside of allocated range) towards either end is
+ * marked for delayed allocation. In this case, we will exclude that
+ * cluster.
+ * Ex:
+ * |----====c========|========c========|
+ * |++++++ allocated ++++++|
+ * ==> 1 complete clusters in above example
+ *
+ * Ex:
+ * |================c================|
+ * |++++++ allocated ++++++|
+ * ==> 0 complete clusters in above example
+ *
+ * The ext4_da_update_reserve_space will be called only if we
+ * determine here that there were some "entire" clusters that span
+ * this 'allocated' range.
+ * In the non-bigalloc case, this function will just end up returning num_blks
+ * without ever calling ext4_find_delalloc_range.
+ */
+static unsigned int
+get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
+ unsigned int num_blks)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
+ ext4_lblk_t lblk_from, lblk_to, c_offset;
+ unsigned int allocated_clusters = 0;
+
+ alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
+ alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
+
+ /* max possible clusters for this allocation */
+ allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
+
+ trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
+
+ /* Check towards left side */
+ c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
+ if (c_offset) {
+ lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
+ lblk_to = lblk_from + c_offset - 1;
+
+ if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+ allocated_clusters--;
+ }
+
+ /* Now check towards right. */
+ c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
+ if (allocated_clusters && c_offset) {
+ lblk_from = lblk_start + num_blks;
+ lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
+
+ if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+ allocated_clusters--;
+ }
+
+ return allocated_clusters;
+}
+
static int
ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
@@ -3181,6 +3459,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
flags, allocated);
ext4_ext_show_leaf(inode, path);
+ trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
+ newblock);
+
/* get_block() before submit the IO, split the extent */
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
ret = ext4_split_unwritten_extents(handle, inode, map,
@@ -3190,10 +3471,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
* that this IO needs to conversion to written when IO is
* completed
*/
- if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
- io->flag = EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
- } else
+ if (io)
+ ext4_set_io_unwritten_flag(inode, io);
+ else
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
if (ext4_should_dioread_nolock(inode))
map->m_flags |= EXT4_MAP_UNINIT;
@@ -3234,14 +3514,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
/* buffered write, writepage time, convert*/
ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
- if (ret >= 0) {
+ if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
- err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
- map->m_len);
- if (err < 0)
- goto out2;
- }
-
out:
if (ret <= 0) {
err = ret;
@@ -3270,11 +3544,24 @@ out:
* But fallocate would have already updated quota and block
* count for this offset. So cancel these reservation
*/
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
- ext4_da_update_reserve_space(inode, allocated, 0);
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ unsigned int reserved_clusters;
+ reserved_clusters = get_reserved_cluster_alloc(inode,
+ map->m_lblk, map->m_len);
+ if (reserved_clusters)
+ ext4_da_update_reserve_space(inode,
+ reserved_clusters,
+ 0);
+ }
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
+ if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
+ err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+ map->m_len);
+ if (err < 0)
+ goto out2;
+ }
out1:
if (allocated > map->m_len)
allocated = map->m_len;
@@ -3290,6 +3577,111 @@ out2:
}
/*
+ * get_implied_cluster_alloc - check to see if the requested
+ * allocation (in the map structure) overlaps with a cluster already
+ * allocated in an extent.
+ * @sb The filesystem superblock structure
+ * @map The requested lblk->pblk mapping
+ * @ex The extent structure which might contain an implied
+ * cluster allocation
+ *
+ * This function is called by ext4_ext_map_blocks() after we failed to
+ * find blocks that were already in the inode's extent tree. Hence,
+ * we know that the beginning of the requested region cannot overlap
+ * the extent from the inode's extent tree. There are three cases we
+ * want to catch. The first is this case:
+ *
+ * |--- cluster # N--|
+ * |--- extent ---| |---- requested region ---|
+ * |==========|
+ *
+ * The second case that we need to test for is this one:
+ *
+ * |--------- cluster # N ----------------|
+ * |--- requested region --| |------- extent ----|
+ * |=======================|
+ *
+ * The third case is when the requested region lies between two extents
+ * within the same cluster:
+ * |------------- cluster # N-------------|
+ * |----- ex -----| |---- ex_right ----|
+ * |------ requested region ------|
+ * |================|
+ *
+ * In each of the above cases, we need to set the map->m_pblk and
+ * map->m_len so it corresponds to the return the extent labelled as
+ * "|====|" from cluster #N, since it is already in use for data in
+ * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to
+ * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
+ * as a new "allocated" block region. Otherwise, we will return 0 and
+ * ext4_ext_map_blocks() will then allocate one or more new clusters
+ * by calling ext4_mb_new_blocks().
+ */
+static int get_implied_cluster_alloc(struct super_block *sb,
+ struct ext4_map_blocks *map,
+ struct ext4_extent *ex,
+ struct ext4_ext_path *path)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+ ext4_lblk_t ex_cluster_start, ex_cluster_end;
+ ext4_lblk_t rr_cluster_start, rr_cluster_end;
+ ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
+ ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
+ unsigned short ee_len = ext4_ext_get_actual_len(ex);
+
+ /* The extent passed in that we are trying to match */
+ ex_cluster_start = EXT4_B2C(sbi, ee_block);
+ ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
+
+ /* The requested region passed into ext4_map_blocks() */
+ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
+ rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
+
+ if ((rr_cluster_start == ex_cluster_end) ||
+ (rr_cluster_start == ex_cluster_start)) {
+ if (rr_cluster_start == ex_cluster_end)
+ ee_start += ee_len - 1;
+ map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
+ c_offset;
+ map->m_len = min(map->m_len,
+ (unsigned) sbi->s_cluster_ratio - c_offset);
+ /*
+ * Check for and handle this case:
+ *
+ * |--------- cluster # N-------------|
+ * |------- extent ----|
+ * |--- requested region ---|
+ * |===========|
+ */
+
+ if (map->m_lblk < ee_block)
+ map->m_len = min(map->m_len, ee_block - map->m_lblk);
+
+ /*
+ * Check for the case where there is already another allocated
+ * block to the right of 'ex' but before the end of the cluster.
+ *
+ * |------------- cluster # N-------------|
+ * |----- ex -----| |---- ex_right ----|
+ * |------ requested region ------|
+ * |================|
+ */
+ if (map->m_lblk > ee_block) {
+ ext4_lblk_t next = ext4_ext_next_allocated_block(path);
+ map->m_len = min(map->m_len, next - map->m_lblk);
+ }
+
+ trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
+ return 1;
+ }
+
+ trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
+ return 0;
+}
+
+
+/*
* Block allocation/map/preallocation routine for extents based files
*
*
@@ -3311,15 +3703,17 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
struct ext4_ext_path *path = NULL;
- struct ext4_extent newex, *ex;
+ struct ext4_extent newex, *ex, *ex2;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_fsblk_t newblock = 0;
- int err = 0, depth, ret;
- unsigned int allocated = 0;
+ int free_on_err = 0, err = 0, depth, ret;
+ unsigned int allocated = 0, offset = 0;
+ unsigned int allocated_clusters = 0;
unsigned int punched_out = 0;
unsigned int result = 0;
struct ext4_allocation_request ar;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
- struct ext4_map_blocks punch_map;
+ ext4_lblk_t cluster_offset;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
@@ -3329,6 +3723,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
if (!newex.ee_start_lo && !newex.ee_start_hi) {
+ if ((sbi->s_cluster_ratio > 1) &&
+ ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
/*
* block isn't allocated yet and
@@ -3339,6 +3737,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
/* we should allocate requested block */
} else {
/* block is already allocated */
+ if (sbi->s_cluster_ratio > 1)
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
newblock = map->m_lblk
- le32_to_cpu(newex.ee_block)
+ ext4_ext_pblock(&newex);
@@ -3384,8 +3784,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* we split out initialized portions during a write.
*/
ee_len = ext4_ext_get_actual_len(ex);
+
+ trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
+
/* if found extent covers block, simply return it */
if (in_range(map->m_lblk, ee_block, ee_len)) {
+ struct ext4_map_blocks punch_map;
+ ext4_fsblk_t partial_cluster = 0;
+
newblock = map->m_lblk - ee_block + ee_start;
/* number of remaining blocks in the extent */
allocated = ee_len - (map->m_lblk - ee_block);
@@ -3469,7 +3875,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext4_ext_invalidate_cache(inode);
err = ext4_ext_rm_leaf(handle, inode, path,
- map->m_lblk, map->m_lblk + punched_out);
+ &partial_cluster, map->m_lblk,
+ map->m_lblk + punched_out);
if (!err && path->p_hdr->eh_entries == 0) {
/*
@@ -3492,6 +3899,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
}
}
+ if ((sbi->s_cluster_ratio > 1) &&
+ ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+
/*
* requested block isn't allocated yet;
* we couldn't try to create block if create flag is zero
@@ -3504,9 +3915,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
goto out2;
}
+
/*
* Okay, we need to do block allocation.
*/
+ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+ newex.ee_block = cpu_to_le32(map->m_lblk);
+ cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+
+ /*
+ * If we are doing bigalloc, check to see if the extent returned
+ * by ext4_ext_find_extent() implies a cluster we can use.
+ */
+ if (cluster_offset && ex &&
+ get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
+ ar.len = allocated = map->m_len;
+ newblock = map->m_pblk;
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ goto got_allocated_blocks;
+ }
/* find neighbour allocated blocks */
ar.lleft = map->m_lblk;
@@ -3514,10 +3941,21 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (err)
goto out2;
ar.lright = map->m_lblk;
- err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
+ ex2 = NULL;
+ err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
if (err)
goto out2;
+ /* Check if the extent after searching to the right implies a
+ * cluster we can use. */
+ if ((sbi->s_cluster_ratio > 1) && ex2 &&
+ get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
+ ar.len = allocated = map->m_len;
+ newblock = map->m_pblk;
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ goto got_allocated_blocks;
+ }
+
/*
* See if request is beyond maximum number of blocks we can have in
* a single extent. For an initialized extent this limit is
@@ -3532,9 +3970,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
map->m_len = EXT_UNINIT_MAX_LEN;
/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
- newex.ee_block = cpu_to_le32(map->m_lblk);
newex.ee_len = cpu_to_le16(map->m_len);
- err = ext4_ext_check_overlap(inode, &newex, path);
+ err = ext4_ext_check_overlap(sbi, inode, &newex, path);
if (err)
allocated = ext4_ext_get_actual_len(&newex);
else
@@ -3544,7 +3981,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ar.inode = inode;
ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
ar.logical = map->m_lblk;
- ar.len = allocated;
+ /*
+ * We calculate the offset from the beginning of the cluster
+ * for the logical block number, since when we allocate a
+ * physical cluster, the physical block should start at the
+ * same offset from the beginning of the cluster. This is
+ * needed so that future calls to get_implied_cluster_alloc()
+ * work correctly.
+ */
+ offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
+ ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
+ ar.goal -= offset;
+ ar.logical -= offset;
if (S_ISREG(inode->i_mode))
ar.flags = EXT4_MB_HINT_DATA;
else
@@ -3557,9 +4005,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
goto out2;
ext_debug("allocate new block: goal %llu, found %llu/%u\n",
ar.goal, newblock, allocated);
+ free_on_err = 1;
+ allocated_clusters = ar.len;
+ ar.len = EXT4_C2B(sbi, ar.len) - offset;
+ if (ar.len > allocated)
+ ar.len = allocated;
+got_allocated_blocks:
/* try to insert new extent into found leaf and return */
- ext4_ext_store_pblock(&newex, newblock);
+ ext4_ext_store_pblock(&newex, newblock + offset);
newex.ee_len = cpu_to_le16(ar.len);
/* Mark uninitialized */
if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
@@ -3572,10 +4026,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* that we need to perform conversion when IO is done.
*/
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
- io->flag = EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
- } else
+ if (io)
+ ext4_set_io_unwritten_flag(inode, io);
+ else
ext4_set_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN);
}
@@ -3583,11 +4036,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
map->m_flags |= EXT4_MAP_UNINIT;
}
- err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
+ err = 0;
+ if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
+ err = check_eofblocks_fl(handle, inode, map->m_lblk,
+ path, ar.len);
if (!err)
err = ext4_ext_insert_extent(handle, inode, path,
&newex, flags);
- if (err) {
+ if (err && free_on_err) {
int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
/* free data blocks we just allocated */
@@ -3610,8 +4066,82 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* Update reserved blocks/metadata blocks after successful
* block allocation which had been deferred till now.
*/
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
- ext4_da_update_reserve_space(inode, allocated, 1);
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ unsigned int reserved_clusters;
+ /*
+ * Check how many clusters we had reserved this allocated range
+ */
+ reserved_clusters = get_reserved_cluster_alloc(inode,
+ map->m_lblk, allocated);
+ if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
+ if (reserved_clusters) {
+ /*
+ * We have clusters reserved for this range.
+ * But since we are not doing actual allocation
+ * and are simply using blocks from previously
+ * allocated cluster, we should release the
+ * reservation and not claim quota.
+ */
+ ext4_da_update_reserve_space(inode,
+ reserved_clusters, 0);
+ }
+ } else {
+ BUG_ON(allocated_clusters < reserved_clusters);
+ /* We will claim quota for all newly allocated blocks.*/
+ ext4_da_update_reserve_space(inode, allocated_clusters,
+ 1);
+ if (reserved_clusters < allocated_clusters) {
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int reservation = allocated_clusters -
+ reserved_clusters;
+ /*
+ * It seems we claimed few clusters outside of
+ * the range of this allocation. We should give
+ * it back to the reservation pool. This can
+ * happen in the following case:
+ *
+ * * Suppose s_cluster_ratio is 4 (i.e., each
+ * cluster has 4 blocks. Thus, the clusters
+ * are [0-3],[4-7],[8-11]...
+ * * First comes delayed allocation write for
+ * logical blocks 10 & 11. Since there were no
+ * previous delayed allocated blocks in the
+ * range [8-11], we would reserve 1 cluster
+ * for this write.
+ * * Next comes write for logical blocks 3 to 8.
+ * In this case, we will reserve 2 clusters
+ * (for [0-3] and [4-7]; and not for [8-11] as
+ * that range has a delayed allocated blocks.
+ * Thus total reserved clusters now becomes 3.
+ * * Now, during the delayed allocation writeout
+ * time, we will first write blocks [3-8] and
+ * allocate 3 clusters for writing these
+ * blocks. Also, we would claim all these
+ * three clusters above.
+ * * Now when we come here to writeout the
+ * blocks [10-11], we would expect to claim
+ * the reservation of 1 cluster we had made
+ * (and we would claim it since there are no
+ * more delayed allocated blocks in the range
+ * [8-11]. But our reserved cluster count had
+ * already gone to 0.
+ *
+ * Thus, at the step 4 above when we determine
+ * that there are still some unwritten delayed
+ * allocated blocks outside of our current
+ * block range, we should increment the
+ * reserved clusters count so that when the
+ * remaining blocks finally gets written, we
+ * could claim them.
+ */
+ dquot_reserve_block(inode,
+ EXT4_C2B(sbi, reservation));
+ spin_lock(&ei->i_block_reservation_lock);
+ ei->i_reserved_data_blocks += reservation;
+ spin_unlock(&ei->i_block_reservation_lock);
+ }
+ }
+ }
/*
* Cache the extent and update transaction to commit on fdatasync only
@@ -3634,12 +4164,12 @@ out2:
ext4_ext_drop_refs(path);
kfree(path);
}
- trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
- newblock, map->m_len, err ? err : allocated);
-
result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
punched_out : allocated;
+ trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
+ newblock, map->m_len, err ? err : result);
+
return err ? err : result;
}
@@ -3649,6 +4179,7 @@ void ext4_ext_truncate(struct inode *inode)
struct super_block *sb = inode->i_sb;
ext4_lblk_t last_block;
handle_t *handle;
+ loff_t page_len;
int err = 0;
/*
@@ -3665,8 +4196,16 @@ void ext4_ext_truncate(struct inode *inode)
if (IS_ERR(handle))
return;
- if (inode->i_size & (sb->s_blocksize - 1))
- ext4_block_truncate_page(handle, mapping, inode->i_size);
+ if (inode->i_size % PAGE_CACHE_SIZE != 0) {
+ page_len = PAGE_CACHE_SIZE -
+ (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+ err = ext4_discard_partial_page_buffers(handle,
+ mapping, inode->i_size, page_len, 0);
+
+ if (err)
+ goto out_stop;
+ }
if (ext4_orphan_add(handle, inode))
goto out_stop;
@@ -3760,6 +4299,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
int ret = 0;
int ret2 = 0;
int retries = 0;
+ int flags;
struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits;
@@ -3796,6 +4336,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
return ret;
}
+ flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+ /*
+ * Don't normalize the request if it can fit in one extent so
+ * that it doesn't get unnecessarily split into multiple
+ * extents.
+ */
+ if (len <= EXT_UNINIT_MAX_LEN << blkbits)
+ flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
retry:
while (ret >= 0 && ret < max_blocks) {
map.m_lblk = map.m_lblk + ret;
@@ -3805,9 +4355,7 @@ retry:
ret = PTR_ERR(handle);
break;
}
- ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
- EXT4_GET_BLOCKS_NO_NORMALIZE);
+ ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);
@@ -4102,7 +4650,6 @@ found_delayed_extent:
return EXT_BREAK;
return EXT_CONTINUE;
}
-
/* fiemap flags we can handle specified here */
#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -4162,17 +4709,28 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
struct address_space *mapping = inode->i_mapping;
struct ext4_map_blocks map;
handle_t *handle;
- loff_t first_block_offset, last_block_offset, block_len;
- loff_t first_page, last_page, first_page_offset, last_page_offset;
+ loff_t first_page, last_page, page_len;
+ loff_t first_page_offset, last_page_offset;
int ret, credits, blocks_released, err = 0;
+ /* No need to punch hole beyond i_size */
+ if (offset >= inode->i_size)
+ return 0;
+
+ /*
+ * If the hole extends beyond i_size, set the hole
+ * to end after the page that contains i_size
+ */
+ if (offset + length > inode->i_size) {
+ length = inode->i_size +
+ PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+ offset;
+ }
+
first_block = (offset + sb->s_blocksize - 1) >>
EXT4_BLOCK_SIZE_BITS(sb);
last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
- first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
- last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
-
first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
last_page = (offset + length) >> PAGE_CACHE_SHIFT;
@@ -4185,11 +4743,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
*/
if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
err = filemap_write_and_wait_range(mapping,
- first_page_offset == 0 ? 0 : first_page_offset-1,
- last_page_offset);
+ offset, offset + length - 1);
- if (err)
- return err;
+ if (err)
+ return err;
}
/* Now release the pages */
@@ -4211,24 +4768,64 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
goto out;
/*
- * Now we need to zero out the un block aligned data.
- * If the file is smaller than a block, just
- * zero out the middle
+ * Now we need to zero out the non-page-aligned data in the
+ * pages at the start and tail of the hole, and unmap the buffer
+ * heads for the block aligned regions of the page that were
+ * completely zeroed.
*/
- if (first_block > last_block)
- ext4_block_zero_page_range(handle, mapping, offset, length);
- else {
- /* zero out the head of the hole before the first block */
- block_len = first_block_offset - offset;
- if (block_len > 0)
- ext4_block_zero_page_range(handle, mapping,
- offset, block_len);
-
- /* zero out the tail of the hole after the last block */
- block_len = offset + length - last_block_offset;
- if (block_len > 0) {
- ext4_block_zero_page_range(handle, mapping,
- last_block_offset, block_len);
+ if (first_page > last_page) {
+ /*
+ * If the file space being truncated is contained within a page
+ * just zero out and unmap the middle of that page
+ */
+ err = ext4_discard_partial_page_buffers(handle,
+ mapping, offset, length, 0);
+
+ if (err)
+ goto out;
+ } else {
+ /*
+ * zero out and unmap the partial page that contains
+ * the start of the hole
+ */
+ page_len = first_page_offset - offset;
+ if (page_len > 0) {
+ err = ext4_discard_partial_page_buffers(handle, mapping,
+ offset, page_len, 0);
+ if (err)
+ goto out;
+ }
+
+ /*
+ * zero out and unmap the partial page that contains
+ * the end of the hole
+ */
+ page_len = offset + length - last_page_offset;
+ if (page_len > 0) {
+ err = ext4_discard_partial_page_buffers(handle, mapping,
+ last_page_offset, page_len, 0);
+ if (err)
+ goto out;
+ }
+ }
+
+
+ /*
+ * If i_size is contained in the last page, we need to
+ * unmap and zero the partial page after i_size
+ */
+ if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
+ inode->i_size % PAGE_CACHE_SIZE != 0) {
+
+ page_len = PAGE_CACHE_SIZE -
+ (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+ if (page_len > 0) {
+ err = ext4_discard_partial_page_buffers(handle,
+ mapping, inode->i_size, page_len, 0);
+
+ if (err)
+ goto out;
}
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index e4095e988eb..cb70f1812a7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -181,8 +181,8 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
path.dentry = mnt->mnt_root;
cp = d_path(&path, buf, sizeof(buf));
if (!IS_ERR(cp)) {
- memcpy(sbi->s_es->s_last_mounted, cp,
- sizeof(sbi->s_es->s_last_mounted));
+ strlcpy(sbi->s_es->s_last_mounted, cp,
+ sizeof(sbi->s_es->s_last_mounted));
ext4_mark_super_dirty(sb);
}
}
@@ -224,53 +224,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
else
maxbytes = inode->i_sb->s_maxbytes;
- mutex_lock(&inode->i_mutex);
- switch (origin) {
- case SEEK_END:
- offset += inode->i_size;
- break;
- case SEEK_CUR:
- if (offset == 0) {
- mutex_unlock(&inode->i_mutex);
- return file->f_pos;
- }
- offset += file->f_pos;
- break;
- case SEEK_DATA:
- /*
- * In the generic case the entire file is data, so as long as
- * offset isn't at the end of the file then the offset is data.
- */
- if (offset >= inode->i_size) {
- mutex_unlock(&inode->i_mutex);
- return -ENXIO;
- }
- break;
- case SEEK_HOLE:
- /*
- * There is a virtual hole at the end of the file, so as long as
- * offset isn't i_size or larger, return i_size.
- */
- if (offset >= inode->i_size) {
- mutex_unlock(&inode->i_mutex);
- return -ENXIO;
- }
- offset = inode->i_size;
- break;
- }
-
- if (offset < 0 || offset > maxbytes) {
- mutex_unlock(&inode->i_mutex);
- return -EINVAL;
- }
-
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
- mutex_unlock(&inode->i_mutex);
- return offset;
+ return generic_file_llseek_size(file, offset, origin, maxbytes);
}
const struct file_operations ext4_file_operations = {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 036f78f7a1e..00a2cb753ef 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode)
* to written.
* The function return the number of pending IOs on success.
*/
-extern int ext4_flush_completed_IO(struct inode *inode)
+int ext4_flush_completed_IO(struct inode *inode)
{
ext4_io_end_t *io;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -83,14 +83,12 @@ extern int ext4_flush_completed_IO(struct inode *inode)
int ret = 0;
int ret2 = 0;
- if (list_empty(&ei->i_completed_io_list))
- return ret;
-
dump_completed_IO(inode);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
while (!list_empty(&ei->i_completed_io_list)){
io = list_entry(ei->i_completed_io_list.next,
ext4_io_end_t, list);
+ list_del_init(&io->list);
/*
* Calling ext4_end_io_nolock() to convert completed
* IO to written.
@@ -107,11 +105,9 @@ extern int ext4_flush_completed_IO(struct inode *inode)
*/
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
ret = ext4_end_io_nolock(io);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
if (ret < 0)
ret2 = ret;
- else
- list_del_init(&io->list);
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
}
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
return (ret2 < 0) ? ret2 : 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9c63f273b55..00beb4f9cc4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -78,7 +78,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
* allocation, essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
- ext4_free_blks_set(sb, gdp, 0);
+ ext4_free_group_clusters_set(sb, gdp, 0);
ext4_free_inodes_set(sb, gdp, 0);
ext4_itable_unused_set(sb, gdp, 0);
memset(bh->b_data, 0xff, sb->s_blocksize);
@@ -293,121 +293,9 @@ error_return:
ext4_std_error(sb, fatal);
}
-/*
- * There are two policies for allocating an inode. If the new inode is
- * a directory, then a forward search is made for a block group with both
- * free space and a low directory-to-inode ratio; if that fails, then of
- * the groups with above-average free space, that group with the fewest
- * directories already is chosen.
- *
- * For other inodes, search forward from the parent directory\'s block
- * group to find a free inode.
- */
-static int find_group_dir(struct super_block *sb, struct inode *parent,
- ext4_group_t *best_group)
-{
- ext4_group_t ngroups = ext4_get_groups_count(sb);
- unsigned int freei, avefreei;
- struct ext4_group_desc *desc, *best_desc = NULL;
- ext4_group_t group;
- int ret = -1;
-
- freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
- avefreei = freei / ngroups;
-
- for (group = 0; group < ngroups; group++) {
- desc = ext4_get_group_desc(sb, group, NULL);
- if (!desc || !ext4_free_inodes_count(sb, desc))
- continue;
- if (ext4_free_inodes_count(sb, desc) < avefreei)
- continue;
- if (!best_desc ||
- (ext4_free_blks_count(sb, desc) >
- ext4_free_blks_count(sb, best_desc))) {
- *best_group = group;
- best_desc = desc;
- ret = 0;
- }
- }
- return ret;
-}
-
-#define free_block_ratio 10
-
-static int find_group_flex(struct super_block *sb, struct inode *parent,
- ext4_group_t *best_group)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_group_desc *desc;
- struct flex_groups *flex_group = sbi->s_flex_groups;
- ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
- ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
- ext4_group_t ngroups = ext4_get_groups_count(sb);
- int flex_size = ext4_flex_bg_size(sbi);
- ext4_group_t best_flex = parent_fbg_group;
- int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
- int flexbg_free_blocks;
- int flex_freeb_ratio;
- ext4_group_t n_fbg_groups;
- ext4_group_t i;
-
- n_fbg_groups = (ngroups + flex_size - 1) >>
- sbi->s_log_groups_per_flex;
-
-find_close_to_parent:
- flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
- flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
- if (atomic_read(&flex_group[best_flex].free_inodes) &&
- flex_freeb_ratio > free_block_ratio)
- goto found_flexbg;
-
- if (best_flex && best_flex == parent_fbg_group) {
- best_flex--;
- goto find_close_to_parent;
- }
-
- for (i = 0; i < n_fbg_groups; i++) {
- if (i == parent_fbg_group || i == parent_fbg_group - 1)
- continue;
-
- flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
- flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
-
- if (flex_freeb_ratio > free_block_ratio &&
- (atomic_read(&flex_group[i].free_inodes))) {
- best_flex = i;
- goto found_flexbg;
- }
-
- if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
- ((atomic_read(&flex_group[i].free_blocks) >
- atomic_read(&flex_group[best_flex].free_blocks)) &&
- atomic_read(&flex_group[i].free_inodes)))
- best_flex = i;
- }
-
- if (!atomic_read(&flex_group[best_flex].free_inodes) ||
- !atomic_read(&flex_group[best_flex].free_blocks))
- return -1;
-
-found_flexbg:
- for (i = best_flex * flex_size; i < ngroups &&
- i < (best_flex + 1) * flex_size; i++) {
- desc = ext4_get_group_desc(sb, i, NULL);
- if (ext4_free_inodes_count(sb, desc)) {
- *best_group = i;
- goto out;
- }
- }
-
- return -1;
-out:
- return 0;
-}
-
struct orlov_stats {
__u32 free_inodes;
- __u32 free_blocks;
+ __u32 free_clusters;
__u32 used_dirs;
};
@@ -424,7 +312,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
if (flex_size > 1) {
stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
- stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
+ stats->free_clusters = atomic_read(&flex_group[g].free_clusters);
stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
return;
}
@@ -432,11 +320,11 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
desc = ext4_get_group_desc(sb, g, NULL);
if (desc) {
stats->free_inodes = ext4_free_inodes_count(sb, desc);
- stats->free_blocks = ext4_free_blks_count(sb, desc);
+ stats->free_clusters = ext4_free_group_clusters(sb, desc);
stats->used_dirs = ext4_used_dirs_count(sb, desc);
} else {
stats->free_inodes = 0;
- stats->free_blocks = 0;
+ stats->free_clusters = 0;
stats->used_dirs = 0;
}
}
@@ -471,10 +359,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
ext4_group_t real_ngroups = ext4_get_groups_count(sb);
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
unsigned int freei, avefreei;
- ext4_fsblk_t freeb, avefreeb;
+ ext4_fsblk_t freeb, avefreec;
unsigned int ndirs;
int max_dirs, min_inodes;
- ext4_grpblk_t min_blocks;
+ ext4_grpblk_t min_clusters;
ext4_group_t i, grp, g, ngroups;
struct ext4_group_desc *desc;
struct orlov_stats stats;
@@ -490,9 +378,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
- freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- avefreeb = freeb;
- do_div(avefreeb, ngroups);
+ freeb = EXT4_C2B(sbi,
+ percpu_counter_read_positive(&sbi->s_freeclusters_counter));
+ avefreec = freeb;
+ do_div(avefreec, ngroups);
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
if (S_ISDIR(mode) &&
@@ -518,7 +407,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
continue;
if (stats.free_inodes < avefreei)
continue;
- if (stats.free_blocks < avefreeb)
+ if (stats.free_clusters < avefreec)
continue;
grp = g;
ret = 0;
@@ -556,7 +445,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
min_inodes = avefreei - inodes_per_group*flex_size / 4;
if (min_inodes < 1)
min_inodes = 1;
- min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
+ min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
/*
* Start looking in the flex group where we last allocated an
@@ -575,7 +464,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
continue;
if (stats.free_inodes < min_inodes)
continue;
- if (stats.free_blocks < min_blocks)
+ if (stats.free_clusters < min_clusters)
continue;
goto found_flex_bg;
}
@@ -659,7 +548,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
*group = parent_group;
desc = ext4_get_group_desc(sb, *group, NULL);
if (desc && ext4_free_inodes_count(sb, desc) &&
- ext4_free_blks_count(sb, desc))
+ ext4_free_group_clusters(sb, desc))
return 0;
/*
@@ -683,7 +572,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
*group -= ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
if (desc && ext4_free_inodes_count(sb, desc) &&
- ext4_free_blks_count(sb, desc))
+ ext4_free_group_clusters(sb, desc))
return 0;
}
@@ -802,7 +691,7 @@ err_ret:
* group to find a free inode.
*/
struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
- const struct qstr *qstr, __u32 goal)
+ const struct qstr *qstr, __u32 goal, uid_t *owner)
{
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
@@ -816,8 +705,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
int ret2, err = 0;
struct inode *ret;
ext4_group_t i;
- int free = 0;
- static int once = 1;
ext4_group_t flex_group;
/* Cannot create files in a deleted directory */
@@ -843,26 +730,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
goto got_group;
}
- if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
- ret2 = find_group_flex(sb, dir, &group);
- if (ret2 == -1) {
- ret2 = find_group_other(sb, dir, &group, mode);
- if (ret2 == 0 && once) {
- once = 0;
- printk(KERN_NOTICE "ext4: find_group_flex "
- "failed, fallback succeeded dir %lu\n",
- dir->i_ino);
- }
- }
- goto got_group;
- }
-
- if (S_ISDIR(mode)) {
- if (test_opt(sb, OLDALLOC))
- ret2 = find_group_dir(sb, dir, &group);
- else
- ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
- } else
+ if (S_ISDIR(mode))
+ ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
+ else
ret2 = find_group_other(sb, dir, &group, mode);
got_group:
@@ -950,26 +820,21 @@ got:
goto fail;
}
- free = 0;
- ext4_lock_group(sb, group);
+ BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+ err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
+ brelse(block_bitmap_bh);
+
/* recheck and clear flag under lock if we still need to */
+ ext4_lock_group(sb, group);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- free = ext4_free_blocks_after_init(sb, group, gdp);
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_blks_set(sb, gdp, free);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
gdp);
}
ext4_unlock_group(sb, group);
- /* Don't need to dirty bitmap block if we didn't change it */
- if (free) {
- BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
- err = ext4_handle_dirty_metadata(handle,
- NULL, block_bitmap_bh);
- }
-
- brelse(block_bitmap_bh);
if (err)
goto fail;
}
@@ -987,8 +852,11 @@ got:
flex_group = ext4_flex_group(sbi, group);
atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
}
-
- if (test_opt(sb, GRPID)) {
+ if (owner) {
+ inode->i_mode = mode;
+ inode->i_uid = owner[0];
+ inode->i_gid = owner[1];
+ } else if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = dir->i_gid;
@@ -1005,11 +873,7 @@ got:
ei->i_dir_start_lookup = 0;
ei->i_disksize = 0;
- /*
- * Don't inherit extent flag from directory, amongst others. We set
- * extent flag on newly created directory and file only if -o extent
- * mount option is specified
- */
+ /* Don't inherit extent flag from directory, amongst others. */
ei->i_flags =
ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
ei->i_file_acl = 0;
@@ -1084,7 +948,7 @@ fail_free_drop:
fail_drop:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
- inode->i_nlink = 0;
+ clear_nlink(inode);
unlock_new_inode(inode);
iput(inode);
brelse(inode_bitmap_bh);
@@ -1235,7 +1099,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
* inode allocation from the current group, so we take alloc_sem lock, to
* block ext4_claim_inode until we are finished.
*/
-extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
int barrier)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 0962642119c..3cfc73fbca8 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -699,6 +699,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
/*
* Okay, we need to do block allocation.
*/
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
+ "non-extent mapped inodes with bigalloc");
+ return -ENOSPC;
+ }
+
goal = ext4_find_goal(inode, map->m_lblk, partial);
/* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -1343,7 +1350,9 @@ void ext4_ind_truncate(struct inode *inode)
__le32 nr = 0;
int n = 0;
ext4_lblk_t last_block, max_block;
+ loff_t page_len;
unsigned blocksize = inode->i_sb->s_blocksize;
+ int err;
handle = start_transaction(inode);
if (IS_ERR(handle))
@@ -1354,9 +1363,16 @@ void ext4_ind_truncate(struct inode *inode)
max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
- if (inode->i_size & (blocksize - 1))
- if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+ if (inode->i_size % PAGE_CACHE_SIZE != 0) {
+ page_len = PAGE_CACHE_SIZE -
+ (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+ err = ext4_discard_partial_page_buffers(handle,
+ mapping, inode->i_size, page_len, 0);
+
+ if (err)
goto out_stop;
+ }
if (last_block != max_block) {
n = ext4_block_to_path(inode, last_block, offsets, NULL);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 986e2388f03..92655fd8965 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -42,7 +42,6 @@
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
-#include "ext4_extents.h"
#include "truncate.h"
#include <trace/events/ext4.h>
@@ -268,7 +267,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
struct ext4_inode_info *ei = EXT4_I(inode);
spin_lock(&ei->i_block_reservation_lock);
- trace_ext4_da_update_reserve_space(inode, used);
+ trace_ext4_da_update_reserve_space(inode, used, quota_claim);
if (unlikely(used > ei->i_reserved_data_blocks)) {
ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
"with only %d reserved data blocks\n",
@@ -281,7 +280,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
/* Update per-inode reservations */
ei->i_reserved_data_blocks -= used;
ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
used + ei->i_allocated_meta_blocks);
ei->i_allocated_meta_blocks = 0;
@@ -291,7 +290,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
* only when we have written all of the delayed
* allocation blocks.
*/
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
ei->i_reserved_meta_blocks);
ei->i_reserved_meta_blocks = 0;
ei->i_da_metadata_calc_len = 0;
@@ -300,14 +299,14 @@ void ext4_da_update_reserve_space(struct inode *inode,
/* Update quota subsystem for data blocks */
if (quota_claim)
- dquot_claim_block(inode, used);
+ dquot_claim_block(inode, EXT4_C2B(sbi, used));
else {
/*
* We did fallocate with an offset that is already delayed
* allocated. So on delayed allocated writeback we should
* not re-claim the quota for fallocated blocks.
*/
- dquot_release_reservation_block(inode, used);
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
}
/*
@@ -399,6 +398,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
}
/*
+ * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
+ */
+static void set_buffers_da_mapped(struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct pagevec pvec;
+ int i, nr_pages;
+ pgoff_t index, end;
+
+ index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ end = (map->m_lblk + map->m_len - 1) >>
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ pagevec_init(&pvec, 0);
+ while (index <= end) {
+ nr_pages = pagevec_lookup(&pvec, mapping, index,
+ min(end - index + 1,
+ (pgoff_t)PAGEVEC_SIZE));
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh, *head;
+
+ if (unlikely(page->mapping != mapping) ||
+ !PageDirty(page))
+ break;
+
+ if (page_has_buffers(page)) {
+ bh = head = page_buffers(page);
+ do {
+ set_buffer_da_mapped(bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
+ }
+ index++;
+ }
+ pagevec_release(&pvec);
+ }
+}
+
+/*
* The ext4_map_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
*
@@ -416,7 +458,7 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
* the buffer head is mapped.
*
* It returns 0 if plain look up failed (blocks have not been allocated), in
- * that casem, buffer head is unmapped
+ * that case, buffer head is unmapped
*
* It returns the error in case of allocation failure.
*/
@@ -435,9 +477,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
*/
down_read((&EXT4_I(inode)->i_data_sem));
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, 0);
+ retval = ext4_ext_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
} else {
- retval = ext4_ind_map_blocks(handle, inode, map, 0);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
}
up_read((&EXT4_I(inode)->i_data_sem));
@@ -455,7 +499,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* Returns if the blocks have already allocated
*
* Note that if blocks have been preallocated
- * ext4_ext_get_block() returns th create = 0
+ * ext4_ext_get_block() returns the create = 0
* with buffer head unmapped.
*/
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
@@ -517,9 +561,17 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
ext4_da_update_reserve_space(inode, retval, 1);
}
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
+ /* If we have successfully mapped the delayed allocated blocks,
+ * set the BH_Da_Mapped bit on them. Its important to do this
+ * under the protection of i_data_sem.
+ */
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
+ set_buffers_da_mapped(inode, map);
+ }
+
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
int ret = check_block_validity(inode, map);
@@ -909,7 +961,11 @@ static int ext4_ordered_write_end(struct file *file,
ext4_orphan_add(handle, inode);
if (ret2 < 0)
ret = ret2;
+ } else {
+ unlock_page(page);
+ page_cache_release(page);
}
+
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1037,14 +1093,14 @@ static int ext4_journalled_write_end(struct file *file,
}
/*
- * Reserve a single block located at lblock
+ * Reserve a single cluster located at lblock
*/
static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
{
int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long md_needed;
+ unsigned int md_needed;
int ret;
/*
@@ -1054,7 +1110,8 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
*/
repeat:
spin_lock(&ei->i_block_reservation_lock);
- md_needed = ext4_calc_metadata_amount(inode, lblock);
+ md_needed = EXT4_NUM_B2C(sbi,
+ ext4_calc_metadata_amount(inode, lblock));
trace_ext4_da_reserve_space(inode, md_needed);
spin_unlock(&ei->i_block_reservation_lock);
@@ -1063,15 +1120,15 @@ repeat:
* us from metadata over-estimation, though we may go over by
* a small amount in the end. Here we just reserve for data.
*/
- ret = dquot_reserve_block(inode, 1);
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
if (ret)
return ret;
/*
* We do still charge estimated metadata to the sb though;
* we cannot afford to run out of free blocks.
*/
- if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
- dquot_release_reservation_block(inode, 1);
+ if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
@@ -1118,19 +1175,21 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
* We can release all of the reserved metadata blocks
* only when we have written all of the delayed
* allocation blocks.
+ * Note that in case of bigalloc, i_reserved_meta_blocks,
+ * i_reserved_data_blocks, etc. refer to number of clusters.
*/
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
ei->i_reserved_meta_blocks);
ei->i_reserved_meta_blocks = 0;
ei->i_da_metadata_calc_len = 0;
}
/* update fs dirty data blocks counter */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- dquot_release_reservation_block(inode, to_free);
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
}
static void ext4_da_page_release_reservation(struct page *page,
@@ -1139,6 +1198,9 @@ static void ext4_da_page_release_reservation(struct page *page,
int to_release = 0;
struct buffer_head *head, *bh;
unsigned int curr_off = 0;
+ struct inode *inode = page->mapping->host;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int num_clusters;
head = page_buffers(page);
bh = head;
@@ -1148,10 +1210,24 @@ static void ext4_da_page_release_reservation(struct page *page,
if ((offset <= curr_off) && (buffer_delay(bh))) {
to_release++;
clear_buffer_delay(bh);
+ clear_buffer_da_mapped(bh);
}
curr_off = next_off;
} while ((bh = bh->b_this_page) != head);
- ext4_da_release_space(page->mapping->host, to_release);
+
+ /* If we have released all the blocks belonging to a cluster, then we
+ * need to release the reserved space for that cluster. */
+ num_clusters = EXT4_NUM_B2C(sbi, to_release);
+ while (num_clusters > 0) {
+ ext4_fsblk_t lblk;
+ lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
+ ((num_clusters - 1) << sbi->s_cluster_bits);
+ if (sbi->s_cluster_ratio == 1 ||
+ !ext4_find_delalloc_cluster(inode, lblk, 1))
+ ext4_da_release_space(inode, 1);
+
+ num_clusters--;
+ }
}
/*
@@ -1253,6 +1329,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
clear_buffer_delay(bh);
bh->b_blocknr = pblock;
}
+ if (buffer_da_mapped(bh))
+ clear_buffer_da_mapped(bh);
if (buffer_unwritten(bh) ||
buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
@@ -1261,8 +1339,11 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
clear_buffer_unwritten(bh);
}
- /* skip page if block allocation undone */
- if (buffer_delay(bh) || buffer_unwritten(bh))
+ /*
+ * skip page if block allocation undone and
+ * block is dirty
+ */
+ if (ext4_bh_delay_or_unwritten(NULL, bh))
skip_page = 1;
bh = bh->b_this_page;
block_start += bh->b_size;
@@ -1346,12 +1427,15 @@ static void ext4_print_free_blocks(struct inode *inode)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
printk(KERN_CRIT "Total free blocks count %lld\n",
- ext4_count_free_blocks(inode->i_sb));
+ EXT4_C2B(EXT4_SB(inode->i_sb),
+ ext4_count_free_clusters(inode->i_sb)));
printk(KERN_CRIT "Free/Dirty block details\n");
printk(KERN_CRIT "free_blocks=%lld\n",
- (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
+ (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
+ percpu_counter_sum(&sbi->s_freeclusters_counter)));
printk(KERN_CRIT "dirty_blocks=%lld\n",
- (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+ (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
printk(KERN_CRIT "Block reservation details\n");
printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
EXT4_I(inode)->i_reserved_data_blocks);
@@ -1430,8 +1514,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
if (err == -EAGAIN)
goto submit_io;
- if (err == -ENOSPC &&
- ext4_count_free_blocks(sb)) {
+ if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
mpd->retval = err;
goto submit_io;
}
@@ -1471,13 +1554,15 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
for (i = 0; i < map.m_len; i++)
unmap_underlying_metadata(bdev, map.m_pblk + i);
- }
- if (ext4_should_order_data(mpd->inode)) {
- err = ext4_jbd2_file_inode(handle, mpd->inode);
- if (err)
- /* This only happens if the journal is aborted */
- return;
+ if (ext4_should_order_data(mpd->inode)) {
+ err = ext4_jbd2_file_inode(handle, mpd->inode);
+ if (err) {
+ /* Only if the journal is aborted */
+ mpd->retval = err;
+ goto submit_io;
+ }
+ }
}
/*
@@ -1584,6 +1669,66 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
}
/*
+ * This function is grabs code from the very beginning of
+ * ext4_map_blocks, but assumes that the caller is from delayed write
+ * time. This function looks up the requested blocks and sets the
+ * buffer delay bit under the protection of i_data_sem.
+ */
+static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+ struct ext4_map_blocks *map,
+ struct buffer_head *bh)
+{
+ int retval;
+ sector_t invalid_block = ~((sector_t) 0xffff);
+
+ if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+ invalid_block = ~0;
+
+ map->m_flags = 0;
+ ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, map->m_len,
+ (unsigned long) map->m_lblk);
+ /*
+ * Try to see if we can get the block without requesting a new
+ * file system block.
+ */
+ down_read((&EXT4_I(inode)->i_data_sem));
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ retval = ext4_ext_map_blocks(NULL, inode, map, 0);
+ else
+ retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+
+ if (retval == 0) {
+ /*
+ * XXX: __block_prepare_write() unmaps passed block,
+ * is it OK?
+ */
+ /* If the block was allocated from previously allocated cluster,
+ * then we dont need to reserve it again. */
+ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
+ retval = ext4_da_reserve_space(inode, iblock);
+ if (retval)
+ /* not enough space to reserve */
+ goto out_unlock;
+ }
+
+ /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
+ * and it should not appear on the bh->b_state.
+ */
+ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ }
+
+out_unlock:
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+ return retval;
+}
+
+/*
* This is a special get_blocks_t callback which is used by
* ext4_da_write_begin(). It will either return mapped block or
* reserve space for a single block.
@@ -1600,10 +1745,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
{
struct ext4_map_blocks map;
int ret = 0;
- sector_t invalid_block = ~((sector_t) 0xffff);
-
- if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
- invalid_block = ~0;
BUG_ON(create == 0);
BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
@@ -1616,25 +1757,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
* preallocated blocks are unmapped but should treated
* the same as allocated blocks.
*/
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0)
+ ret = ext4_da_map_blocks(inode, iblock, &map, bh);
+ if (ret <= 0)
return ret;
- if (ret == 0) {
- if (buffer_delay(bh))
- return 0; /* Not sure this could or should happen */
- /*
- * XXX: __block_write_begin() unmaps passed block, is it OK?
- */
- ret = ext4_da_reserve_space(inode, iblock);
- if (ret)
- /* not enough space to reserve */
- return ret;
-
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
- return 0;
- }
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
@@ -1811,8 +1936,12 @@ static int ext4_writepage(struct page *page,
* We don't want to do block allocation, so redirty
* the page and return. We may reach here when we do
* a journal commit via journal_submit_inode_data_buffers.
- * We can also reach here via shrink_page_list
+ * We can also reach here via shrink_page_list but it
+ * should never be for direct reclaim so warn if that
+ * happens
*/
+ WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
+ PF_MEMALLOC);
goto redirty_page;
}
if (commit_write)
@@ -2046,6 +2175,7 @@ static int ext4_da_writepages(struct address_space *mapping,
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
pgoff_t done_index = 0;
pgoff_t end;
+ struct blk_plug plug;
trace_ext4_da_writepages(inode, wbc);
@@ -2124,6 +2254,7 @@ retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
+ blk_start_plug(&plug);
while (!ret && wbc->nr_to_write > 0) {
/*
@@ -2142,6 +2273,7 @@ retry:
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
"%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
+ blk_finish_plug(&plug);
goto out_writepages;
}
@@ -2174,11 +2306,12 @@ retry:
ret = 0;
} else if (ret == MPAGE_DA_EXTENT_TAIL) {
/*
- * got one extent now try with
- * rest of the pages
+ * Got one extent now try with rest of the pages.
+ * If mpd.retval is set -EIO, journal is aborted.
+ * So we don't need to write any more.
*/
pages_written += mpd.pages_written;
- ret = 0;
+ ret = mpd.retval;
io_done = 1;
} else if (wbc->nr_to_write)
/*
@@ -2188,6 +2321,7 @@ retry:
*/
break;
}
+ blk_finish_plug(&plug);
if (!io_done && !cycled) {
cycled = 1;
index = 0;
@@ -2226,10 +2360,11 @@ static int ext4_nonda_switch(struct super_block *sb)
* Delalloc need an accurate free block accounting. So switch
* to non delalloc when we are near to error range.
*/
- free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
+ free_blocks = EXT4_C2B(sbi,
+ percpu_counter_read_positive(&sbi->s_freeclusters_counter));
+ dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
if (2 * free_blocks < 3 * dirty_blocks ||
- free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
+ free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
/*
* free block count is less than 150% of dirty blocks
* or free blocks is less than watermark
@@ -2241,7 +2376,7 @@ static int ext4_nonda_switch(struct super_block *sb)
* start pushing delalloc when 1/2 of free blocks are dirty.
*/
if (free_blocks < 2 * dirty_blocks)
- writeback_inodes_sb_if_idle(sb);
+ writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
return 0;
}
@@ -2367,7 +2502,7 @@ static int ext4_da_write_end(struct file *file,
*/
new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize) {
+ if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
if (ext4_da_should_update_i_disksize(page, end)) {
down_write(&EXT4_I(inode)->i_data_sem);
if (new_i_size > EXT4_I(inode)->i_disksize) {
@@ -2630,10 +2765,11 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
iocb->private, io_end->inode->i_ino, iocb, offset,
size);
+ iocb->private = NULL;
+
/* if not aio dio with unwritten extents, just free io and return */
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
ext4_free_io_end(io_end);
- iocb->private = NULL;
out:
if (is_async)
aio_complete(iocb, ret, 0);
@@ -2657,7 +2793,6 @@ out:
/* queue the work to convert unwritten extents to written */
queue_work(wq, &io_end->work);
- iocb->private = NULL;
/* XXX: probably should move into the real I/O completion handler */
inode_dio_done(inode);
@@ -2685,10 +2820,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
* but being more careful is always safe for the future change.
*/
inode = io_end->inode;
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- io_end->flag |= EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
- }
+ ext4_set_io_unwritten_flag(inode, io_end);
/* Add the io_end to per-inode completed io list*/
spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -2854,6 +2986,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
struct inode *inode = file->f_mapping->host;
ssize_t ret;
+ /*
+ * If we are doing data journalling we don't support O_DIRECT
+ */
+ if (ext4_should_journal_data(inode))
+ return 0;
+
trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -2923,6 +3061,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
@@ -2959,6 +3098,209 @@ void ext4_set_aops(struct inode *inode)
inode->i_mapping->a_ops = &ext4_journalled_aops;
}
+
+/*
+ * ext4_discard_partial_page_buffers()
+ * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
+ * This function finds and locks the page containing the offset
+ * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
+ * Calling functions that already have the page locked should call
+ * ext4_discard_partial_page_buffers_no_lock directly.
+ */
+int ext4_discard_partial_page_buffers(handle_t *handle,
+ struct address_space *mapping, loff_t from,
+ loff_t length, int flags)
+{
+ struct inode *inode = mapping->host;
+ struct page *page;
+ int err = 0;
+
+ page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+ mapping_gfp_mask(mapping) & ~__GFP_FS);
+ if (!page)
+ return -ENOMEM;
+
+ err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
+ from, length, flags);
+
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+}
+
+/*
+ * ext4_discard_partial_page_buffers_no_lock()
+ * Zeros a page range of length 'length' starting from offset 'from'.
+ * Buffer heads that correspond to the block aligned regions of the
+ * zeroed range will be unmapped. Unblock aligned regions
+ * will have the corresponding buffer head mapped if needed so that
+ * that region of the page can be updated with the partial zero out.
+ *
+ * This function assumes that the page has already been locked. The
+ * The range to be discarded must be contained with in the given page.
+ * If the specified range exceeds the end of the page it will be shortened
+ * to the end of the page that corresponds to 'from'. This function is
+ * appropriate for updating a page and it buffer heads to be unmapped and
+ * zeroed for blocks that have been either released, or are going to be
+ * released.
+ *
+ * handle: The journal handle
+ * inode: The files inode
+ * page: A locked page that contains the offset "from"
+ * from: The starting byte offset (from the begining of the file)
+ * to begin discarding
+ * len: The length of bytes to discard
+ * flags: Optional flags that may be used:
+ *
+ * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
+ * Only zero the regions of the page whose buffer heads
+ * have already been unmapped. This flag is appropriate
+ * for updateing the contents of a page whose blocks may
+ * have already been released, and we only want to zero
+ * out the regions that correspond to those released blocks.
+ *
+ * Returns zero on sucess or negative on failure.
+ */
+int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+ struct inode *inode, struct page *page, loff_t from,
+ loff_t length, int flags)
+{
+ ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
+ unsigned int offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned int blocksize, max, pos;
+ ext4_lblk_t iblock;
+ struct buffer_head *bh;
+ int err = 0;
+
+ blocksize = inode->i_sb->s_blocksize;
+ max = PAGE_CACHE_SIZE - offset;
+
+ if (index != page->index)
+ return -EINVAL;
+
+ /*
+ * correct length if it does not fall between
+ * 'from' and the end of the page
+ */
+ if (length > max || length < 0)
+ length = max;
+
+ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, blocksize, 0);
+
+ /* Find the buffer that contains "offset" */
+ bh = page_buffers(page);
+ pos = blocksize;
+ while (offset >= pos) {
+ bh = bh->b_this_page;
+ iblock++;
+ pos += blocksize;
+ }
+
+ pos = offset;
+ while (pos < offset + length) {
+ unsigned int end_of_block, range_to_discard;
+
+ err = 0;
+
+ /* The length of space left to zero and unmap */
+ range_to_discard = offset + length - pos;
+
+ /* The length of space until the end of the block */
+ end_of_block = blocksize - (pos & (blocksize-1));
+
+ /*
+ * Do not unmap or zero past end of block
+ * for this buffer head
+ */
+ if (range_to_discard > end_of_block)
+ range_to_discard = end_of_block;
+
+
+ /*
+ * Skip this buffer head if we are only zeroing unampped
+ * regions of the page
+ */
+ if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
+ buffer_mapped(bh))
+ goto next;
+
+ /* If the range is block aligned, unmap */
+ if (range_to_discard == blocksize) {
+ clear_buffer_dirty(bh);
+ bh->b_bdev = NULL;
+ clear_buffer_mapped(bh);
+ clear_buffer_req(bh);
+ clear_buffer_new(bh);
+ clear_buffer_delay(bh);
+ clear_buffer_unwritten(bh);
+ clear_buffer_uptodate(bh);
+ zero_user(page, pos, range_to_discard);
+ BUFFER_TRACE(bh, "Buffer discarded");
+ goto next;
+ }
+
+ /*
+ * If this block is not completely contained in the range
+ * to be discarded, then it is not going to be released. Because
+ * we need to keep this block, we need to make sure this part
+ * of the page is uptodate before we modify it by writeing
+ * partial zeros on it.
+ */
+ if (!buffer_mapped(bh)) {
+ /*
+ * Buffer head must be mapped before we can read
+ * from the block
+ */
+ BUFFER_TRACE(bh, "unmapped");
+ ext4_get_block(inode, iblock, bh, 0);
+ /* unmapped? It's a hole - nothing to do */
+ if (!buffer_mapped(bh)) {
+ BUFFER_TRACE(bh, "still unmapped");
+ goto next;
+ }
+ }
+
+ /* Ok, it's mapped. Make sure it's up-to-date */
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
+
+ if (!buffer_uptodate(bh)) {
+ err = -EIO;
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ /* Uhhuh. Read error. Complain and punt.*/
+ if (!buffer_uptodate(bh))
+ goto next;
+ }
+
+ if (ext4_should_journal_data(inode)) {
+ BUFFER_TRACE(bh, "get write access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ goto next;
+ }
+
+ zero_user(page, pos, range_to_discard);
+
+ err = 0;
+ if (ext4_should_journal_data(inode)) {
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ } else
+ mark_buffer_dirty(bh);
+
+ BUFFER_TRACE(bh, "Partial buffer zeroed");
+next:
+ bh = bh->b_this_page;
+ iblock++;
+ pos += range_to_discard;
+ }
+
+ return err;
+}
+
/*
* ext4_block_truncate_page() zeroes out a mapping from file offset `from'
* up to the end of the block which corresponds to `from'.
@@ -3001,7 +3343,7 @@ int ext4_block_zero_page_range(handle_t *handle,
page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
mapping_gfp_mask(mapping) & ~__GFP_FS);
if (!page)
- return -EINVAL;
+ return -ENOMEM;
blocksize = inode->i_sb->s_blocksize;
max = blocksize - (offset & (blocksize - 1));
@@ -3070,11 +3412,8 @@ int ext4_block_zero_page_range(handle_t *handle,
err = 0;
if (ext4_should_journal_data(inode)) {
err = ext4_handle_dirty_metadata(handle, inode, bh);
- } else {
- if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
- err = ext4_jbd2_file_inode(handle, inode);
+ } else
mark_buffer_dirty(bh);
- }
unlock:
unlock_page(page);
@@ -3115,6 +3454,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
return -ENOTSUPP;
}
+ if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
+ /* TODO: Add support for bigalloc file systems */
+ return -ENOTSUPP;
+ }
+
return ext4_ext_punch_hole(file, offset, length);
}
@@ -3414,7 +3758,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
- inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
ei->i_dir_start_lookup = 0;
@@ -4416,6 +4760,7 @@ retry_alloc:
PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
unlock_page(page);
ret = VM_FAULT_SIGBUS;
+ ext4_journal_stop(handle);
goto out;
}
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index f18bfe37aff..a56796814d6 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -21,6 +21,7 @@
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = filp->f_dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int flags;
@@ -173,33 +174,8 @@ setversion_out:
mnt_drop_write(filp->f_path.mnt);
return err;
}
-#ifdef CONFIG_JBD2_DEBUG
- case EXT4_IOC_WAIT_FOR_READONLY:
- /*
- * This is racy - by the time we're woken up and running,
- * the superblock could be released. And the module could
- * have been unloaded. So sue me.
- *
- * Returns 1 if it slept, else zero.
- */
- {
- struct super_block *sb = inode->i_sb;
- DECLARE_WAITQUEUE(wait, current);
- int ret = 0;
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
- if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
- schedule();
- ret = 1;
- }
- remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
- return ret;
- }
-#endif
case EXT4_IOC_GROUP_EXTEND: {
ext4_fsblk_t n_blocks_count;
- struct super_block *sb = inode->i_sb;
int err, err2=0;
err = ext4_resize_begin(sb);
@@ -209,6 +185,13 @@ setversion_out:
if (get_user(n_blocks_count, (__u32 __user *)arg))
return -EFAULT;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not supported with bigalloc");
+ return -EOPNOTSUPP;
+ }
+
err = mnt_want_write(filp->f_path.mnt);
if (err)
return err;
@@ -250,6 +233,13 @@ setversion_out:
goto mext_out;
}
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with bigalloc");
+ return -EOPNOTSUPP;
+ }
+
err = mnt_want_write(filp->f_path.mnt);
if (err)
goto mext_out;
@@ -270,7 +260,6 @@ mext_out:
case EXT4_IOC_GROUP_ADD: {
struct ext4_new_group_data input;
- struct super_block *sb = inode->i_sb;
int err, err2=0;
err = ext4_resize_begin(sb);
@@ -281,6 +270,13 @@ mext_out:
sizeof(input)))
return -EFAULT;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not supported with bigalloc");
+ return -EOPNOTSUPP;
+ }
+
err = mnt_want_write(filp->f_path.mnt);
if (err)
return err;
@@ -337,7 +333,6 @@ mext_out:
case FITRIM:
{
- struct super_block *sb = inode->i_sb;
struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
@@ -348,7 +343,14 @@ mext_out:
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
- if (copy_from_user(&range, (struct fstrim_range *)arg,
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "FITRIM not supported with bigalloc");
+ return -EOPNOTSUPP;
+ }
+
+ if (copy_from_user(&range, (struct fstrim_range __user *)arg,
sizeof(range)))
return -EFAULT;
@@ -358,7 +360,7 @@ mext_out:
if (ret < 0)
return ret;
- if (copy_to_user((struct fstrim_range *)arg, &range,
+ if (copy_to_user((struct fstrim_range __user *)arg, &range,
sizeof(range)))
return -EFAULT;
@@ -396,11 +398,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC32_SETVERSION_OLD:
cmd = EXT4_IOC_SETVERSION_OLD;
break;
-#ifdef CONFIG_JBD2_DEBUG
- case EXT4_IOC32_WAIT_FOR_READONLY:
- cmd = EXT4_IOC_WAIT_FOR_READONLY;
- break;
-#endif
case EXT4_IOC32_GETRSVSZ:
cmd = EXT4_IOC_GETRSVSZ;
break;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 17a5a57c415..e2d8be8f28b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -70,8 +70,8 @@
*
* pa_lstart -> the logical start block for this prealloc space
* pa_pstart -> the physical start block for this prealloc space
- * pa_len -> length for this prealloc space
- * pa_free -> free space available in this prealloc space
+ * pa_len -> length for this prealloc space (in clusters)
+ * pa_free -> free space available in this prealloc space (in clusters)
*
* The inode preallocation space is used looking at the _logical_ start
* block. If only the logical file block falls within the range of prealloc
@@ -126,7 +126,8 @@
* list. In case of inode preallocation we follow a list of heuristics
* based on file size. This can be found in ext4_mb_normalize_request. If
* we are doing a group prealloc we try to normalize the request to
- * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
+ * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
+ * dependent on the cluster size; for non-bigalloc file systems, it is
* 512 blocks. This can be tuned via
* /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
* terms of number of blocks. If we have mounted the file system with -O
@@ -459,7 +460,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
ext4_fsblk_t blocknr;
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
- blocknr += first + i;
+ blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
@@ -580,7 +581,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
continue;
}
- /* both bits in buddy2 must be 0 */
+ /* both bits in buddy2 must be 1 */
MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
@@ -653,7 +654,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
ext4_grpblk_t chunk;
unsigned short border;
- BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
+ BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
border = 2 << sb->s_blocksize_bits;
@@ -705,7 +706,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
- ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
ext4_grpblk_t i = 0;
ext4_grpblk_t first;
ext4_grpblk_t len;
@@ -734,7 +735,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
if (free != grp->bb_free) {
ext4_grp_locked_error(sb, group, 0, 0,
- "%u blocks in bitmap, %u in gd",
+ "%u clusters in bitmap, %u in gd",
free, grp->bb_free);
/*
* If we intent to continue, we consider group descritor
@@ -1339,7 +1340,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
ext4_fsblk_t blocknr;
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
- blocknr += block;
+ blocknr += EXT4_C2B(EXT4_SB(sb), block);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
@@ -1390,7 +1391,6 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
{
int next = block;
int max;
- int ord;
void *buddy;
assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
@@ -1432,9 +1432,8 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
break;
- ord = mb_find_order_for_block(e4b, next);
+ order = mb_find_order_for_block(e4b, next);
- order = ord;
block = next >> order;
ex->fe_len += 1 << order;
}
@@ -1624,8 +1623,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
struct ext4_free_extent *gex = &ac->ac_g_ex;
BUG_ON(ex->fe_len <= 0);
- BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
- BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+ BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+ BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
ac->ac_found++;
@@ -1823,15 +1822,15 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
while (free && ac->ac_status == AC_STATUS_CONTINUE) {
i = mb_find_next_zero_bit(bitmap,
- EXT4_BLOCKS_PER_GROUP(sb), i);
- if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
+ EXT4_CLUSTERS_PER_GROUP(sb), i);
+ if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
/*
* IF we have corrupt bitmap, we won't find any
* free blocks even though group info says we
* we have free blocks
*/
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
- "%d free blocks as per "
+ "%d free clusters as per "
"group info. But bitmap says 0",
free);
break;
@@ -1841,7 +1840,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
BUG_ON(ex.fe_len <= 0);
if (free < ex.fe_len) {
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
- "%d free blocks as per "
+ "%d free clusters as per "
"group info. But got %d blocks",
free, ex.fe_len);
/*
@@ -1887,7 +1886,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
do_div(a, sbi->s_stripe);
i = (a * sbi->s_stripe) - first_group_block;
- while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
+ while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
if (max >= sbi->s_stripe) {
@@ -2252,10 +2251,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
*/
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
meta_group_info[i]->bb_free =
- ext4_free_blocks_after_init(sb, group, desc);
+ ext4_free_clusters_after_init(sb, group, desc);
} else {
meta_group_info[i]->bb_free =
- ext4_free_blks_count(sb, desc);
+ ext4_free_group_clusters(sb, desc);
}
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
@@ -2473,7 +2472,20 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
+ /*
+ * The default group preallocation is 512, which for 4k block
+ * sizes translates to 2 megabytes. However for bigalloc file
+ * systems, this is probably too big (i.e, if the cluster size
+ * is 1 megabyte, then group preallocation size becomes half a
+ * gigabyte!). As a default, we will keep a two megabyte
+ * group pralloc size for cluster sizes up to 64k, and after
+ * that, we will force a minimum group preallocation size of
+ * 32 clusters. This translates to 8 megs when the cluster
+ * size is 256k, and 32 megs when the cluster size is 1 meg,
+ * which seems reasonable as a default.
+ */
+ sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
+ sbi->s_cluster_bits, 32);
/*
* If there is a s_stripe > 1, then we set the s_mb_group_prealloc
* to the lowest multiple of s_stripe which is bigger than
@@ -2490,7 +2502,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
if (sbi->s_locality_groups == NULL) {
ret = -ENOMEM;
- goto out;
+ goto out_free_groupinfo_slab;
}
for_each_possible_cpu(i) {
struct ext4_locality_group *lg;
@@ -2503,9 +2515,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
/* init file for buddy data */
ret = ext4_mb_init_backend(sb);
- if (ret != 0) {
- goto out;
- }
+ if (ret != 0)
+ goto out_free_locality_groups;
if (sbi->s_proc)
proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
@@ -2513,11 +2524,19 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
if (sbi->s_journal)
sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+
+ return 0;
+
+out_free_locality_groups:
+ free_percpu(sbi->s_locality_groups);
+ sbi->s_locality_groups = NULL;
+out_free_groupinfo_slab:
+ ext4_groupinfo_destroy_slabs();
out:
- if (ret) {
- kfree(sbi->s_mb_offsets);
- kfree(sbi->s_mb_maxs);
- }
+ kfree(sbi->s_mb_offsets);
+ sbi->s_mb_offsets = NULL;
+ kfree(sbi->s_mb_maxs);
+ sbi->s_mb_maxs = NULL;
return ret;
}
@@ -2602,11 +2621,13 @@ int ext4_mb_release(struct super_block *sb)
}
static inline int ext4_issue_discard(struct super_block *sb,
- ext4_group_t block_group, ext4_grpblk_t block, int count)
+ ext4_group_t block_group, ext4_grpblk_t cluster, int count)
{
ext4_fsblk_t discard_block;
- discard_block = block + ext4_group_first_block_no(sb, block_group);
+ discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
+ ext4_group_first_block_no(sb, block_group));
+ count = EXT4_C2B(EXT4_SB(sb), count);
trace_ext4_discard_blocks(sb,
(unsigned long long) discard_block, count);
return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
@@ -2633,7 +2654,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
if (test_opt(sb, DISCARD))
ext4_issue_discard(sb, entry->group,
- entry->start_blk, entry->count);
+ entry->start_cluster, entry->count);
err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
@@ -2646,7 +2667,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
ext4_lock_group(sb, entry->group);
/* Take it out of per group rb tree */
rb_erase(&entry->node, &(db->bb_free_root));
- mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+ mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count);
/*
* Clear the trimmed flag for the group so that the next
@@ -2752,7 +2773,7 @@ void ext4_exit_mballoc(void)
*/
static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
- handle_t *handle, unsigned int reserv_blks)
+ handle_t *handle, unsigned int reserv_clstrs)
{
struct buffer_head *bitmap_bh = NULL;
struct ext4_group_desc *gdp;
@@ -2783,7 +2804,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
goto out_err;
ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
- ext4_free_blks_count(sb, gdp));
+ ext4_free_group_clusters(sb, gdp));
err = ext4_journal_get_write_access(handle, gdp_bh);
if (err)
@@ -2791,7 +2812,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
- len = ac->ac_b_ex.fe_len;
+ len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (!ext4_data_block_valid(sbi, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
"fs metadata\n", block, block+len);
@@ -2823,28 +2844,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
ac->ac_b_ex.fe_len);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_blks_set(sb, gdp,
- ext4_free_blocks_after_init(sb,
- ac->ac_b_ex.fe_group, gdp));
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb,
+ ac->ac_b_ex.fe_group, gdp));
}
- len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
- ext4_free_blks_set(sb, gdp, len);
+ len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
+ ext4_free_group_clusters_set(sb, gdp, len);
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
- percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+ percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
/*
* Now reduce the dirty block count also. Should not go negative
*/
if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ reserv_clstrs);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
ac->ac_b_ex.fe_group);
atomic_sub(ac->ac_b_ex.fe_len,
- &sbi->s_flex_groups[flex_group].free_blocks);
+ &sbi->s_flex_groups[flex_group].free_clusters);
}
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -2886,6 +2908,7 @@ static noinline_for_stack void
ext4_mb_normalize_request(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int bsbits, max;
ext4_lblk_t end;
loff_t size, orig_size, start_off;
@@ -2916,7 +2939,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* first, let's learn actual file size
* given current request is allocated */
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
size = size << bsbits;
if (size < i_size_read(ac->ac_inode))
size = i_size_read(ac->ac_inode);
@@ -2988,7 +3011,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
continue;
}
- pa_end = pa->pa_lstart + pa->pa_len;
+ pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+ pa->pa_len);
/* PA must not overlap original request */
BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
@@ -3018,9 +3042,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
rcu_read_lock();
list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
ext4_lblk_t pa_end;
+
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0) {
- pa_end = pa->pa_lstart + pa->pa_len;
+ pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+ pa->pa_len);
BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
}
spin_unlock(&pa->pa_lock);
@@ -3036,14 +3062,14 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
}
BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
start > ac->ac_o_ex.fe_logical);
- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+ BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
/* now prepare goal request */
/* XXX: is it better to align blocks WRT to logical
* placement or satisfy big request as is */
ac->ac_g_ex.fe_logical = start;
- ac->ac_g_ex.fe_len = size;
+ ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
/* define goal start in order to merge */
if (ar->pright && (ar->lright == (start + size))) {
@@ -3112,14 +3138,16 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
struct ext4_prealloc_space *pa)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
ext4_fsblk_t start;
ext4_fsblk_t end;
int len;
/* found preallocated blocks, use them */
start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
- end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
- len = end - start;
+ end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+ start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
+ len = EXT4_NUM_B2C(sbi, end - start);
ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
&ac->ac_b_ex.fe_start);
ac->ac_b_ex.fe_len = len;
@@ -3127,7 +3155,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
ac->ac_pa = pa;
BUG_ON(start < pa->pa_pstart);
- BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
+ BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
BUG_ON(pa->pa_free < len);
pa->pa_free -= len;
@@ -3193,6 +3221,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
static noinline_for_stack int
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
@@ -3210,12 +3239,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/* all fields in this condition don't change,
* so we can skip locking for them */
if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
- ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
+ ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
+ EXT4_C2B(sbi, pa->pa_len)))
continue;
/* non-extent files can't have physical blocks past 2^32 */
if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
- pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+ (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
+ EXT4_MAX_BLOCK_FILE_PHYS))
continue;
/* found preallocated blocks, use them */
@@ -3291,7 +3322,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
while (n) {
entry = rb_entry(n, struct ext4_free_data, node);
- ext4_set_bits(bitmap, entry->start_blk, entry->count);
+ ext4_set_bits(bitmap, entry->start_cluster, entry->count);
n = rb_next(n);
}
return;
@@ -3312,7 +3343,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t groupnr;
ext4_grpblk_t start;
int preallocated = 0;
- int count = 0;
int len;
/* all form of preallocation discards first load group,
@@ -3335,7 +3365,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
BUG_ON(groupnr != group);
ext4_set_bits(bitmap, start, len);
preallocated += len;
- count++;
}
mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
}
@@ -3412,6 +3441,7 @@ static noinline_for_stack int
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_prealloc_space *pa;
struct ext4_group_info *grp;
struct ext4_inode_info *ei;
@@ -3443,16 +3473,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
/* also, we should cover whole original request */
- wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
+ wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
/* the smallest one defines real window */
win = min(winl, wins);
- offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
+ offs = ac->ac_o_ex.fe_logical %
+ EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (offs && offs < win)
win = offs;
- ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
+ ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
+ EXT4_B2C(sbi, win);
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
}
@@ -3477,7 +3509,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
trace_ext4_mb_new_inode_pa(ac, pa);
ext4_mb_use_inode_pa(ac, pa);
- atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+ atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
ei = EXT4_I(ac->ac_inode);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
@@ -3592,7 +3624,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- grp_blk_start = pa->pa_pstart - bit;
+ grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
end = bit + pa->pa_len;
@@ -3607,7 +3639,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
free += next - bit;
trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
- trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit,
+ trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
+ EXT4_C2B(sbi, bit)),
next - bit);
mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
bit = next + 1;
@@ -3690,7 +3723,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
}
if (needed == 0)
- needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
INIT_LIST_HEAD(&list);
repeat:
@@ -3958,7 +3991,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
return;
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
@@ -3969,6 +4002,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
return;
}
+ if (sbi->s_mb_group_prealloc <= 0) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ return;
+ }
+
/* don't use group allocation for large files */
size = max(size, isize);
if (size > sbi->s_mb_stream_request) {
@@ -4007,8 +4045,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
len = ar->len;
/* just a dirty hack to filter too big requests */
- if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
- len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
+ if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10)
+ len = EXT4_CLUSTERS_PER_GROUP(sb) - 10;
/* start searching from the goal */
goal = ar->goal;
@@ -4019,18 +4057,15 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
/* set up allocation goals */
memset(ac, 0, sizeof(struct ext4_allocation_context));
- ac->ac_b_ex.fe_logical = ar->logical;
+ ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_sb = sb;
ac->ac_inode = ar->inode;
- ac->ac_o_ex.fe_logical = ar->logical;
+ ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
ac->ac_o_ex.fe_group = group;
ac->ac_o_ex.fe_start = block;
ac->ac_o_ex.fe_len = len;
- ac->ac_g_ex.fe_logical = ar->logical;
- ac->ac_g_ex.fe_group = group;
- ac->ac_g_ex.fe_start = block;
- ac->ac_g_ex.fe_len = len;
+ ac->ac_g_ex = ac->ac_o_ex;
ac->ac_flags = ar->flags;
/* we have to define context: we'll we work with a file or
@@ -4182,13 +4217,14 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
if (pa->pa_type == MB_GROUP_PA) {
/* see comment in ext4_mb_use_group_pa() */
spin_lock(&pa->pa_lock);
- pa->pa_pstart += ac->ac_b_ex.fe_len;
- pa->pa_lstart += ac->ac_b_ex.fe_len;
+ pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
@@ -4249,13 +4285,17 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
struct super_block *sb;
ext4_fsblk_t block = 0;
unsigned int inquota = 0;
- unsigned int reserv_blks = 0;
+ unsigned int reserv_clstrs = 0;
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
trace_ext4_request_blocks(ar);
+ /* Allow to use superuser reservation for quota file */
+ if (IS_NOQUOTA(ar->inode))
+ ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
+
/*
* For delayed allocation, we could skip the ENOSPC and
* EDQUOT check, as blocks and quotas have been already
@@ -4269,7 +4309,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
* and verify allocation doesn't exceed the quota limits.
*/
while (ar->len &&
- ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
+ ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
/* let others to free the space */
yield();
@@ -4279,12 +4319,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
*errp = -ENOSPC;
return 0;
}
- reserv_blks = ar->len;
+ reserv_clstrs = ar->len;
if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
- dquot_alloc_block_nofail(ar->inode, ar->len);
+ dquot_alloc_block_nofail(ar->inode,
+ EXT4_C2B(sbi, ar->len));
} else {
while (ar->len &&
- dquot_alloc_block(ar->inode, ar->len)) {
+ dquot_alloc_block(ar->inode,
+ EXT4_C2B(sbi, ar->len))) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
ar->len--;
@@ -4328,7 +4370,7 @@ repeat:
ext4_mb_new_preallocation(ac);
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
- *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
+ *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
if (*errp == -EAGAIN) {
/*
* drop the reference that we took
@@ -4364,13 +4406,13 @@ out:
if (ac)
kmem_cache_free(ext4_ac_cachep, ac);
if (inquota && ar->len < inquota)
- dquot_free_block(ar->inode, inquota - ar->len);
+ dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
if (!ar->len) {
if (!ext4_test_inode_state(ar->inode,
EXT4_STATE_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
- reserv_blks);
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ reserv_clstrs);
}
trace_ext4_allocate_blocks(ar, (unsigned long long)block);
@@ -4388,7 +4430,7 @@ static int can_merge(struct ext4_free_data *entry1,
{
if ((entry1->t_tid == entry2->t_tid) &&
(entry1->group == entry2->group) &&
- ((entry1->start_blk + entry1->count) == entry2->start_blk))
+ ((entry1->start_cluster + entry1->count) == entry2->start_cluster))
return 1;
return 0;
}
@@ -4398,7 +4440,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct ext4_free_data *new_entry)
{
ext4_group_t group = e4b->bd_group;
- ext4_grpblk_t block;
+ ext4_grpblk_t cluster;
struct ext4_free_data *entry;
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
@@ -4411,7 +4453,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
BUG_ON(e4b->bd_buddy_page == NULL);
new_node = &new_entry->node;
- block = new_entry->start_blk;
+ cluster = new_entry->start_cluster;
if (!*n) {
/* first free block exent. We need to
@@ -4425,13 +4467,14 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
while (*n) {
parent = *n;
entry = rb_entry(parent, struct ext4_free_data, node);
- if (block < entry->start_blk)
+ if (cluster < entry->start_cluster)
n = &(*n)->rb_left;
- else if (block >= (entry->start_blk + entry->count))
+ else if (cluster >= (entry->start_cluster + entry->count))
n = &(*n)->rb_right;
else {
ext4_grp_locked_error(sb, group, 0,
- ext4_group_first_block_no(sb, group) + block,
+ ext4_group_first_block_no(sb, group) +
+ EXT4_C2B(sbi, cluster),
"Block already on to-be-freed list");
return 0;
}
@@ -4445,7 +4488,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
if (node) {
entry = rb_entry(node, struct ext4_free_data, node);
if (can_merge(entry, new_entry)) {
- new_entry->start_blk = entry->start_blk;
+ new_entry->start_cluster = entry->start_cluster;
new_entry->count += entry->count;
rb_erase(node, &(db->bb_free_root));
spin_lock(&sbi->s_md_lock);
@@ -4496,6 +4539,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
ext4_group_t block_group;
struct ext4_sb_info *sbi;
struct ext4_buddy e4b;
+ unsigned int count_clusters;
int err = 0;
int ret;
@@ -4544,6 +4588,38 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
if (!ext4_should_writeback_data(inode))
flags |= EXT4_FREE_BLOCKS_METADATA;
+ /*
+ * If the extent to be freed does not begin on a cluster
+ * boundary, we need to deal with partial clusters at the
+ * beginning and end of the extent. Normally we will free
+ * blocks at the beginning or the end unless we are explicitly
+ * requested to avoid doing so.
+ */
+ overflow = block & (sbi->s_cluster_ratio - 1);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ }
+ overflow = count & (sbi->s_cluster_ratio - 1);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
+ }
+
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4552,10 +4628,12 @@ do_more:
* Check to see if we are freeing blocks across a group
* boundary.
*/
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
+ if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+ overflow = EXT4_C2B(sbi, bit) + count -
+ EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
+ count_clusters = EXT4_B2C(sbi, count);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh) {
err = -EIO;
@@ -4570,9 +4648,9 @@ do_more:
if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
in_range(block, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group) ||
+ EXT4_SB(sb)->s_itb_per_group) ||
in_range(block + count - 1, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group)) {
+ EXT4_SB(sb)->s_itb_per_group)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
@@ -4597,11 +4675,11 @@ do_more:
#ifdef AGGRESSIVE_CHECK
{
int i;
- for (i = 0; i < count; i++)
+ for (i = 0; i < count_clusters; i++)
BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
}
#endif
- trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
+ trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
err = ext4_mb_load_buddy(sb, block_group, &e4b);
if (err)
@@ -4618,13 +4696,13 @@ do_more:
err = -ENOMEM;
goto error_return;
}
- new_entry->start_blk = bit;
+ new_entry->start_cluster = bit;
new_entry->group = block_group;
- new_entry->count = count;
+ new_entry->count = count_clusters;
new_entry->t_tid = handle->h_transaction->t_tid;
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count);
+ mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
ext4_mb_free_metadata(handle, &e4b, new_entry);
} else {
/* need to update group_info->bb_free and bitmap
@@ -4632,25 +4710,29 @@ do_more:
* them with group lock_held
*/
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count);
- mb_free_blocks(inode, &e4b, bit, count);
+ mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
+ mb_free_blocks(inode, &e4b, bit, count_clusters);
}
- ret = ext4_free_blks_count(sb, gdp) + count;
- ext4_free_blks_set(sb, gdp, ret);
+ ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
+ ext4_free_group_clusters_set(sb, gdp, ret);
gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
ext4_unlock_group(sb, block_group);
- percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic_add(count_clusters,
+ &sbi->s_flex_groups[flex_group].free_clusters);
}
ext4_mb_unload_buddy(&e4b);
freed += count;
+ if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -4669,8 +4751,6 @@ do_more:
}
ext4_mark_super_dirty(sb);
error_return:
- if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
- dquot_free_block(inode, freed);
brelse(bitmap_bh);
ext4_std_error(sb, err);
return;
@@ -4778,16 +4858,17 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_lock_group(sb, block_group);
mb_clear_bits(bitmap_bh->b_data, bit, count);
mb_free_blocks(NULL, &e4b, bit, count);
- blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
- ext4_free_blks_set(sb, desc, blk_free_count);
+ blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
+ ext4_free_group_clusters_set(sb, desc, blk_free_count);
desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
ext4_unlock_group(sb, block_group);
- percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ EXT4_B2C(sbi, blocks_freed));
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic_add(blocks_freed,
- &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic_add(EXT4_B2C(sbi, blocks_freed),
+ &sbi->s_flex_groups[flex_group].free_clusters);
}
ext4_mb_unload_buddy(&e4b);
@@ -4948,7 +5029,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
struct ext4_group_info *grp;
ext4_group_t first_group, last_group;
ext4_group_t group, ngroups = ext4_get_groups_count(sb);
- ext4_grpblk_t cnt = 0, first_block, last_block;
+ ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
uint64_t start, len, minlen, trimmed = 0;
ext4_fsblk_t first_data_blk =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
@@ -4958,7 +5039,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
len = range->len >> sb->s_blocksize_bits;
minlen = range->minlen >> sb->s_blocksize_bits;
- if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+ if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)))
return -EINVAL;
if (start + len <= first_data_blk)
goto out;
@@ -4969,11 +5050,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
/* Determine first and last group to examine based on start and len */
ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
- &first_group, &first_block);
+ &first_group, &first_cluster);
ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
- &last_group, &last_block);
+ &last_group, &last_cluster);
last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
- last_block = EXT4_BLOCKS_PER_GROUP(sb);
+ last_cluster = EXT4_CLUSTERS_PER_GROUP(sb);
if (first_group > last_group)
return -EINVAL;
@@ -4993,20 +5074,20 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
* change it for the last group in which case start +
* len < EXT4_BLOCKS_PER_GROUP(sb).
*/
- if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
- last_block = first_block + len;
- len -= last_block - first_block;
+ if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb))
+ last_cluster = first_cluster + len;
+ len -= last_cluster - first_cluster;
if (grp->bb_free >= minlen) {
- cnt = ext4_trim_all_free(sb, group, first_block,
- last_block, minlen);
+ cnt = ext4_trim_all_free(sb, group, first_cluster,
+ last_cluster, minlen);
if (cnt < 0) {
ret = cnt;
break;
}
}
trimmed += cnt;
- first_block = 0;
+ first_cluster = 0;
}
range->len = trimmed * sb->s_blocksize;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 9d4a636b546..47705f3285e 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -106,7 +106,7 @@ struct ext4_free_data {
ext4_group_t group;
/* free block extent */
- ext4_grpblk_t start_blk;
+ ext4_grpblk_t start_cluster;
ext4_grpblk_t count;
/* transaction which freed this extent */
@@ -139,9 +139,9 @@ enum {
struct ext4_free_extent {
ext4_lblk_t fe_logical;
- ext4_grpblk_t fe_start;
+ ext4_grpblk_t fe_start; /* In cluster units */
ext4_group_t fe_group;
- ext4_grpblk_t fe_len;
+ ext4_grpblk_t fe_len; /* In cluster units */
};
/*
@@ -175,7 +175,7 @@ struct ext4_allocation_context {
/* the best found extent */
struct ext4_free_extent ac_b_ex;
- /* copy of the bext found extent taken before preallocation efforts */
+ /* copy of the best found extent taken before preallocation efforts */
struct ext4_free_extent ac_f_ex;
/* number of iterations done. we have to track to limit searching */
@@ -216,6 +216,7 @@ struct ext4_buddy {
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
struct ext4_free_extent *fex)
{
- return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start;
+ return ext4_group_first_block_no(sb, fex->fe_group) +
+ (fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
}
#endif
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b57b98fb44d..16ac228dbec 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -15,19 +15,18 @@
#include <linux/module.h>
#include <linux/slab.h>
#include "ext4_jbd2.h"
-#include "ext4_extents.h"
/*
* The contiguous blocks details which can be
* represented by a single extent
*/
-struct list_blocks_struct {
- ext4_lblk_t first_block, last_block;
+struct migrate_struct {
+ ext4_lblk_t first_block, last_block, curr_block;
ext4_fsblk_t first_pblock, last_pblock;
};
static int finish_range(handle_t *handle, struct inode *inode,
- struct list_blocks_struct *lb)
+ struct migrate_struct *lb)
{
int retval = 0, needed;
@@ -87,8 +86,7 @@ err_out:
}
static int update_extent_range(handle_t *handle, struct inode *inode,
- ext4_fsblk_t pblock, ext4_lblk_t blk_num,
- struct list_blocks_struct *lb)
+ ext4_fsblk_t pblock, struct migrate_struct *lb)
{
int retval;
/*
@@ -96,9 +94,10 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
*/
if (lb->first_pblock &&
(lb->last_pblock+1 == pblock) &&
- (lb->last_block+1 == blk_num)) {
+ (lb->last_block+1 == lb->curr_block)) {
lb->last_pblock = pblock;
- lb->last_block = blk_num;
+ lb->last_block = lb->curr_block;
+ lb->curr_block++;
return 0;
}
/*
@@ -106,64 +105,49 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
*/
retval = finish_range(handle, inode, lb);
lb->first_pblock = lb->last_pblock = pblock;
- lb->first_block = lb->last_block = blk_num;
-
+ lb->first_block = lb->last_block = lb->curr_block;
+ lb->curr_block++;
return retval;
}
static int update_ind_extent_range(handle_t *handle, struct inode *inode,
- ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
- struct list_blocks_struct *lb)
+ ext4_fsblk_t pblock,
+ struct migrate_struct *lb)
{
struct buffer_head *bh;
__le32 *i_data;
int i, retval = 0;
- ext4_lblk_t blk_count = *blk_nump;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- if (!pblock) {
- /* Only update the file block number */
- *blk_nump += max_entries;
- return 0;
- }
-
bh = sb_bread(inode->i_sb, pblock);
if (!bh)
return -EIO;
i_data = (__le32 *)bh->b_data;
- for (i = 0; i < max_entries; i++, blk_count++) {
+ for (i = 0; i < max_entries; i++) {
if (i_data[i]) {
retval = update_extent_range(handle, inode,
- le32_to_cpu(i_data[i]),
- blk_count, lb);
+ le32_to_cpu(i_data[i]), lb);
if (retval)
break;
+ } else {
+ lb->curr_block++;
}
}
-
- /* Update the file block number */
- *blk_nump = blk_count;
put_bh(bh);
return retval;
}
static int update_dind_extent_range(handle_t *handle, struct inode *inode,
- ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
- struct list_blocks_struct *lb)
+ ext4_fsblk_t pblock,
+ struct migrate_struct *lb)
{
struct buffer_head *bh;
__le32 *i_data;
int i, retval = 0;
- ext4_lblk_t blk_count = *blk_nump;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- if (!pblock) {
- /* Only update the file block number */
- *blk_nump += max_entries * max_entries;
- return 0;
- }
bh = sb_bread(inode->i_sb, pblock);
if (!bh)
return -EIO;
@@ -172,38 +156,28 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode,
for (i = 0; i < max_entries; i++) {
if (i_data[i]) {
retval = update_ind_extent_range(handle, inode,
- le32_to_cpu(i_data[i]),
- &blk_count, lb);
+ le32_to_cpu(i_data[i]), lb);
if (retval)
break;
} else {
/* Only update the file block number */
- blk_count += max_entries;
+ lb->curr_block += max_entries;
}
}
-
- /* Update the file block number */
- *blk_nump = blk_count;
put_bh(bh);
return retval;
}
static int update_tind_extent_range(handle_t *handle, struct inode *inode,
- ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
- struct list_blocks_struct *lb)
+ ext4_fsblk_t pblock,
+ struct migrate_struct *lb)
{
struct buffer_head *bh;
__le32 *i_data;
int i, retval = 0;
- ext4_lblk_t blk_count = *blk_nump;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- if (!pblock) {
- /* Only update the file block number */
- *blk_nump += max_entries * max_entries * max_entries;
- return 0;
- }
bh = sb_bread(inode->i_sb, pblock);
if (!bh)
return -EIO;
@@ -212,16 +186,14 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
for (i = 0; i < max_entries; i++) {
if (i_data[i]) {
retval = update_dind_extent_range(handle, inode,
- le32_to_cpu(i_data[i]),
- &blk_count, lb);
+ le32_to_cpu(i_data[i]), lb);
if (retval)
break;
- } else
+ } else {
/* Only update the file block number */
- blk_count += max_entries * max_entries;
+ lb->curr_block += max_entries * max_entries;
+ }
}
- /* Update the file block number */
- *blk_nump = blk_count;
put_bh(bh);
return retval;
@@ -462,12 +434,12 @@ int ext4_ext_migrate(struct inode *inode)
handle_t *handle;
int retval = 0, i;
__le32 *i_data;
- ext4_lblk_t blk_count = 0;
struct ext4_inode_info *ei;
struct inode *tmp_inode = NULL;
- struct list_blocks_struct lb;
+ struct migrate_struct lb;
unsigned long max_entries;
__u32 goal;
+ uid_t owner[2];
/*
* If the filesystem does not support extents, or the inode
@@ -495,10 +467,12 @@ int ext4_ext_migrate(struct inode *inode)
}
goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
+ owner[0] = inode->i_uid;
+ owner[1] = inode->i_gid;
tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
- S_IFREG, NULL, goal);
+ S_IFREG, NULL, goal, owner);
if (IS_ERR(tmp_inode)) {
- retval = -ENOMEM;
+ retval = PTR_ERR(inode);
ext4_journal_stop(handle);
return retval;
}
@@ -507,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode)
* Set the i_nlink to zero so it will be deleted later
* when we drop inode reference.
*/
- tmp_inode->i_nlink = 0;
+ clear_nlink(tmp_inode);
ext4_ext_tree_init(handle, tmp_inode);
ext4_orphan_add(handle, tmp_inode);
@@ -551,35 +525,32 @@ int ext4_ext_migrate(struct inode *inode)
/* 32 bit block address 4 bytes */
max_entries = inode->i_sb->s_blocksize >> 2;
- for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) {
+ for (i = 0; i < EXT4_NDIR_BLOCKS; i++) {
if (i_data[i]) {
retval = update_extent_range(handle, tmp_inode,
- le32_to_cpu(i_data[i]),
- blk_count, &lb);
+ le32_to_cpu(i_data[i]), &lb);
if (retval)
goto err_out;
- }
+ } else
+ lb.curr_block++;
}
if (i_data[EXT4_IND_BLOCK]) {
retval = update_ind_extent_range(handle, tmp_inode,
- le32_to_cpu(i_data[EXT4_IND_BLOCK]),
- &blk_count, &lb);
+ le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
if (retval)
goto err_out;
} else
- blk_count += max_entries;
+ lb.curr_block += max_entries;
if (i_data[EXT4_DIND_BLOCK]) {
retval = update_dind_extent_range(handle, tmp_inode,
- le32_to_cpu(i_data[EXT4_DIND_BLOCK]),
- &blk_count, &lb);
+ le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
if (retval)
goto err_out;
} else
- blk_count += max_entries * max_entries;
+ lb.curr_block += max_entries * max_entries;
if (i_data[EXT4_TIND_BLOCK]) {
retval = update_tind_extent_range(handle, tmp_inode,
- le32_to_cpu(i_data[EXT4_TIND_BLOCK]),
- &blk_count, &lb);
+ le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
if (retval)
goto err_out;
}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 9bdef3f537c..7ea4ba4eff2 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -109,7 +109,7 @@ static int kmmpd(void *data)
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
bdevname(bh->b_bdev, mmp->mmp_bdevname);
- memcpy(mmp->mmp_nodename, init_utsname()->sysname,
+ memcpy(mmp->mmp_nodename, init_utsname()->nodename,
sizeof(mmp->mmp_nodename));
while (!kthread_should_stop()) {
@@ -125,8 +125,9 @@ static int kmmpd(void *data)
* Don't spew too many error messages. Print one every
* (s_mmp_update_interval * 60) seconds.
*/
- if (retval && (failed_writes % 60) == 0) {
- ext4_error(sb, "Error writing to MMP block");
+ if (retval) {
+ if ((failed_writes % 60) == 0)
+ ext4_error(sb, "Error writing to MMP block");
failed_writes++;
}
@@ -295,7 +296,8 @@ skip:
/*
* write a new random sequence number.
*/
- mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
+ seq = mmp_new_seq();
+ mmp->mmp_seq = cpu_to_le32(seq);
retval = write_mmp_block(bh);
if (retval)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index f57455a1b1b..c5826c623e7 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -17,7 +17,6 @@
#include <linux/quotaops.h>
#include <linux/slab.h>
#include "ext4_jbd2.h"
-#include "ext4_extents.h"
#include "ext4.h"
/**
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1c924faeb6c..aa4c782c9dd 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1586,7 +1586,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
dxtrace(dx_show_index("node", frames[1].entries));
dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
- err = ext4_handle_dirty_metadata(handle, inode, bh2);
+ err = ext4_handle_dirty_metadata(handle, dir, bh2);
if (err)
goto journal_error;
brelse (bh2);
@@ -1612,7 +1612,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (err)
goto journal_error;
}
- err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+ err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
if (err) {
ext4_std_error(inode->i_sb, err);
goto cleanup;
@@ -1694,7 +1694,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
if (is_dx(inode) && inode->i_nlink > 1) {
/* limit is 16-bit i_links_count */
if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
}
@@ -1707,9 +1707,8 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
*/
static void ext4_dec_count(handle_t *handle, struct inode *inode)
{
- drop_nlink(inode);
- if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0)
- inc_nlink(inode);
+ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+ drop_nlink(inode);
}
@@ -1756,7 +1755,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
@@ -1792,7 +1791,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
@@ -1832,7 +1831,7 @@ retry:
ext4_handle_sync(handle);
inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
- &dentry->d_name, 0);
+ &dentry->d_name, 0, NULL);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -1861,9 +1860,9 @@ retry:
de->name_len = 2;
strcpy(de->name, "..");
ext4_set_de_type(dir->i_sb, de, S_IFDIR);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, dir, dir_block);
+ err = ext4_handle_dirty_metadata(handle, inode, dir_block);
if (err)
goto out_clear_inode;
err = ext4_mark_inode_dirty(handle, inode);
@@ -2214,7 +2213,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
ext4_warning(inode->i_sb,
"Deleting nonexistent file (%lu), %d",
inode->i_ino, inode->i_nlink);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
@@ -2279,7 +2278,7 @@ retry:
ext4_handle_sync(handle);
inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
- &dentry->d_name, 0);
+ &dentry->d_name, 0, NULL);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -2530,7 +2529,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
cpu_to_le32(new_dir->i_ino);
BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
- retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
+ retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
if (retval) {
ext4_std_error(old_dir->i_sb, retval);
goto end_rename;
@@ -2539,7 +2538,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_inode) {
/* checked empty_dir above, can't have another parent,
* ext4_dec_count() won't work for many-linked dirs */
- new_inode->i_nlink = 0;
+ clear_nlink(new_inode);
} else {
ext4_inc_count(handle, new_dir);
ext4_update_dx_flag(new_dir);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 92f38ee13f8..7e106c810c6 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -70,7 +70,6 @@ static void put_io_page(struct ext4_io_page *io_page)
void ext4_free_io_end(ext4_io_end_t *io)
{
int i;
- wait_queue_head_t *wq;
BUG_ON(!io);
if (io->page)
@@ -78,56 +77,43 @@ void ext4_free_io_end(ext4_io_end_t *io)
for (i = 0; i < io->num_io_pages; i++)
put_io_page(io->pages[i]);
io->num_io_pages = 0;
- wq = ext4_ioend_wq(io->inode);
- if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
- waitqueue_active(wq))
- wake_up_all(wq);
+ if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
+ wake_up_all(ext4_ioend_wq(io->inode));
kmem_cache_free(io_end_cachep, io);
}
/*
* check a range of space and convert unwritten extents to written.
+ *
+ * Called with inode->i_mutex; we depend on this when we manipulate
+ * io->flag, since we could otherwise race with ext4_flush_completed_IO()
*/
int ext4_end_io_nolock(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
ssize_t size = io->size;
- wait_queue_head_t *wq;
int ret = 0;
ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
- if (list_empty(&io->list))
- return ret;
-
- if (!(io->flag & EXT4_IO_END_UNWRITTEN))
- return ret;
-
ret = ext4_convert_unwritten_extents(inode, offset, size);
if (ret < 0) {
- printk(KERN_EMERG "%s: failed to convert unwritten "
- "extents to written extents, error is %d "
- "io is still on inode %lu aio dio list\n",
- __func__, ret, inode->i_ino);
- return ret;
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "failed to convert unwritten extents to written "
+ "extents -- potential data loss! "
+ "(inode %lu, offset %llu, size %zd, error %d)",
+ inode->i_ino, offset, size, ret);
}
if (io->iocb)
aio_complete(io->iocb, io->result, 0);
- /* clear the DIO AIO unwritten flag */
- if (io->flag & EXT4_IO_END_UNWRITTEN) {
- io->flag &= ~EXT4_IO_END_UNWRITTEN;
- /* Wake up anyone waiting on unwritten extent conversion */
- wq = ext4_ioend_wq(io->inode);
- if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
- waitqueue_active(wq)) {
- wake_up_all(wq);
- }
- }
+ /* Wake up anyone waiting on unwritten extent conversion */
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
+ wake_up_all(ext4_ioend_wq(io->inode));
return ret;
}
@@ -140,9 +126,15 @@ static void ext4_end_io_work(struct work_struct *work)
struct inode *inode = io->inode;
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned long flags;
- int ret;
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ if (list_empty(&io->list)) {
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+ goto free;
+ }
if (!mutex_trylock(&inode->i_mutex)) {
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
/*
* Requeue the work instead of waiting so that the work
* items queued after this can be processed.
@@ -159,17 +151,11 @@ static void ext4_end_io_work(struct work_struct *work)
io->flag |= EXT4_IO_END_QUEUED;
return;
}
- ret = ext4_end_io_nolock(io);
- if (ret < 0) {
- mutex_unlock(&inode->i_mutex);
- return;
- }
-
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (!list_empty(&io->list))
- list_del_init(&io->list);
+ list_del_init(&io->list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+ (void) ext4_end_io_nolock(io);
mutex_unlock(&inode->i_mutex);
+free:
ext4_free_io_end(io);
}
@@ -350,10 +336,8 @@ submit_and_retry:
if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
(io_end->pages[io_end->num_io_pages-1] != io_page))
goto submit_and_retry;
- if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- io_end->flag |= EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
- }
+ if (buffer_uninit(bh))
+ ext4_set_io_unwritten_flag(inode, io_end);
io->io_end->size += bh->b_size;
io->io_next_block++;
ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
@@ -401,6 +385,18 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
block_end = block_start + blocksize;
if (block_start >= len) {
+ /*
+ * Comments copied from block_write_full_page_endio:
+ *
+ * The page straddles i_size. It must be zeroed out on
+ * each and every writepage invocation because it may
+ * be mmapped. "A file is mapped in multiples of the
+ * page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when
+ * mapped, and writes to that region are not written
+ * out to the file."
+ */
+ zero_user_segment(page, block_start, block_end);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
continue;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 707d3f16f7c..996780ab4f4 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -875,7 +875,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
- ext4_free_blks_set(sb, gdp, input->free_blocks_count);
+ ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count);
ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
@@ -937,8 +937,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
input->reserved_blocks);
/* Update the free space counts */
- percpu_counter_add(&sbi->s_freeblocks_counter,
- input->free_blocks_count);
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ EXT4_B2C(sbi, input->free_blocks_count));
percpu_counter_add(&sbi->s_freeinodes_counter,
EXT4_INODES_PER_GROUP(sb));
@@ -946,8 +946,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
sbi->s_log_groups_per_flex) {
ext4_group_t flex_group;
flex_group = ext4_flex_group(sbi, input->group);
- atomic_add(input->free_blocks_count,
- &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic_add(EXT4_B2C(sbi, input->free_blocks_count),
+ &sbi->s_flex_groups[flex_group].free_clusters);
atomic_add(EXT4_INODES_PER_GROUP(sb),
&sbi->s_flex_groups[flex_group].free_inodes);
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 44d0c8db223..3e1329e2f82 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
#include <linux/freezer.h>
#include "ext4.h"
+#include "ext4_extents.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
@@ -163,8 +164,8 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
}
-__u32 ext4_free_blks_count(struct super_block *sb,
- struct ext4_group_desc *bg)
+__u32 ext4_free_group_clusters(struct super_block *sb,
+ struct ext4_group_desc *bg)
{
return le16_to_cpu(bg->bg_free_blocks_count_lo) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
@@ -219,8 +220,8 @@ void ext4_inode_table_set(struct super_block *sb,
bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
}
-void ext4_free_blks_set(struct super_block *sb,
- struct ext4_group_desc *bg, __u32 count)
+void ext4_free_group_clusters_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
{
bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
@@ -414,6 +415,22 @@ static void save_error_info(struct super_block *sb, const char *func,
ext4_commit_super(sb, 1);
}
+/*
+ * The del_gendisk() function uninitializes the disk-specific data
+ * structures, including the bdi structure, without telling anyone
+ * else. Once this happens, any attempt to call mark_buffer_dirty()
+ * (for example, by ext4_commit_super), will cause a kernel OOPS.
+ * This is a kludge to prevent these oops until we can put in a proper
+ * hook in del_gendisk() to inform the VFS and file system layers.
+ */
+static int block_device_ejected(struct super_block *sb)
+{
+ struct inode *bd_inode = sb->s_bdev->bd_inode;
+ struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
+
+ return bdi->dev == NULL;
+}
+
/* Deal with the reporting of failure conditions on a filesystem such as
* inconsistencies detected or read IO failures.
@@ -821,10 +838,10 @@ static void ext4_put_super(struct super_block *sb)
brelse(sbi->s_group_desc[i]);
ext4_kvfree(sbi->s_group_desc);
ext4_kvfree(sbi->s_flex_groups);
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
- percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+ percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
@@ -1057,8 +1074,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",nouid32");
if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
seq_puts(seq, ",debug");
- if (test_opt(sb, OLDALLOC))
- seq_puts(seq, ",oldalloc");
#ifdef CONFIG_EXT4_FS_XATTR
if (test_opt(sb, XATTR_USER))
seq_puts(seq, ",user_xattr");
@@ -1140,9 +1155,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",block_validity");
if (!test_opt(sb, INIT_INODE_TABLE))
- seq_puts(seq, ",noinit_inode_table");
+ seq_puts(seq, ",noinit_itable");
else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
- seq_printf(seq, ",init_inode_table=%u",
+ seq_printf(seq, ",init_itable=%u",
(unsigned) sbi->s_li_wait_mult);
ext4_show_quota_options(seq, sb);
@@ -1318,8 +1333,7 @@ enum {
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
- Opt_discard, Opt_nodiscard,
- Opt_init_inode_table, Opt_noinit_inode_table,
+ Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
};
static const match_table_t tokens = {
@@ -1392,9 +1406,9 @@ static const match_table_t tokens = {
{Opt_dioread_lock, "dioread_lock"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
- {Opt_init_inode_table, "init_itable=%u"},
- {Opt_init_inode_table, "init_itable"},
- {Opt_noinit_inode_table, "noinit_itable"},
+ {Opt_init_itable, "init_itable=%u"},
+ {Opt_init_itable, "init_itable"},
+ {Opt_noinit_itable, "noinit_itable"},
{Opt_err, NULL},
};
@@ -1567,10 +1581,12 @@ static int parse_options(char *options, struct super_block *sb,
set_opt(sb, DEBUG);
break;
case Opt_oldalloc:
- set_opt(sb, OLDALLOC);
+ ext4_msg(sb, KERN_WARNING,
+ "Ignoring deprecated oldalloc option");
break;
case Opt_orlov:
- clear_opt(sb, OLDALLOC);
+ ext4_msg(sb, KERN_WARNING,
+ "Ignoring deprecated orlov option");
break;
#ifdef CONFIG_EXT4_FS_XATTR
case Opt_user_xattr:
@@ -1666,7 +1682,9 @@ static int parse_options(char *options, struct super_block *sb,
data_opt = EXT4_MOUNT_WRITEBACK_DATA;
datacheck:
if (is_remount) {
- if (test_opt(sb, DATA_FLAGS) != data_opt) {
+ if (!sbi->s_journal)
+ ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
+ else if (test_opt(sb, DATA_FLAGS) != data_opt) {
ext4_msg(sb, KERN_ERR,
"Cannot change data mode on remount");
return 0;
@@ -1801,6 +1819,7 @@ set_qf_format:
break;
case Opt_nodelalloc:
clear_opt(sb, DELALLOC);
+ clear_opt2(sb, EXPLICIT_DELALLOC);
break;
case Opt_mblk_io_submit:
set_opt(sb, MBLK_IO_SUBMIT);
@@ -1817,6 +1836,7 @@ set_qf_format:
break;
case Opt_delalloc:
set_opt(sb, DELALLOC);
+ set_opt2(sb, EXPLICIT_DELALLOC);
break;
case Opt_block_validity:
set_opt(sb, BLOCK_VALIDITY);
@@ -1871,7 +1891,7 @@ set_qf_format:
case Opt_dioread_lock:
clear_opt(sb, DIOREAD_NOLOCK);
break;
- case Opt_init_inode_table:
+ case Opt_init_itable:
set_opt(sb, INIT_INODE_TABLE);
if (args[0].from) {
if (match_int(&args[0], &option))
@@ -1882,7 +1902,7 @@ set_qf_format:
return 0;
sbi->s_li_wait_mult = option;
break;
- case Opt_noinit_inode_table:
+ case Opt_noinit_itable:
clear_opt(sb, INIT_INODE_TABLE);
break;
default:
@@ -1935,7 +1955,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
res = MS_RDONLY;
}
if (read_only)
- return res;
+ goto done;
if (!(sbi->s_mount_state & EXT4_VALID_FS))
ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
"running e2fsck is recommended");
@@ -1966,6 +1986,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
ext4_commit_super(sb, 1);
+done:
if (test_opt(sb, DEBUG))
printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
"bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
@@ -2015,8 +2036,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
flex_group = ext4_flex_group(sbi, i);
atomic_add(ext4_free_inodes_count(sb, gdp),
&sbi->s_flex_groups[flex_group].free_inodes);
- atomic_add(ext4_free_blks_count(sb, gdp),
- &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic_add(ext4_free_group_clusters(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_clusters);
atomic_add(ext4_used_dirs_count(sb, gdp),
&sbi->s_flex_groups[flex_group].used_dirs);
}
@@ -2134,7 +2155,8 @@ static int ext4_check_descriptors(struct super_block *sb,
if (NULL != first_not_zeroed)
*first_not_zeroed = grp;
- ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
+ ext4_free_blocks_count_set(sbi->s_es,
+ EXT4_C2B(sbi, ext4_count_free_clusters(sb)));
sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
return 1;
}
@@ -2454,7 +2476,8 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
char *buf)
{
return snprintf(buf, PAGE_SIZE, "%llu\n",
- (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+ (s64) EXT4_C2B(sbi,
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
}
static ssize_t session_write_kbytes_show(struct ext4_attr *a,
@@ -2682,6 +2705,13 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
}
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
+ !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ ext4_msg(sb, KERN_ERR,
+ "Can't support bigalloc feature without "
+ "extents feature\n");
+ return 0;
+ }
return 1;
}
@@ -3070,8 +3100,6 @@ static void ext4_destroy_lazyinit_thread(void)
}
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
- __releases(kernel_lock)
- __acquires(kernel_lock)
{
char *orig_data = kstrdup(data, GFP_KERNEL);
struct buffer_head *bh;
@@ -3087,10 +3115,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
char *cp;
const char *descr;
int ret = -ENOMEM;
- int blocksize;
+ int blocksize, clustersize;
unsigned int db_count;
unsigned int i;
- int needs_recovery, has_huge_files;
+ int needs_recovery, has_huge_files, has_bigalloc;
__u64 blocks_count;
int err;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -3224,6 +3252,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
&journal_ioprio, NULL, 0))
goto failed_mount;
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
+ "with data=journal disables delayed "
+ "allocation and O_DIRECT support!\n");
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ goto failed_mount;
+ }
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ goto failed_mount;
+ }
+ if (test_opt(sb, DELALLOC))
+ clear_opt(sb, DELALLOC);
+ }
+
+ blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ if (blocksize < PAGE_SIZE) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "dioread_nolock if block size != PAGE_SIZE");
+ goto failed_mount;
+ }
+ }
+
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -3265,8 +3320,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
goto failed_mount;
- blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
-
if (blocksize < EXT4_MIN_BLOCK_SIZE ||
blocksize > EXT4_MAX_BLOCK_SIZE) {
ext4_msg(sb, KERN_ERR,
@@ -3369,12 +3422,53 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_dirt = 1;
}
- if (sbi->s_blocks_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#blocks per group too big: %lu",
- sbi->s_blocks_per_group);
- goto failed_mount;
+ /* Handle clustersize */
+ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
+ has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC);
+ if (has_bigalloc) {
+ if (clustersize < blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "cluster size (%d) smaller than "
+ "block size (%d)", clustersize, blocksize);
+ goto failed_mount;
+ }
+ sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
+ le32_to_cpu(es->s_log_block_size);
+ sbi->s_clusters_per_group =
+ le32_to_cpu(es->s_clusters_per_group);
+ if (sbi->s_clusters_per_group > blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#clusters per group too big: %lu",
+ sbi->s_clusters_per_group);
+ goto failed_mount;
+ }
+ if (sbi->s_blocks_per_group !=
+ (sbi->s_clusters_per_group * (clustersize / blocksize))) {
+ ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
+ "clusters per group (%lu) inconsistent",
+ sbi->s_blocks_per_group,
+ sbi->s_clusters_per_group);
+ goto failed_mount;
+ }
+ } else {
+ if (clustersize != blocksize) {
+ ext4_warning(sb, "fragment/cluster size (%d) != "
+ "block size (%d)", clustersize,
+ blocksize);
+ clustersize = blocksize;
+ }
+ if (sbi->s_blocks_per_group > blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#blocks per group too big: %lu",
+ sbi->s_blocks_per_group);
+ goto failed_mount;
+ }
+ sbi->s_clusters_per_group = sbi->s_blocks_per_group;
+ sbi->s_cluster_bits = 0;
}
+ sbi->s_cluster_ratio = clustersize / blocksize;
+
if (sbi->s_inodes_per_group > blocksize * 8) {
ext4_msg(sb, KERN_ERR,
"#inodes per group too big: %lu",
@@ -3446,10 +3540,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
-#ifdef CONFIG_PROC_FS
if (ext4_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-#endif
bgl_lock_init(sbi->s_blockgroup_lock);
@@ -3483,8 +3575,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_err_report.function = print_daily_error_info;
sbi->s_err_report.data = (unsigned long) sb;
- err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext4_count_free_blocks(sb));
+ err = percpu_counter_init(&sbi->s_freeclusters_counter,
+ ext4_count_free_clusters(sb));
if (!err) {
err = percpu_counter_init(&sbi->s_freeinodes_counter,
ext4_count_free_inodes(sb));
@@ -3494,7 +3586,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_count_dirs(sb));
}
if (!err) {
- err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+ err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
}
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
@@ -3609,13 +3701,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* The journal may have updated the bg summary counts, so we
* need to update the global counters.
*/
- percpu_counter_set(&sbi->s_freeblocks_counter,
- ext4_count_free_blocks(sb));
+ percpu_counter_set(&sbi->s_freeclusters_counter,
+ ext4_count_free_clusters(sb));
percpu_counter_set(&sbi->s_freeinodes_counter,
ext4_count_free_inodes(sb));
percpu_counter_set(&sbi->s_dirs_counter,
ext4_count_dirs(sb));
- percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
+ percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
no_journal:
/*
@@ -3679,25 +3771,6 @@ no_journal:
"available");
}
- if (test_opt(sb, DELALLOC) &&
- (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
- ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
- "requested data journaling mode");
- clear_opt(sb, DELALLOC);
- }
- if (test_opt(sb, DIOREAD_NOLOCK)) {
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
- "option - requested data journaling mode");
- clear_opt(sb, DIOREAD_NOLOCK);
- }
- if (sb->s_blocksize < PAGE_SIZE) {
- ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
- "option - block size is too small");
- clear_opt(sb, DIOREAD_NOLOCK);
- }
- }
-
err = ext4_setup_system_zone(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -3710,22 +3783,19 @@ no_journal:
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
err);
- goto failed_mount4;
+ goto failed_mount5;
}
err = ext4_register_li_request(sb, first_not_zeroed);
if (err)
- goto failed_mount4;
+ goto failed_mount6;
sbi->s_kobj.kset = ext4_kset;
init_completion(&sbi->s_kobj_unregister);
err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
"%s", sb->s_id);
- if (err) {
- ext4_mb_release(sb);
- ext4_ext_release(sb);
- goto failed_mount4;
- };
+ if (err)
+ goto failed_mount7;
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
@@ -3759,13 +3829,19 @@ cantfind_ext4:
ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
goto failed_mount;
+failed_mount7:
+ ext4_unregister_li_request(sb);
+failed_mount6:
+ ext4_ext_release(sb);
+failed_mount5:
+ ext4_mb_release(sb);
+ ext4_release_system_zone(sb);
failed_mount4:
iput(root);
sb->s_root = NULL;
ext4_msg(sb, KERN_ERR, "mount failed");
destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
failed_mount_wq:
- ext4_release_system_zone(sb);
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -3774,10 +3850,10 @@ failed_mount3:
del_timer(&sbi->s_err_report);
if (sbi->s_flex_groups)
ext4_kvfree(sbi->s_flex_groups);
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
- percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+ percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
failed_mount2:
@@ -4064,7 +4140,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
int error = 0;
- if (!sbh)
+ if (!sbh || block_device_ejected(sb))
return error;
if (buffer_write_io_error(sbh)) {
/*
@@ -4100,8 +4176,9 @@ static int ext4_commit_super(struct super_block *sb, int sync)
else
es->s_kbytes_written =
cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
- ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
- &EXT4_SB(sb)->s_freeblocks_counter));
+ ext4_free_blocks_count_set(es,
+ EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
+ &EXT4_SB(sb)->s_freeclusters_counter)));
es->s_free_inodes_count =
cpu_to_le32(percpu_counter_sum_positive(
&EXT4_SB(sb)->s_freeinodes_counter));
@@ -4506,16 +4583,34 @@ restore_opts:
return err;
}
+/*
+ * Note: calculating the overhead so we can be compatible with
+ * historical BSD practice is quite difficult in the face of
+ * clusters/bigalloc. This is because multiple metadata blocks from
+ * different block group can end up in the same allocation cluster.
+ * Calculating the exact overhead in the face of clustered allocation
+ * requires either O(all block bitmaps) in memory or O(number of block
+ * groups**2) in time. We will still calculate the superblock for
+ * older file systems --- and if we come across with a bigalloc file
+ * system with zero in s_overhead_clusters the estimate will be close to
+ * correct especially for very large cluster sizes --- but for newer
+ * file systems, it's better to calculate this figure once at mkfs
+ * time, and store it in the superblock. If the superblock value is
+ * present (even for non-bigalloc file systems), we will use it.
+ */
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
+ struct ext4_group_desc *gdp;
u64 fsid;
s64 bfree;
if (test_opt(sb, MINIX_DF)) {
sbi->s_overhead_last = 0;
+ } else if (es->s_overhead_clusters) {
+ sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters);
} else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
ext4_fsblk_t overhead = 0;
@@ -4530,24 +4625,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
* All of the blocks before first_data_block are
* overhead
*/
- overhead = le32_to_cpu(es->s_first_data_block);
+ overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
/*
- * Add the overhead attributed to the superblock and
- * block group descriptors. If the sparse superblocks
- * feature is turned on, then not all groups have this.
+ * Add the overhead found in each block group
*/
for (i = 0; i < ngroups; i++) {
- overhead += ext4_bg_has_super(sb, i) +
- ext4_bg_num_gdb(sb, i);
+ gdp = ext4_get_group_desc(sb, i, NULL);
+ overhead += ext4_num_overhead_clusters(sb, i, gdp);
cond_resched();
}
-
- /*
- * Every block group has an inode bitmap, a block
- * bitmap, and an inode table.
- */
- overhead += ngroups * (2 + sbi->s_itb_per_group);
sbi->s_overhead_last = overhead;
smp_wmb();
sbi->s_blocks_last = ext4_blocks_count(es);
@@ -4555,11 +4642,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = EXT4_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
- bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
- percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
+ buf->f_blocks = (ext4_blocks_count(es) -
+ EXT4_C2B(sbi, sbi->s_overhead_last));
+ bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
+ percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
/* prevent underflow in case that few free space is available */
- buf->f_bfree = max_t(s64, bfree, 0);
+ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
if (buf->f_bfree < ext4_r_blocks_count(es))
buf->f_bavail = 0;
@@ -4980,13 +5068,11 @@ static int __init ext4_init_fs(void)
return err;
err = ext4_init_system_zone();
if (err)
- goto out7;
+ goto out6;
ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
if (!ext4_kset)
- goto out6;
- ext4_proc_root = proc_mkdir("fs/ext4", NULL);
- if (!ext4_proc_root)
goto out5;
+ ext4_proc_root = proc_mkdir("fs/ext4", NULL);
err = ext4_init_feat_adverts();
if (err)
@@ -5022,12 +5108,12 @@ out2:
out3:
ext4_exit_feat_adverts();
out4:
- remove_proc_entry("fs/ext4", NULL);
-out5:
+ if (ext4_proc_root)
+ remove_proc_entry("fs/ext4", NULL);
kset_unregister(ext4_kset);
-out6:
+out5:
ext4_exit_system_zone();
-out7:
+out6:
ext4_exit_pageio();
return err;
}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c757adc9725..93a00d89a22 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,8 +820,14 @@ inserted:
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+ /*
+ * take i_data_sem because we will test
+ * i_delalloc_reserved_flag in ext4_mb_new_blocks
+ */
+ down_read((&EXT4_I(inode)->i_data_sem));
block = ext4_new_meta_blocks(handle, inode, goal, 0,
NULL, &error);
+ up_read((&EXT4_I(inode)->i_data_sem));
if (error)
goto cleanup;
@@ -985,11 +991,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
- error = ext4_get_inode_loc(inode, &is.iloc);
- if (error)
- goto cleanup;
-
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ error = ext4_reserve_inode_write(handle, inode, &is.iloc);
if (error)
goto cleanup;
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 007c3bfbf09..34e4350dd4d 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -48,28 +48,32 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-int
-ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *name;
+ const struct xattr *xattr;
+ handle_t *handle = fs_info;
+ int err = 0;
- err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ext4_xattr_set_handle(handle, inode,
+ EXT4_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, 0);
+ if (err < 0)
+ break;
}
- err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY,
- name, value, len, 0);
- kfree(name);
- kfree(value);
return err;
}
+int
+ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &ext4_initxattrs, handle);
+}
+
const struct xattr_handler ext4_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext4_xattr_security_list,
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 5efbd5d7701..aca191bd5f8 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -156,8 +156,8 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
} else {
if (uni_xlate == 1) {
*op++ = ':';
- op = pack_hex_byte(op, ec >> 8);
- op = pack_hex_byte(op, ec);
+ op = hex_byte_pack(op, ec >> 8);
+ op = hex_byte_pack(op, ec);
len -= 5;
} else {
*op++ = '?';
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index a5d3853822e..1510a4d5199 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -326,15 +326,14 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
struct inode *i2);
/* fat/misc.c */
-extern void
-__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4))) __cold;
+extern __printf(3, 4) __cold
+void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...);
#define fat_fs_error(sb, fmt, args...) \
__fat_fs_error(sb, 1, fmt , ## args)
#define fat_fs_error_ratelimit(sb, fmt, args...) \
__fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)
-void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4))) __cold;
+__printf(3, 4) __cold
+void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...);
extern int fat_clusters_flush(struct super_block *sb);
extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 1726d730304..808cac7edcf 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -379,7 +379,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
return error;
MSDOS_I(inode)->mmu_private = inode->i_size;
- inode->i_nlink = fat_subdirs(inode);
+ set_nlink(inode, fat_subdirs(inode));
} else { /* not a directory */
inode->i_generation |= 1;
inode->i_mode = fat_make_mode(sbi, de->attr,
@@ -1233,7 +1233,7 @@ static int fat_read_root(struct inode *inode)
fat_save_attrs(inode, ATTR_DIR);
inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0;
inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0;
- inode->i_nlink = fat_subdirs(inode)+2;
+ set_nlink(inode, fat_subdirs(inode)+2);
return 0;
}
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 66e83b84545..216b419f30e 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -387,7 +387,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
/* the directory was completed, just return a error */
goto out;
}
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index bb3f29c3557..a87a65663c2 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -900,7 +900,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
goto out;
}
inode->i_version++;
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 1a4311437a8..7b2af5abe2f 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -227,7 +227,7 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip)
ip->i_uid = (uid_t)vip->vii_uid;
ip->i_gid = (gid_t)vip->vii_gid;
- ip->i_nlink = vip->vii_nlink;
+ set_nlink(ip, vip->vii_nlink);
ip->i_size = vip->vii_size;
ip->i_atime.tv_sec = vip->vii_atime;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 04cf3b91e50..517f211a3bd 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -41,6 +41,7 @@ struct wb_writeback_work {
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
+ enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
struct completion *done; /* set if the caller waits */
@@ -115,7 +116,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
static void
__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- bool range_cyclic)
+ bool range_cyclic, enum wb_reason reason)
{
struct wb_writeback_work *work;
@@ -135,6 +136,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
work->sync_mode = WB_SYNC_NONE;
work->nr_pages = nr_pages;
work->range_cyclic = range_cyclic;
+ work->reason = reason;
bdi_queue_work(bdi, work);
}
@@ -143,6 +145,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
* bdi_start_writeback - start writeback
* @bdi: the backing device to write from
* @nr_pages: the number of pages to write
+ * @reason: reason why some writeback work was initiated
*
* Description:
* This does WB_SYNC_NONE opportunistic writeback. The IO is only
@@ -150,9 +153,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
* completion. Caller need not hold sb s_umount semaphore.
*
*/
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
+ enum wb_reason reason)
{
- __bdi_start_writeback(bdi, nr_pages, true);
+ __bdi_start_writeback(bdi, nr_pages, true, reason);
}
/**
@@ -251,7 +255,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
*/
static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
- unsigned long *older_than_this)
+ struct wb_writeback_work *work)
{
LIST_HEAD(tmp);
struct list_head *pos, *node;
@@ -262,8 +266,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
- if (older_than_this &&
- inode_dirtied_after(inode, *older_than_this))
+ if (work->older_than_this &&
+ inode_dirtied_after(inode, *work->older_than_this))
break;
if (sb && sb != inode->i_sb)
do_sb_sort = 1;
@@ -302,13 +306,13 @@ out:
* |
* +--> dequeue for IO
*/
-static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
+static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
int moved;
assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
- moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
- trace_writeback_queue_io(wb, older_than_this, moved);
+ moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
+ trace_writeback_queue_io(wb, work, moved);
}
static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -641,31 +645,40 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
return wrote;
}
-long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
+long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
+ enum wb_reason reason)
{
struct wb_writeback_work work = {
.nr_pages = nr_pages,
.sync_mode = WB_SYNC_NONE,
.range_cyclic = 1,
+ .reason = reason,
};
spin_lock(&wb->list_lock);
if (list_empty(&wb->b_io))
- queue_io(wb, NULL);
+ queue_io(wb, &work);
__writeback_inodes_wb(wb, &work);
spin_unlock(&wb->list_lock);
return nr_pages - work.nr_pages;
}
-static inline bool over_bground_thresh(void)
+static bool over_bground_thresh(struct backing_dev_info *bdi)
{
unsigned long background_thresh, dirty_thresh;
global_dirty_limits(&background_thresh, &dirty_thresh);
- return (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) > background_thresh);
+ if (global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) > background_thresh)
+ return true;
+
+ if (bdi_stat(bdi, BDI_RECLAIMABLE) >
+ bdi_dirty_limit(bdi, background_thresh))
+ return true;
+
+ return false;
}
/*
@@ -675,7 +688,7 @@ static inline bool over_bground_thresh(void)
static void wb_update_bandwidth(struct bdi_writeback *wb,
unsigned long start_time)
{
- __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
+ __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
}
/*
@@ -727,7 +740,7 @@ static long wb_writeback(struct bdi_writeback *wb,
* For background writeout, stop when we are below the
* background dirty threshold
*/
- if (work->for_background && !over_bground_thresh())
+ if (work->for_background && !over_bground_thresh(wb->bdi))
break;
if (work->for_kupdate) {
@@ -738,7 +751,7 @@ static long wb_writeback(struct bdi_writeback *wb,
trace_writeback_start(wb->bdi, work);
if (list_empty(&wb->b_io))
- queue_io(wb, work->older_than_this);
+ queue_io(wb, work);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
@@ -811,13 +824,14 @@ static unsigned long get_nr_dirty_pages(void)
static long wb_check_background_flush(struct bdi_writeback *wb)
{
- if (over_bground_thresh()) {
+ if (over_bground_thresh(wb->bdi)) {
struct wb_writeback_work work = {
.nr_pages = LONG_MAX,
.sync_mode = WB_SYNC_NONE,
.for_background = 1,
.range_cyclic = 1,
+ .reason = WB_REASON_BACKGROUND,
};
return wb_writeback(wb, &work);
@@ -851,6 +865,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
.sync_mode = WB_SYNC_NONE,
.for_kupdate = 1,
.range_cyclic = 1,
+ .reason = WB_REASON_PERIODIC,
};
return wb_writeback(wb, &work);
@@ -969,7 +984,7 @@ int bdi_writeback_thread(void *data)
* Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
* the whole world.
*/
-void wakeup_flusher_threads(long nr_pages)
+void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
{
struct backing_dev_info *bdi;
@@ -982,7 +997,7 @@ void wakeup_flusher_threads(long nr_pages)
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
if (!bdi_has_dirty_io(bdi))
continue;
- __bdi_start_writeback(bdi, nr_pages, false);
+ __bdi_start_writeback(bdi, nr_pages, false, reason);
}
rcu_read_unlock();
}
@@ -1198,12 +1213,15 @@ static void wait_sb_inodes(struct super_block *sb)
* writeback_inodes_sb_nr - writeback dirty inodes from given super_block
* @sb: the superblock
* @nr: the number of pages to write
+ * @reason: reason why some writeback work initiated
*
* Start writeback on some inodes on this super_block. No guarantees are made
* on how many (if any) will be written, and this function does not wait
* for IO completion of submitted IO.
*/
-void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
+void writeback_inodes_sb_nr(struct super_block *sb,
+ unsigned long nr,
+ enum wb_reason reason)
{
DECLARE_COMPLETION_ONSTACK(done);
struct wb_writeback_work work = {
@@ -1212,6 +1230,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
.tagged_writepages = 1,
.done = &done,
.nr_pages = nr,
+ .reason = reason,
};
WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@ -1223,29 +1242,31 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
/**
* writeback_inodes_sb - writeback dirty inodes from given super_block
* @sb: the superblock
+ * @reason: reason why some writeback work was initiated
*
* Start writeback on some inodes on this super_block. No guarantees are made
* on how many (if any) will be written, and this function does not wait
* for IO completion of submitted IO.
*/
-void writeback_inodes_sb(struct super_block *sb)
+void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
- return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
+ return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
EXPORT_SYMBOL(writeback_inodes_sb);
/**
* writeback_inodes_sb_if_idle - start writeback if none underway
* @sb: the superblock
+ * @reason: reason why some writeback work was initiated
*
* Invoke writeback_inodes_sb if no writeback is currently underway.
* Returns 1 if writeback was started, 0 if not.
*/
-int writeback_inodes_sb_if_idle(struct super_block *sb)
+int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
{
if (!writeback_in_progress(sb->s_bdi)) {
down_read(&sb->s_umount);
- writeback_inodes_sb(sb);
+ writeback_inodes_sb(sb, reason);
up_read(&sb->s_umount);
return 1;
} else
@@ -1257,16 +1278,18 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
* writeback_inodes_sb_if_idle - start writeback if none underway
* @sb: the superblock
* @nr: the number of pages to write
+ * @reason: reason why some writeback work was initiated
*
* Invoke writeback_inodes_sb if no writeback is currently underway.
* Returns 1 if writeback was started, 0 if not.
*/
int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
- unsigned long nr)
+ unsigned long nr,
+ enum wb_reason reason)
{
if (!writeback_in_progress(sb->s_bdi)) {
down_read(&sb->s_umount);
- writeback_inodes_sb_nr(sb, nr);
+ writeback_inodes_sb_nr(sb, nr, reason);
up_read(&sb->s_umount);
return 1;
} else
@@ -1290,6 +1313,7 @@ void sync_inodes_sb(struct super_block *sb)
.nr_pages = LONG_MAX,
.range_cyclic = 0,
.done = &done,
+ .reason = WB_REASON_SYNC,
};
WARN_ON(!rwsem_is_locked(&sb->s_umount));
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 85542a7daf4..42593c587d4 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -231,7 +231,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
if (iop)
inode->i_op = iop;
inode->i_fop = fop;
- inode->i_nlink = nlink;
+ set_nlink(inode, nlink);
inode->i_private = fc;
d_add(dentry, inode);
return dentry;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b6cca47f7b0..3426521f320 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -47,6 +47,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
+#include <linux/module.h>
#include "fuse_i.h"
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 5cb8614508c..2aaf3eaaf13 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1512,7 +1512,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
else if (outarg->offset + num > file_size)
num = file_size - outarg->offset;
- while (num) {
+ while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) {
struct page *page;
unsigned int this_num;
@@ -1526,6 +1526,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
num -= this_num;
total_len += this_num;
+ index++;
}
req->misc.retrieve_in.offset = outarg->offset;
req->misc.retrieve_in.size = total_len;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 594f07a81c2..0c84100acd4 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1556,7 +1556,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
struct inode *inode = file->f_path.dentry->d_inode;
mutex_lock(&inode->i_mutex);
- if (origin != SEEK_CUR || origin != SEEK_SET) {
+ if (origin != SEEK_CUR && origin != SEEK_SET) {
retval = fuse_update_attributes(inode, NULL, file, NULL);
if (retval)
goto exit;
@@ -1567,6 +1567,10 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
offset += i_size_read(inode);
break;
case SEEK_CUR:
+ if (offset == 0) {
+ retval = file->f_pos;
+ goto exit;
+ }
offset += file->f_pos;
break;
case SEEK_DATA:
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index add96f6ffda..aa83109b943 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -151,7 +151,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_ino = attr->ino;
inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
- inode->i_nlink = attr->nlink;
+ set_nlink(inode, attr->nlink);
inode->i_uid = attr->uid;
inode->i_gid = attr->gid;
inode->i_blocks = attr->blocks;
@@ -1138,28 +1138,28 @@ static int __init fuse_fs_init(void)
{
int err;
- err = register_filesystem(&fuse_fs_type);
- if (err)
- goto out;
-
- err = register_fuseblk();
- if (err)
- goto out_unreg;
-
fuse_inode_cachep = kmem_cache_create("fuse_inode",
sizeof(struct fuse_inode),
0, SLAB_HWCACHE_ALIGN,
fuse_inode_init_once);
err = -ENOMEM;
if (!fuse_inode_cachep)
- goto out_unreg2;
+ goto out;
+
+ err = register_fuseblk();
+ if (err)
+ goto out2;
+
+ err = register_filesystem(&fuse_fs_type);
+ if (err)
+ goto out3;
return 0;
- out_unreg2:
+ out3:
unregister_fuseblk();
- out_unreg:
- unregister_filesystem(&fuse_fs_type);
+ out2:
+ kmem_cache_destroy(fuse_inode_cachep);
out:
return err;
}
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 34501b64bc4..65978d7885c 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -82,7 +82,7 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode)
iattr.ia_valid = ATTR_MODE;
iattr.ia_mode = mode;
- error = gfs2_setattr_simple(GFS2_I(inode), &iattr);
+ error = gfs2_setattr_simple(inode, &iattr);
}
return error;
@@ -160,6 +160,7 @@ out:
int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
{
+ struct inode *inode = &ip->i_inode;
struct posix_acl *acl;
char *data;
unsigned int len;
@@ -169,7 +170,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
if (IS_ERR(acl))
return PTR_ERR(acl);
if (!acl)
- return gfs2_setattr_simple(ip, attr);
+ return gfs2_setattr_simple(inode, attr);
error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode);
if (error)
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index f9fbbe96c22..4858e1fed8b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -663,7 +663,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
if (&ip->i_inode == sdp->sd_rindex)
rblocks += 2 * RES_STATFS;
if (alloc_required)
- rblocks += gfs2_rg_blocks(al);
+ rblocks += gfs2_rg_blocks(ip);
error = gfs2_trans_begin(sdp, rblocks,
PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -787,7 +787,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
u64 to = pos + copied;
void *kaddr;
unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
- struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode)));
kaddr = kmap_atomic(page, KM_USER0);
@@ -804,7 +803,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
if (copied) {
if (inode->i_size < to)
i_size_write(inode, to);
- gfs2_dinode_out(ip, di);
mark_inode_dirty(inode);
}
@@ -873,10 +871,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
gfs2_page_add_databufs(ip, page, from, to);
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
- if (ret > 0) {
- gfs2_dinode_out(ip, dibh->b_data);
- mark_inode_dirty(inode);
- }
if (inode == sdp->sd_rindex) {
adjust_fs_space(inode);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 7878c473ae6..41d494d7970 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -10,6 +10,7 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
@@ -36,11 +37,6 @@ struct metapath {
__u16 mp_list[GFS2_MAX_META_HEIGHT];
};
-typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
- struct buffer_head *bh, __be64 *top,
- __be64 *bottom, unsigned int height,
- void *data);
-
struct strip_mine {
int sm_first;
unsigned int sm_height;
@@ -273,6 +269,30 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp
return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
}
+static void gfs2_metapath_ra(struct gfs2_glock *gl,
+ const struct buffer_head *bh, const __be64 *pos)
+{
+ struct buffer_head *rabh;
+ const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size);
+ const __be64 *t;
+
+ for (t = pos; t < endp; t++) {
+ if (!*t)
+ continue;
+
+ rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
+ if (trylock_buffer(rabh)) {
+ if (!buffer_uptodate(rabh)) {
+ rabh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, rabh);
+ continue;
+ }
+ unlock_buffer(rabh);
+ }
+ brelse(rabh);
+ }
+}
+
/**
* lookup_metapath - Walk the metadata tree to a specific point
* @ip: The inode
@@ -432,12 +452,14 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct super_block *sb = sdp->sd_vfs;
struct buffer_head *dibh = mp->mp_bh[0];
u64 bn, dblock = 0;
unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
unsigned dblks = 0;
unsigned ptrs_per_blk;
const unsigned end_of_metadata = height - 1;
+ int ret;
int eob = 0;
enum alloc_state state;
__be64 *ptr;
@@ -540,6 +562,15 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
dblock = bn;
while (n-- > 0)
*ptr++ = cpu_to_be64(bn++);
+ if (buffer_zeronew(bh_map)) {
+ ret = sb_issue_zeroout(sb, dblock, dblks,
+ GFP_NOFS);
+ if (ret) {
+ fs_err(sdp,
+ "Failed to zero data buffers\n");
+ clear_buffer_zeronew(bh_map);
+ }
+ }
break;
}
} while ((state != ALLOC_DATA) || !dblock);
@@ -668,76 +699,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
}
/**
- * recursive_scan - recursively scan through the end of a file
- * @ip: the inode
- * @dibh: the dinode buffer
- * @mp: the path through the metadata to the point to start
- * @height: the height the recursion is at
- * @block: the indirect block to look at
- * @first: 1 if this is the first block
- * @bc: the call to make for each piece of metadata
- * @data: data opaque to this function to pass to @bc
- *
- * When this is first called @height and @block should be zero and
- * @first should be 1.
- *
- * Returns: errno
- */
-
-static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
- struct metapath *mp, unsigned int height,
- u64 block, int first, block_call_t bc,
- void *data)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct buffer_head *bh = NULL;
- __be64 *top, *bottom;
- u64 bn;
- int error;
- int mh_size = sizeof(struct gfs2_meta_header);
-
- if (!height) {
- error = gfs2_meta_inode_buffer(ip, &bh);
- if (error)
- return error;
- dibh = bh;
-
- top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
- bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
- } else {
- error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
- if (error)
- return error;
-
- top = (__be64 *)(bh->b_data + mh_size) +
- (first ? mp->mp_list[height] : 0);
-
- bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
- }
-
- error = bc(ip, dibh, bh, top, bottom, height, data);
- if (error)
- goto out;
-
- if (height < ip->i_height - 1)
- for (; top < bottom; top++, first = 0) {
- if (!*top)
- continue;
-
- bn = be64_to_cpu(*top);
-
- error = recursive_scan(ip, dibh, mp, height + 1, bn,
- first, bc, data);
- if (error)
- break;
- }
-
-out:
- brelse(bh);
- return error;
-}
-
-/**
* do_strip - Look for a layer a particular layer of the file and strip it off
* @ip: the inode
* @dibh: the dinode buffer
@@ -752,9 +713,8 @@ out:
static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
struct buffer_head *bh, __be64 *top, __be64 *bottom,
- unsigned int height, void *data)
+ unsigned int height, struct strip_mine *sm)
{
- struct strip_mine *sm = data;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrp_list rlist;
u64 bn, bstart;
@@ -783,11 +743,6 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
else if (ip->i_depth)
revokes = sdp->sd_inptrs;
- if (ip != GFS2_I(sdp->sd_rindex))
- error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
- else if (!sdp->sd_rgrps)
- error = gfs2_ri_update(ip);
-
if (error)
return error;
@@ -805,7 +760,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
blen++;
else {
if (bstart)
- gfs2_rlist_add(sdp, &rlist, bstart);
+ gfs2_rlist_add(ip, &rlist, bstart);
bstart = bn;
blen = 1;
@@ -813,7 +768,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
}
if (bstart)
- gfs2_rlist_add(sdp, &rlist, bstart);
+ gfs2_rlist_add(ip, &rlist, bstart);
else
goto out; /* Nothing to do */
@@ -887,12 +842,82 @@ out_rg_gunlock:
out_rlist:
gfs2_rlist_free(&rlist);
out:
- if (ip != GFS2_I(sdp->sd_rindex))
- gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
return error;
}
/**
+ * recursive_scan - recursively scan through the end of a file
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @mp: the path through the metadata to the point to start
+ * @height: the height the recursion is at
+ * @block: the indirect block to look at
+ * @first: 1 if this is the first block
+ * @sm: data opaque to this function to pass to @bc
+ *
+ * When this is first called @height and @block should be zero and
+ * @first should be 1.
+ *
+ * Returns: errno
+ */
+
+static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
+ struct metapath *mp, unsigned int height,
+ u64 block, int first, struct strip_mine *sm)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *bh = NULL;
+ __be64 *top, *bottom;
+ u64 bn;
+ int error;
+ int mh_size = sizeof(struct gfs2_meta_header);
+
+ if (!height) {
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error)
+ return error;
+ dibh = bh;
+
+ top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
+ bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
+ } else {
+ error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
+ if (error)
+ return error;
+
+ top = (__be64 *)(bh->b_data + mh_size) +
+ (first ? mp->mp_list[height] : 0);
+
+ bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
+ }
+
+ error = do_strip(ip, dibh, bh, top, bottom, height, sm);
+ if (error)
+ goto out;
+
+ if (height < ip->i_height - 1) {
+
+ gfs2_metapath_ra(ip->i_gl, bh, top);
+
+ for (; top < bottom; top++, first = 0) {
+ if (!*top)
+ continue;
+
+ bn = be64_to_cpu(*top);
+
+ error = recursive_scan(ip, dibh, mp, height + 1, bn,
+ first, sm);
+ if (error)
+ break;
+ }
+ }
+out:
+ brelse(bh);
+ return error;
+}
+
+
+/**
* gfs2_block_truncate_page - Deal with zeroing out data for truncate
*
* This is partly borrowed from ext3.
@@ -1031,7 +1056,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
sm.sm_first = !!size;
sm.sm_height = height;
- error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
+ error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm);
if (error)
break;
}
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1cc2f8ec52a..8ccad2467cb 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -240,16 +240,15 @@ fail:
return error;
}
-static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
- u64 offset, unsigned int size)
+static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf,
+ unsigned int size)
{
struct buffer_head *dibh;
int error;
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
- offset += sizeof(struct gfs2_dinode);
- memcpy(buf, dibh->b_data + offset, size);
+ memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size);
brelse(dibh);
}
@@ -261,13 +260,12 @@ static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
* gfs2_dir_read_data - Read a data from a directory inode
* @ip: The GFS2 Inode
* @buf: The buffer to place result into
- * @offset: File offset to begin jdata_readng from
* @size: Amount of data to transfer
*
* Returns: The amount of data actually copied or the error
*/
-static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
- unsigned int size, unsigned ra)
+static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf,
+ unsigned int size)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
u64 lblock, dblock;
@@ -275,24 +273,14 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
unsigned int o;
int copied = 0;
int error = 0;
- u64 disksize = i_size_read(&ip->i_inode);
-
- if (offset >= disksize)
- return 0;
-
- if (offset + size > disksize)
- size = disksize - offset;
-
- if (!size)
- return 0;
if (gfs2_is_stuffed(ip))
- return gfs2_dir_read_stuffed(ip, buf, offset, size);
+ return gfs2_dir_read_stuffed(ip, buf, size);
if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
return -EINVAL;
- lblock = offset;
+ lblock = 0;
o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
while (copied < size) {
@@ -311,8 +299,6 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
if (error || !dblock)
goto fail;
BUG_ON(extlen < 1);
- if (!ra)
- extlen = 1;
bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
} else {
error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
@@ -328,7 +314,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
extlen--;
memcpy(buf, bh->b_data + o, amount);
brelse(bh);
- buf += amount;
+ buf += (amount/sizeof(__be64));
copied += amount;
lblock++;
o = sizeof(struct gfs2_meta_header);
@@ -371,7 +357,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
if (hc == NULL)
return ERR_PTR(-ENOMEM);
- ret = gfs2_dir_read_data(ip, (char *)hc, 0, hsize, 1);
+ ret = gfs2_dir_read_data(ip, hc, hsize);
if (ret < 0) {
kfree(hc);
return ERR_PTR(ret);
@@ -1695,7 +1681,6 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
const struct qstr *name = &dentry->d_name;
struct gfs2_dirent *dent, *prev = NULL;
struct buffer_head *bh;
- int error;
/* Returns _either_ the entry (if its first in block) or the
previous entry otherwise */
@@ -1724,22 +1709,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
}
brelse(bh);
- error = gfs2_meta_inode_buffer(dip, &bh);
- if (error)
- return error;
-
if (!dip->i_entries)
gfs2_consist_inode(dip);
- gfs2_trans_add_bh(dip->i_gl, bh, 1);
dip->i_entries--;
dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
if (S_ISDIR(dentry->d_inode->i_mode))
drop_nlink(&dip->i_inode);
- gfs2_dinode_out(dip, bh->b_data);
- brelse(bh);
mark_inode_dirty(&dip->i_inode);
- return error;
+ return 0;
}
/**
@@ -1829,10 +1807,6 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
if (error)
goto out_put;
- error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
- if (error)
- goto out_qs;
-
/* Count the number of leaves */
bh = leaf_bh;
@@ -1847,7 +1821,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
if (blk != leaf_no)
brelse(bh);
- gfs2_rlist_add(sdp, &rlist, blk);
+ gfs2_rlist_add(dip, &rlist, blk);
l_blocks++;
}
@@ -1911,8 +1885,6 @@ out_rg_gunlock:
gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
out_rlist:
gfs2_rlist_free(&rlist);
- gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
-out_qs:
gfs2_quota_unhold(dip);
out_put:
gfs2_alloc_put(dip);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index edeb9e80290..ce36a56dfea 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -59,15 +59,24 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
struct gfs2_holder i_gh;
loff_t error;
- if (origin == 2) {
+ switch (origin) {
+ case SEEK_END: /* These reference inode->i_size */
+ case SEEK_DATA:
+ case SEEK_HOLE:
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
&i_gh);
if (!error) {
- error = generic_file_llseek_unlocked(file, offset, origin);
+ error = generic_file_llseek(file, offset, origin);
gfs2_glock_dq_uninit(&i_gh);
}
- } else
- error = generic_file_llseek_unlocked(file, offset, origin);
+ break;
+ case SEEK_CUR:
+ case SEEK_SET:
+ error = generic_file_llseek(file, offset, origin);
+ break;
+ default:
+ error = -EINVAL;
+ }
return error;
}
@@ -357,8 +366,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
unsigned int data_blocks, ind_blocks, rblocks;
struct gfs2_holder gh;
struct gfs2_alloc *al;
+ loff_t size;
int ret;
+ /* Wait if fs is frozen. This is racy so we check again later on
+ * and retry if the fs has been frozen after the page lock has
+ * been acquired
+ */
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret)
@@ -367,8 +383,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
set_bit(GIF_SW_PAGED, &ip->i_flags);
- if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE))
+ if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) {
+ lock_page(page);
+ if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
+ ret = -EAGAIN;
+ unlock_page(page);
+ }
goto out_unlock;
+ }
+
ret = -ENOMEM;
al = gfs2_alloc_get(ip);
if (al == NULL)
@@ -388,7 +411,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
rblocks += data_blocks ? data_blocks : 1;
if (ind_blocks || data_blocks) {
rblocks += RES_STATFS + RES_QUOTA;
- rblocks += gfs2_rg_blocks(al);
+ rblocks += gfs2_rg_blocks(ip);
}
ret = gfs2_trans_begin(sdp, rblocks, 0);
if (ret)
@@ -396,21 +419,29 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
lock_page(page);
ret = -EINVAL;
- last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT;
- if (page->index > last_index)
- goto out_unlock_page;
+ size = i_size_read(inode);
+ last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+ /* Check page index against inode size */
+ if (size == 0 || (page->index > last_index))
+ goto out_trans_end;
+
+ ret = -EAGAIN;
+ /* If truncated, we must retry the operation, we may have raced
+ * with the glock demotion code.
+ */
+ if (!PageUptodate(page) || page->mapping != inode->i_mapping)
+ goto out_trans_end;
+
+ /* Unstuff, if required, and allocate backing blocks for page */
ret = 0;
- if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping)
- goto out_unlock_page;
- if (gfs2_is_stuffed(ip)) {
+ if (gfs2_is_stuffed(ip))
ret = gfs2_unstuff_dinode(ip, page);
- if (ret)
- goto out_unlock_page;
- }
- ret = gfs2_allocate_page_backing(page);
+ if (ret == 0)
+ ret = gfs2_allocate_page_backing(page);
-out_unlock_page:
- unlock_page(page);
+out_trans_end:
+ if (ret)
+ unlock_page(page);
gfs2_trans_end(sdp);
out_trans_fail:
gfs2_inplace_release(ip);
@@ -422,11 +453,17 @@ out_unlock:
gfs2_glock_dq(&gh);
out:
gfs2_holder_uninit(&gh);
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
- else if (ret)
- ret = VM_FAULT_SIGBUS;
- return ret;
+ if (ret == 0) {
+ set_page_dirty(page);
+ /* This check must be post dropping of transaction lock */
+ if (inode->i_sb->s_frozen == SB_UNFROZEN) {
+ wait_on_page_writeback(page);
+ } else {
+ ret = -EAGAIN;
+ unlock_page(page);
+ }
+ }
+ return block_page_mkwrite_return(ret);
}
static const struct vm_operations_struct gfs2_vm_ops = {
@@ -551,8 +588,16 @@ static int gfs2_close(struct inode *inode, struct file *file)
* @end: the end position in the file to sync
* @datasync: set if we can ignore timestamp changes
*
- * The VFS will flush data for us. We only need to worry
- * about metadata here.
+ * We split the data flushing here so that we don't wait for the data
+ * until after we've also sent the metadata to disk. Note that for
+ * data=ordered, we will write & wait for the data at the log flush
+ * stage anyway, so this is unlikely to make much of a difference
+ * except in the data=writeback case.
+ *
+ * If the fdatawrite fails due to any reason except -EIO, we will
+ * continue the remainder of the fsync, although we'll still report
+ * the error at the end. This is to match filemap_write_and_wait_range()
+ * behaviour.
*
* Returns: errno
*/
@@ -560,30 +605,34 @@ static int gfs2_close(struct inode *inode, struct file *file)
static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
- struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
struct gfs2_inode *ip = GFS2_I(inode);
- int ret;
+ int ret, ret1 = 0;
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
- mutex_lock(&inode->i_mutex);
+ if (mapping->nrpages) {
+ ret1 = filemap_fdatawrite_range(mapping, start, end);
+ if (ret1 == -EIO)
+ return ret1;
+ }
if (datasync)
sync_state &= ~I_DIRTY_SYNC;
if (sync_state) {
ret = sync_inode_metadata(inode, 1);
- if (ret) {
- mutex_unlock(&inode->i_mutex);
+ if (ret)
return ret;
- }
- gfs2_ail_flush(ip->i_gl);
+ if (gfs2_is_jdata(ip))
+ filemap_write_and_wait(mapping);
+ gfs2_ail_flush(ip->i_gl, 1);
}
- mutex_unlock(&inode->i_mutex);
- return 0;
+ if (mapping->nrpages)
+ ret = filemap_fdatawait_range(mapping, start, end);
+
+ return ret ? ret : ret1;
}
/**
@@ -620,135 +669,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return generic_file_aio_write(iocb, iov, nr_segs, pos);
}
-static int empty_write_end(struct page *page, unsigned from,
- unsigned to, int mode)
-{
- struct inode *inode = page->mapping->host;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct buffer_head *bh;
- unsigned offset, blksize = 1 << inode->i_blkbits;
- pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
-
- zero_user(page, from, to-from);
- mark_page_accessed(page);
-
- if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
- if (!gfs2_is_writeback(ip))
- gfs2_page_add_databufs(ip, page, from, to);
-
- block_commit_write(page, from, to);
- return 0;
- }
-
- offset = 0;
- bh = page_buffers(page);
- while (offset < to) {
- if (offset >= from) {
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- clear_buffer_new(bh);
- write_dirty_buffer(bh, WRITE);
- }
- offset += blksize;
- bh = bh->b_this_page;
- }
-
- offset = 0;
- bh = page_buffers(page);
- while (offset < to) {
- if (offset >= from) {
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- return -EIO;
- }
- offset += blksize;
- bh = bh->b_this_page;
- }
- return 0;
-}
-
-static int needs_empty_write(sector_t block, struct inode *inode)
-{
- int error;
- struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
-
- bh_map.b_size = 1 << inode->i_blkbits;
- error = gfs2_block_map(inode, block, &bh_map, 0);
- if (unlikely(error))
- return error;
- return !buffer_mapped(&bh_map);
-}
-
-static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
- int mode)
-{
- struct inode *inode = page->mapping->host;
- unsigned start, end, next, blksize;
- sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- int ret;
-
- blksize = 1 << inode->i_blkbits;
- next = end = 0;
- while (next < from) {
- next += blksize;
- block++;
- }
- start = next;
- do {
- next += blksize;
- ret = needs_empty_write(block, inode);
- if (unlikely(ret < 0))
- return ret;
- if (ret == 0) {
- if (end) {
- ret = __block_write_begin(page, start, end - start,
- gfs2_block_map);
- if (unlikely(ret))
- return ret;
- ret = empty_write_end(page, start, end, mode);
- if (unlikely(ret))
- return ret;
- end = 0;
- }
- start = next;
- }
- else
- end = next;
- block++;
- } while (next < to);
-
- if (end) {
- ret = __block_write_begin(page, start, end - start, gfs2_block_map);
- if (unlikely(ret))
- return ret;
- ret = empty_write_end(page, start, end, mode);
- if (unlikely(ret))
- return ret;
- }
-
- return 0;
-}
-
static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
int mode)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct buffer_head *dibh;
int error;
- u64 start = offset >> PAGE_CACHE_SHIFT;
- unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
- u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
- pgoff_t curr;
- struct page *page;
- unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
- unsigned int from, to;
-
- if (!end_offset)
- end_offset = PAGE_CACHE_SIZE;
+ unsigned int nr_blks;
+ sector_t lblock = offset >> inode->i_blkbits;
error = gfs2_meta_inode_buffer(ip, &dibh);
if (unlikely(error))
- goto out;
+ return error;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -758,40 +690,31 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
goto out;
}
- curr = start;
- offset = start << PAGE_CACHE_SHIFT;
- from = start_offset;
- to = PAGE_CACHE_SIZE;
- while (curr <= end) {
- page = grab_cache_page_write_begin(inode->i_mapping, curr,
- AOP_FLAG_NOFS);
- if (unlikely(!page)) {
- error = -ENOMEM;
- goto out;
- }
+ while (len) {
+ struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+ bh_map.b_size = len;
+ set_buffer_zeronew(&bh_map);
- if (curr == end)
- to = end_offset;
- error = write_empty_blocks(page, from, to, mode);
- if (!error && offset + to > inode->i_size &&
- !(mode & FALLOC_FL_KEEP_SIZE)) {
- i_size_write(inode, offset + to);
- }
- unlock_page(page);
- page_cache_release(page);
- if (error)
+ error = gfs2_block_map(inode, lblock, &bh_map, 1);
+ if (unlikely(error))
goto out;
- curr++;
- offset += PAGE_CACHE_SIZE;
- from = 0;
+ len -= bh_map.b_size;
+ nr_blks = bh_map.b_size >> inode->i_blkbits;
+ lblock += nr_blks;
+ if (!buffer_new(&bh_map))
+ continue;
+ if (unlikely(!buffer_zeronew(&bh_map))) {
+ error = -EIO;
+ goto out;
+ }
}
+ if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
+ i_size_write(inode, offset + len);
- gfs2_dinode_out(ip, dibh->b_data);
mark_inode_dirty(inode);
- brelse(dibh);
-
out:
+ brelse(dibh);
return error;
}
@@ -799,7 +722,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
unsigned int *data_blocks, unsigned int *ind_blocks)
{
const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
+ unsigned int max_blocks = ip->i_rgd->rd_free_clone;
unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
for (tmp = max_data; tmp > sdp->sd_diptrs;) {
@@ -831,6 +754,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
int error;
loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+ loff_t max_chunk_size = UINT_MAX & bsize_mask;
next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
/* We only support the FALLOC_FL_KEEP_SIZE mode */
@@ -884,11 +808,12 @@ retry:
goto out_qunlock;
}
max_bytes = bytes;
- calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+ calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len,
+ &max_bytes, &data_blocks, &ind_blocks);
al->al_requested = data_blocks + ind_blocks;
rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
- RES_RG_HDR + gfs2_rg_blocks(al);
+ RES_RG_HDR + gfs2_rg_blocks(ip);
if (gfs2_is_jdata(ip))
rblocks += data_blocks ? data_blocks : 1;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 66707118af2..2553b858a72 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -201,7 +201,7 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
-__attribute__ ((format(printf, 2, 3)))
+__printf(2, 3)
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
/**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index da21ecaafcc..1656df7aacd 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -28,40 +28,55 @@
#include "trans.h"
#include "dir.h"
+static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
+{
+ fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n",
+ bh, (unsigned long long)bh->b_blocknr, bh->b_state,
+ bh->b_page->mapping, bh->b_page->flags);
+ fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n",
+ gl->gl_name.ln_type, gl->gl_name.ln_number,
+ gfs2_glock2aspace(gl));
+ gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n");
+}
+
/**
* __gfs2_ail_flush - remove all buffers for a given lock from the AIL
* @gl: the glock
+ * @fsync: set when called from fsync (not all buffers will be clean)
*
* None of the buffers should be dirty, locked, or pinned.
*/
-static void __gfs2_ail_flush(struct gfs2_glock *gl)
+static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct list_head *head = &gl->gl_ail_list;
- struct gfs2_bufdata *bd;
+ struct gfs2_bufdata *bd, *tmp;
struct buffer_head *bh;
+ const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
+ sector_t blocknr;
+ gfs2_log_lock(sdp);
spin_lock(&sdp->sd_ail_lock);
- while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata,
- bd_ail_gl_list);
+ list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) {
bh = bd->bd_bh;
- gfs2_remove_from_ail(bd);
- bd->bd_bh = NULL;
+ if (bh->b_state & b_state) {
+ if (fsync)
+ continue;
+ gfs2_ail_error(gl, bh);
+ }
+ blocknr = bh->b_blocknr;
bh->b_private = NULL;
- spin_unlock(&sdp->sd_ail_lock);
+ gfs2_remove_from_ail(bd); /* drops ref on bh */
- bd->bd_blkno = bh->b_blocknr;
- gfs2_log_lock(sdp);
- gfs2_assert_withdraw(sdp, !buffer_busy(bh));
- gfs2_trans_add_revoke(sdp, bd);
- gfs2_log_unlock(sdp);
+ bd->bd_bh = NULL;
+ bd->bd_blkno = blocknr;
- spin_lock(&sdp->sd_ail_lock);
+ gfs2_trans_add_revoke(sdp, bd);
}
- gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
+ BUG_ON(!fsync && atomic_read(&gl->gl_ail_count));
spin_unlock(&sdp->sd_ail_lock);
+ gfs2_log_unlock(sdp);
}
@@ -84,13 +99,13 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
BUG_ON(current->journal_info);
current->journal_info = &tr;
- __gfs2_ail_flush(gl);
+ __gfs2_ail_flush(gl, 0);
gfs2_trans_end(sdp);
gfs2_log_flush(sdp, NULL);
}
-void gfs2_ail_flush(struct gfs2_glock *gl)
+void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
unsigned int revokes = atomic_read(&gl->gl_ail_count);
@@ -102,7 +117,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl)
ret = gfs2_trans_begin(sdp, 0, revokes);
if (ret)
return;
- __gfs2_ail_flush(gl);
+ __gfs2_ail_flush(gl, fsync);
gfs2_trans_end(sdp);
gfs2_log_flush(sdp, NULL);
}
@@ -119,6 +134,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl)
static void rgrp_go_sync(struct gfs2_glock *gl)
{
struct address_space *metamapping = gfs2_glock2aspace(gl);
+ struct gfs2_rgrpd *rgd;
int error;
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -130,6 +146,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
error = filemap_fdatawait(metamapping);
mapping_set_error(metamapping, error);
gfs2_ail_empty_gl(gl);
+
+ spin_lock(&gl->gl_spin);
+ rgd = gl->gl_object;
+ if (rgd)
+ gfs2_free_clones(rgd);
+ spin_unlock(&gl->gl_spin);
}
/**
@@ -277,7 +299,7 @@ static void gfs2_set_nlink(struct inode *inode, u32 nlink)
if (nlink == 0)
clear_nlink(inode);
else
- inode->i_nlink = nlink;
+ set_nlink(inode, nlink);
}
}
@@ -430,33 +452,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
}
/**
- * rgrp_go_lock - operation done after an rgrp lock is locked by
- * a first holder on this node.
- * @gl: the glock
- * @flags:
- *
- * Returns: errno
- */
-
-static int rgrp_go_lock(struct gfs2_holder *gh)
-{
- return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
-}
-
-/**
- * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
- * a last holder on this node.
- * @gl: the glock
- * @flags:
- *
- */
-
-static void rgrp_go_unlock(struct gfs2_holder *gh)
-{
- gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
-}
-
-/**
* trans_go_sync - promote/demote the transaction glock
* @gl: the glock
* @state: the requested state
@@ -558,8 +553,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_xmote_th = rgrp_go_sync,
.go_inval = rgrp_go_inval,
- .go_lock = rgrp_go_lock,
- .go_unlock = rgrp_go_unlock,
+ .go_lock = gfs2_rgrp_go_lock,
+ .go_unlock = gfs2_rgrp_go_unlock,
.go_dump = gfs2_rgrp_dump,
.go_type = LM_TYPE_RGRP,
.go_flags = GLOF_ASPACE,
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index 6fce409b5a5..bf95a2dc166 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -23,6 +23,6 @@ extern const struct gfs2_glock_operations gfs2_quota_glops;
extern const struct gfs2_glock_operations gfs2_journal_glops;
extern const struct gfs2_glock_operations *gfs2_glops_list[];
-extern void gfs2_ail_flush(struct gfs2_glock *gl);
+extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);
#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 892ac37de8a..7389dfdcc9e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -18,6 +18,7 @@
#include <linux/rcupdate.h>
#include <linux/rculist_bl.h>
#include <linux/completion.h>
+#include <linux/rbtree.h>
#define DIO_WAIT 0x00000010
#define DIO_METADATA 0x00000020
@@ -78,8 +79,7 @@ struct gfs2_bitmap {
};
struct gfs2_rgrpd {
- struct list_head rd_list; /* Link with superblock */
- struct list_head rd_list_mru;
+ struct rb_node rd_node; /* Link with superblock */
struct gfs2_glock *rd_gl; /* Glock for this rgrp */
u64 rd_addr; /* grp block disk address */
u64 rd_data0; /* first data location */
@@ -91,10 +91,7 @@ struct gfs2_rgrpd {
u32 rd_dinodes;
u64 rd_igeneration;
struct gfs2_bitmap *rd_bits;
- struct mutex rd_mutex;
- struct gfs2_log_element rd_le;
struct gfs2_sbd *rd_sbd;
- unsigned int rd_bh_count;
u32 rd_last_alloc;
u32 rd_flags;
#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
@@ -106,12 +103,15 @@ struct gfs2_rgrpd {
enum gfs2_state_bits {
BH_Pinned = BH_PrivateStart,
BH_Escaped = BH_PrivateStart + 1,
+ BH_Zeronew = BH_PrivateStart + 2,
};
BUFFER_FNS(Pinned, pinned)
TAS_BUFFER_FNS(Pinned, pinned)
BUFFER_FNS(Escaped, escaped)
TAS_BUFFER_FNS(Escaped, escaped)
+BUFFER_FNS(Zeronew, zeronew)
+TAS_BUFFER_FNS(Zeronew, zeronew)
struct gfs2_bufdata {
struct buffer_head *bd_bh;
@@ -246,7 +246,6 @@ struct gfs2_glock {
struct gfs2_alloc {
/* Quota stuff */
-
struct gfs2_quota_data *al_qd[2*MAXQUOTAS];
struct gfs2_holder al_qd_ghs[2*MAXQUOTAS];
unsigned int al_qd_num;
@@ -255,18 +254,13 @@ struct gfs2_alloc {
u32 al_alloced; /* Filled in by gfs2_alloc_*() */
/* Filled in by gfs2_inplace_reserve() */
-
- unsigned int al_line;
- char *al_file;
- struct gfs2_holder al_ri_gh;
struct gfs2_holder al_rgd_gh;
- struct gfs2_rgrpd *al_rgd;
-
};
enum {
GIF_INVALID = 0,
GIF_QD_LOCKED = 1,
+ GIF_ALLOC_FAILED = 2,
GIF_SW_PAGED = 3,
};
@@ -282,6 +276,7 @@ struct gfs2_inode {
struct gfs2_holder i_iopen_gh;
struct gfs2_holder i_gh; /* for prepare/commit_write only */
struct gfs2_alloc *i_alloc;
+ struct gfs2_rgrpd *i_rgd;
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
struct list_head i_trunc_list;
@@ -574,9 +569,7 @@ struct gfs2_sbd {
int sd_rindex_uptodate;
spinlock_t sd_rindex_spin;
struct mutex sd_rindex_mutex;
- struct list_head sd_rindex_list;
- struct list_head sd_rindex_mru_list;
- struct gfs2_rgrpd *sd_rindex_forward;
+ struct rb_root sd_rindex_tree;
unsigned int sd_rgrps;
unsigned int sd_max_rg_data;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 900cf986aad..cfd4959b218 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -583,7 +583,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
goto fail_quota_locks;
error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
- al->al_rgd->rd_length +
+ dip->i_rgd->rd_length +
2 * RES_DINODE +
RES_STATFS + RES_QUOTA, 0);
if (error)
@@ -613,8 +613,7 @@ fail_end_trans:
gfs2_trans_end(sdp);
fail_ipreserv:
- if (dip->i_alloc->al_rgd)
- gfs2_inplace_release(dip);
+ gfs2_inplace_release(dip);
fail_quota_locks:
gfs2_quota_unlock(dip);
@@ -624,31 +623,29 @@ fail:
return error;
}
-static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
- const struct qstr *qstr)
+int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *name;
-
- err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
- &name, &value, &len);
-
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
+ const struct xattr *xattr;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = __gfs2_xattr_set(inode, xattr->name, xattr->value,
+ xattr->value_len, 0,
+ GFS2_EATYPE_SECURITY);
+ if (err < 0)
+ break;
}
-
- err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0,
- GFS2_EATYPE_SECURITY);
- kfree(value);
- kfree(name);
-
return err;
}
+static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
+ &gfs2_initxattrs, NULL);
+}
+
/**
* gfs2_create_inode - Create a new inode
* @dir: The parent directory
@@ -663,7 +660,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
unsigned int mode, dev_t dev, const char *symname,
- unsigned int size)
+ unsigned int size, int excl)
{
const struct qstr *name = &dentry->d_name;
struct gfs2_holder ghs[2];
@@ -683,6 +680,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
goto fail;
error = create_ok(dip, name, mode);
+ if ((error == -EEXIST) && S_ISREG(mode) && !excl) {
+ inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+ gfs2_glock_dq_uninit(ghs);
+ d_instantiate(dentry, inode);
+ return IS_ERR(inode) ? PTR_ERR(inode) : 0;
+ }
if (error)
goto fail_gunlock;
@@ -725,21 +728,22 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
brelse(bh);
gfs2_trans_end(sdp);
- if (dip->i_alloc->al_rgd)
- gfs2_inplace_release(dip);
+ gfs2_inplace_release(dip);
gfs2_quota_unlock(dip);
gfs2_alloc_put(dip);
- gfs2_glock_dq_uninit_m(2, ghs);
mark_inode_dirty(inode);
+ gfs2_glock_dq_uninit_m(2, ghs);
d_instantiate(dentry, inode);
return 0;
fail_gunlock2:
gfs2_glock_dq_uninit(ghs + 1);
- if (inode && !IS_ERR(inode))
- iput(inode);
fail_gunlock:
gfs2_glock_dq_uninit(ghs);
+ if (inode && !IS_ERR(inode)) {
+ set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags);
+ iput(inode);
+ }
fail:
if (bh)
brelse(bh);
@@ -758,24 +762,10 @@ fail:
static int gfs2_create(struct inode *dir, struct dentry *dentry,
int mode, struct nameidata *nd)
{
- struct inode *inode;
- int ret;
-
- for (;;) {
- ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0);
- if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL)))
- return ret;
-
- inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- if (inode) {
- if (!IS_ERR(inode))
- break;
- return PTR_ERR(inode);
- }
- }
-
- d_instantiate(dentry, inode);
- return 0;
+ int excl = 0;
+ if (nd && (nd->flags & LOOKUP_EXCL))
+ excl = 1;
+ return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl);
}
/**
@@ -902,7 +892,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
goto out_gunlock_q;
error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
- gfs2_rg_blocks(al) +
+ gfs2_rg_blocks(dip) +
2 * RES_DINODE + RES_STATFS +
RES_QUOTA, 0);
if (error)
@@ -924,8 +914,9 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
inc_nlink(&ip->i_inode);
ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_dinode_out(ip, dibh->b_data);
- mark_inode_dirty(&ip->i_inode);
+ ihold(inode);
+ d_instantiate(dentry, inode);
+ mark_inode_dirty(inode);
out_brelse:
brelse(dibh);
@@ -947,11 +938,6 @@ out_child:
out_parent:
gfs2_holder_uninit(ghs);
gfs2_holder_uninit(ghs + 1);
- if (!error) {
- ihold(inode);
- d_instantiate(dentry, inode);
- mark_inode_dirty(inode);
- }
return error;
}
@@ -1024,8 +1010,6 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip,
clear_nlink(inode);
else
drop_nlink(inode);
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
- gfs2_dinode_out(ip, bh->b_data);
mark_inode_dirty(inode);
if (inode->i_nlink == 0)
gfs2_unlink_di(inode);
@@ -1053,13 +1037,8 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
struct buffer_head *bh;
struct gfs2_holder ghs[3];
struct gfs2_rgrpd *rgd;
- struct gfs2_holder ri_gh;
int error;
- error = gfs2_rindex_hold(sdp, &ri_gh);
- if (error)
- return error;
-
gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
@@ -1116,7 +1095,6 @@ out_child:
gfs2_glock_dq(ghs);
out_parent:
gfs2_holder_uninit(ghs);
- gfs2_glock_dq_uninit(&ri_gh);
return error;
}
@@ -1139,7 +1117,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
return -ENAMETOOLONG;
- return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size);
+ return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0);
}
/**
@@ -1153,7 +1131,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
- return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0);
+ return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0);
}
/**
@@ -1168,7 +1146,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
dev_t dev)
{
- return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0);
+ return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0);
}
/*
@@ -1234,7 +1212,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
struct gfs2_inode *nip = NULL;
struct gfs2_sbd *sdp = GFS2_SB(odir);
- struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
+ struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
struct gfs2_rgrpd *nrgd;
unsigned int num_gh;
int dir_rename = 0;
@@ -1248,10 +1226,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
return 0;
}
- error = gfs2_rindex_hold(sdp, &ri_gh);
- if (error)
- return error;
-
if (odip != ndip) {
error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
0, &r_gh);
@@ -1388,12 +1362,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
al->al_requested = sdp->sd_max_dirres;
- error = gfs2_inplace_reserve_ri(ndip);
+ error = gfs2_inplace_reserve(ndip);
if (error)
goto out_gunlock_q;
error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
- gfs2_rg_blocks(al) +
+ gfs2_rg_blocks(ndip) +
4 * RES_DINODE + 4 * RES_LEAF +
RES_STATFS + RES_QUOTA + 4, 0);
if (error)
@@ -1459,7 +1433,6 @@ out_gunlock_r:
if (r_gh.gh_gl)
gfs2_glock_dq_uninit(&r_gh);
out:
- gfs2_glock_dq_uninit(&ri_gh);
return error;
}
@@ -1563,21 +1536,10 @@ int gfs2_permission(struct inode *inode, int mask)
return error;
}
-static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr)
{
- struct inode *inode = &ip->i_inode;
- struct buffer_head *dibh;
- int error;
-
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- return error;
-
setattr_copy(inode, attr);
mark_inode_dirty(inode);
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
return 0;
}
@@ -1589,19 +1551,19 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
* Returns: errno
*/
-int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+int gfs2_setattr_simple(struct inode *inode, struct iattr *attr)
{
int error;
if (current->journal_info)
- return __gfs2_setattr_simple(ip, attr);
+ return __gfs2_setattr_simple(inode, attr);
- error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0);
+ error = gfs2_trans_begin(GFS2_SB(inode), RES_DINODE, 0);
if (error)
return error;
- error = __gfs2_setattr_simple(ip, attr);
- gfs2_trans_end(GFS2_SB(&ip->i_inode));
+ error = __gfs2_setattr_simple(inode, attr);
+ gfs2_trans_end(GFS2_SB(inode));
return error;
}
@@ -1639,7 +1601,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (error)
goto out_gunlock_q;
- error = gfs2_setattr_simple(ip, attr);
+ error = gfs2_setattr_simple(inode, attr);
if (error)
goto out_end_trans;
@@ -1695,12 +1657,12 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
error = gfs2_acl_chmod(ip, attr);
else
- error = gfs2_setattr_simple(ip, attr);
+ error = gfs2_setattr_simple(inode, attr);
out:
- gfs2_glock_dq_uninit(&i_gh);
if (!error)
mark_inode_dirty(inode);
+ gfs2_glock_dq_uninit(&i_gh);
return error;
}
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 8d90e0c0767..276e7b52b65 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,7 +109,7 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip);
extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
int is_root);
extern int gfs2_permission(struct inode *inode, int mask);
-extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
+extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 05bbb124699..0301be655b1 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -60,6 +60,29 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
trace_gfs2_pin(bd, 1);
}
+static bool buffer_is_rgrp(const struct gfs2_bufdata *bd)
+{
+ return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP;
+}
+
+static void maybe_release_space(struct gfs2_bufdata *bd)
+{
+ struct gfs2_glock *gl = bd->bd_gl;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_rgrpd *rgd = gl->gl_object;
+ unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
+ struct gfs2_bitmap *bi = rgd->rd_bits + index;
+
+ if (bi->bi_clone == 0)
+ return;
+ if (sdp->sd_args.ar_discard)
+ gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi);
+ memcpy(bi->bi_clone + bi->bi_offset,
+ bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
+ clear_bit(GBF_FULL, &bi->bi_flags);
+ rgd->rd_free_clone = rgd->rd_free;
+}
+
/**
* gfs2_unpin - Unpin a buffer
* @sdp: the filesystem the buffer belongs to
@@ -81,6 +104,9 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
mark_buffer_dirty(bh);
clear_buffer_pinned(bh);
+ if (buffer_is_rgrp(bd))
+ maybe_release_space(bd);
+
spin_lock(&sdp->sd_ail_lock);
if (bd->bd_ail) {
list_del(&bd->bd_ail_st_list);
@@ -469,42 +495,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
gfs2_revoke_clean(sdp);
}
-static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
-{
- struct gfs2_rgrpd *rgd;
- struct gfs2_trans *tr = current->journal_info;
-
- tr->tr_touched = 1;
-
- rgd = container_of(le, struct gfs2_rgrpd, rd_le);
-
- gfs2_log_lock(sdp);
- if (!list_empty(&le->le_list)){
- gfs2_log_unlock(sdp);
- return;
- }
- gfs2_rgrp_bh_hold(rgd);
- sdp->sd_log_num_rg++;
- list_add(&le->le_list, &sdp->sd_log_le_rg);
- gfs2_log_unlock(sdp);
-}
-
-static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
-{
- struct list_head *head = &sdp->sd_log_le_rg;
- struct gfs2_rgrpd *rgd;
-
- while (!list_empty(head)) {
- rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
- list_del_init(&rgd->rd_le.le_list);
- sdp->sd_log_num_rg--;
-
- gfs2_rgrp_repolish_clones(rgd);
- gfs2_rgrp_bh_put(rgd);
- }
- gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
-}
-
/**
* databuf_lo_add - Add a databuf to the transaction.
*
@@ -705,8 +695,6 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
brelse(bh_log);
brelse(bh_ip);
- if (error)
- break;
sdp->sd_replayed_blocks++;
}
@@ -771,8 +759,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = {
};
const struct gfs2_log_operations gfs2_rg_lops = {
- .lo_add = rg_lo_add,
- .lo_after_commit = rg_lo_after_commit,
.lo_name = "rg",
};
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 079587e5384..cb23c2be731 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -14,6 +14,7 @@
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/kthread.h>
+#include <linux/export.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/gfs2_ondisk.h>
@@ -77,8 +78,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
spin_lock_init(&sdp->sd_rindex_spin);
mutex_init(&sdp->sd_rindex_mutex);
- INIT_LIST_HEAD(&sdp->sd_rindex_list);
- INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
+ sdp->sd_rindex_tree.rb_node = NULL;
INIT_LIST_HEAD(&sdp->sd_jindex_list);
spin_lock_init(&sdp->sd_jindex_spin);
@@ -652,7 +652,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
fs_err(sdp, "can't lookup journal index: %d\n", error);
return PTR_ERR(sdp->sd_jindex);
}
- ip = GFS2_I(sdp->sd_jindex);
/* Load in the journal index special file */
@@ -764,7 +763,6 @@ fail:
static int init_inodes(struct gfs2_sbd *sdp, int undo)
{
int error = 0;
- struct gfs2_inode *ip;
struct inode *master = sdp->sd_master_dir->d_inode;
if (undo)
@@ -789,7 +787,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
fs_err(sdp, "can't get resource index inode: %d\n", error);
goto fail_statfs;
}
- ip = GFS2_I(sdp->sd_rindex);
sdp->sd_rindex_uptodate = 0;
/* Read in the quota inode */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 0e8bb13381e..7e528dc14f8 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -638,15 +638,18 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
unsigned long index = loc >> PAGE_CACHE_SHIFT;
unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
unsigned blocksize, iblock, pos;
- struct buffer_head *bh, *dibh;
+ struct buffer_head *bh;
struct page *page;
void *kaddr, *ptr;
struct gfs2_quota q, *qp;
int err, nbytes;
u64 size;
- if (gfs2_is_stuffed(ip))
- gfs2_unstuff_dinode(ip, NULL);
+ if (gfs2_is_stuffed(ip)) {
+ err = gfs2_unstuff_dinode(ip, NULL);
+ if (err)
+ return err;
+ }
memset(&q, 0, sizeof(struct gfs2_quota));
err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
@@ -736,22 +739,13 @@ get_a_page:
goto get_a_page;
}
- /* Update the disk inode timestamp and size (if extended) */
- err = gfs2_meta_inode_buffer(ip, &dibh);
- if (err)
- goto out;
-
size = loc + sizeof(struct gfs2_quota);
if (size > inode->i_size)
i_size_write(inode, size);
inode->i_mtime = inode->i_atime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
mark_inode_dirty(inode);
-
-out:
return err;
+
unlock_out:
unlock_page(page);
page_cache_release(page);
@@ -822,7 +816,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
goto out_alloc;
if (nalloc)
- blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
+ blocks += gfs2_rg_blocks(ip) + nalloc * ind_blocks + RES_STATFS;
error = gfs2_trans_begin(sdp, blocks, 0);
if (error)
@@ -936,7 +930,9 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
unsigned int x;
int error = 0;
- gfs2_quota_hold(ip, uid, gid);
+ error = gfs2_quota_hold(ip, uid, gid);
+ if (error)
+ return error;
if (capable(CAP_SYS_RESOURCE) ||
sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
@@ -1607,7 +1603,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
error = gfs2_inplace_reserve(ip);
if (error)
goto out_alloc;
- blocks += gfs2_rg_blocks(al);
+ blocks += gfs2_rg_blocks(ip);
}
/* Some quotas span block boundaries and can update two blocks,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7f8af1eb02d..96bd6d759f2 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -15,6 +15,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/prefetch.h>
#include <linux/blkdev.h>
+#include <linux/rbtree.h>
#include "gfs2.h"
#include "incore.h"
@@ -328,18 +329,22 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
{
- struct gfs2_rgrpd *rgd;
+ struct rb_node **newn;
+ struct gfs2_rgrpd *cur;
spin_lock(&sdp->sd_rindex_spin);
-
- list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
- if (rgrp_contains_block(rgd, blk)) {
- list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+ newn = &sdp->sd_rindex_tree.rb_node;
+ while (*newn) {
+ cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node);
+ if (blk < cur->rd_addr)
+ newn = &((*newn)->rb_left);
+ else if (blk >= cur->rd_data0 + cur->rd_data)
+ newn = &((*newn)->rb_right);
+ else {
spin_unlock(&sdp->sd_rindex_spin);
- return rgd;
+ return cur;
}
}
-
spin_unlock(&sdp->sd_rindex_spin);
return NULL;
@@ -354,8 +359,15 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
{
- gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
- return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
+ const struct rb_node *n;
+ struct gfs2_rgrpd *rgd;
+
+ spin_lock(&sdp->sd_rindex_spin);
+ n = rb_first(&sdp->sd_rindex_tree);
+ rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
+ spin_unlock(&sdp->sd_rindex_spin);
+
+ return rgd;
}
/**
@@ -367,47 +379,60 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
{
- if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ const struct rb_node *n;
+
+ spin_lock(&sdp->sd_rindex_spin);
+ n = rb_next(&rgd->rd_node);
+ if (n == NULL)
+ n = rb_first(&sdp->sd_rindex_tree);
+
+ if (unlikely(&rgd->rd_node == n)) {
+ spin_unlock(&sdp->sd_rindex_spin);
return NULL;
- return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
+ }
+ rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
+ spin_unlock(&sdp->sd_rindex_spin);
+ return rgd;
}
-static void clear_rgrpdi(struct gfs2_sbd *sdp)
+void gfs2_free_clones(struct gfs2_rgrpd *rgd)
{
- struct list_head *head;
+ int x;
+
+ for (x = 0; x < rgd->rd_length; x++) {
+ struct gfs2_bitmap *bi = rgd->rd_bits + x;
+ kfree(bi->bi_clone);
+ bi->bi_clone = NULL;
+ }
+}
+
+void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
+{
+ struct rb_node *n;
struct gfs2_rgrpd *rgd;
struct gfs2_glock *gl;
- spin_lock(&sdp->sd_rindex_spin);
- sdp->sd_rindex_forward = NULL;
- spin_unlock(&sdp->sd_rindex_spin);
-
- head = &sdp->sd_rindex_list;
- while (!list_empty(head)) {
- rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
+ while ((n = rb_first(&sdp->sd_rindex_tree))) {
+ rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
gl = rgd->rd_gl;
- list_del(&rgd->rd_list);
- list_del(&rgd->rd_list_mru);
+ rb_erase(n, &sdp->sd_rindex_tree);
if (gl) {
+ spin_lock(&gl->gl_spin);
gl->gl_object = NULL;
+ spin_unlock(&gl->gl_spin);
gfs2_glock_add_to_lru(gl);
gfs2_glock_put(gl);
}
+ gfs2_free_clones(rgd);
kfree(rgd->rd_bits);
kmem_cache_free(gfs2_rgrpd_cachep, rgd);
}
}
-void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
-{
- mutex_lock(&sdp->sd_rindex_mutex);
- clear_rgrpdi(sdp);
- mutex_unlock(&sdp->sd_rindex_mutex);
-}
-
static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
{
printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
@@ -524,22 +549,34 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
return total_data;
}
-static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf)
+static void rgd_insert(struct gfs2_rgrpd *rgd)
{
- const struct gfs2_rindex *str = buf;
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*newn) {
+ struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
+ rd_node);
+
+ parent = *newn;
+ if (rgd->rd_addr < cur->rd_addr)
+ newn = &((*newn)->rb_left);
+ else if (rgd->rd_addr > cur->rd_addr)
+ newn = &((*newn)->rb_right);
+ else
+ return;
+ }
- rgd->rd_addr = be64_to_cpu(str->ri_addr);
- rgd->rd_length = be32_to_cpu(str->ri_length);
- rgd->rd_data0 = be64_to_cpu(str->ri_data0);
- rgd->rd_data = be32_to_cpu(str->ri_data);
- rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes);
+ rb_link_node(&rgd->rd_node, parent, newn);
+ rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
}
/**
* read_rindex_entry - Pull in a new resource index entry from the disk
* @gl: The glock covering the rindex inode
*
- * Returns: 0 on success, error code otherwise
+ * Returns: 0 on success, > 0 on EOF, error code otherwise
*/
static int read_rindex_entry(struct gfs2_inode *ip,
@@ -547,44 +584,53 @@ static int read_rindex_entry(struct gfs2_inode *ip,
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
- char buf[sizeof(struct gfs2_rindex)];
+ struct gfs2_rindex buf;
int error;
struct gfs2_rgrpd *rgd;
- error = gfs2_internal_read(ip, ra_state, buf, &pos,
+ if (pos >= i_size_read(&ip->i_inode))
+ return 1;
+
+ error = gfs2_internal_read(ip, ra_state, (char *)&buf, &pos,
sizeof(struct gfs2_rindex));
- if (!error)
- return 0;
- if (error != sizeof(struct gfs2_rindex)) {
- if (error > 0)
- error = -EIO;
- return error;
- }
+
+ if (error != sizeof(struct gfs2_rindex))
+ return (error == 0) ? 1 : error;
rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
error = -ENOMEM;
if (!rgd)
return error;
- mutex_init(&rgd->rd_mutex);
- lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
rgd->rd_sbd = sdp;
+ rgd->rd_addr = be64_to_cpu(buf.ri_addr);
+ rgd->rd_length = be32_to_cpu(buf.ri_length);
+ rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
+ rgd->rd_data = be32_to_cpu(buf.ri_data);
+ rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
- list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
- list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
-
- gfs2_rindex_in(rgd, buf);
error = compute_bitstructs(rgd);
if (error)
- return error;
+ goto fail;
error = gfs2_glock_get(sdp, rgd->rd_addr,
&gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
if (error)
- return error;
+ goto fail;
rgd->rd_gl->gl_object = rgd;
rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+ if (rgd->rd_data > sdp->sd_max_rg_data)
+ sdp->sd_max_rg_data = rgd->rd_data;
+ spin_lock(&sdp->sd_rindex_spin);
+ rgd_insert(rgd);
+ sdp->sd_rgrps++;
+ spin_unlock(&sdp->sd_rindex_spin);
+ return error;
+
+fail:
+ kfree(rgd->rd_bits);
+ kmem_cache_free(gfs2_rgrpd_cachep, rgd);
return error;
}
@@ -595,40 +641,28 @@ static int read_rindex_entry(struct gfs2_inode *ip,
* Returns: 0 on successful update, error code otherwise
*/
-int gfs2_ri_update(struct gfs2_inode *ip)
+static int gfs2_ri_update(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct inode *inode = &ip->i_inode;
struct file_ra_state ra_state;
- u64 rgrp_count = i_size_read(inode);
- struct gfs2_rgrpd *rgd;
- unsigned int max_data = 0;
int error;
- do_div(rgrp_count, sizeof(struct gfs2_rindex));
- clear_rgrpdi(sdp);
-
file_ra_state_init(&ra_state, inode->i_mapping);
- for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) {
+ do {
error = read_rindex_entry(ip, &ra_state);
- if (error) {
- clear_rgrpdi(sdp);
- return error;
- }
- }
+ } while (error == 0);
+
+ if (error < 0)
+ return error;
- list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
- if (rgd->rd_data > max_data)
- max_data = rgd->rd_data;
- sdp->sd_max_rg_data = max_data;
sdp->sd_rindex_uptodate = 1;
return 0;
}
/**
- * gfs2_rindex_hold - Grab a lock on the rindex
+ * gfs2_rindex_update - Update the rindex if required
* @sdp: The GFS2 superblock
- * @ri_gh: the glock holder
*
* We grab a lock on the rindex inode to make sure that it doesn't
* change whilst we are performing an operation. We keep this lock
@@ -640,30 +674,29 @@ int gfs2_ri_update(struct gfs2_inode *ip)
* special file, which might have been updated if someone expanded the
* filesystem (via gfs2_grow utility), which adds new resource groups.
*
- * Returns: 0 on success, error code otherwise
+ * Returns: 0 on succeess, error code otherwise
*/
-int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
+int gfs2_rindex_update(struct gfs2_sbd *sdp)
{
struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
struct gfs2_glock *gl = ip->i_gl;
- int error;
-
- error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
- if (error)
- return error;
+ struct gfs2_holder ri_gh;
+ int error = 0;
/* Read new copy from disk if we don't have the latest */
if (!sdp->sd_rindex_uptodate) {
mutex_lock(&sdp->sd_rindex_mutex);
- if (!sdp->sd_rindex_uptodate) {
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
+ if (error)
+ return error;
+ if (!sdp->sd_rindex_uptodate)
error = gfs2_ri_update(ip);
- if (error)
- gfs2_glock_dq_uninit(ri_gh);
- }
+ gfs2_glock_dq_uninit(&ri_gh);
mutex_unlock(&sdp->sd_rindex_mutex);
}
+
return error;
}
@@ -694,7 +727,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
}
/**
- * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
+ * gfs2_rgrp_go_lock - Read in a RG's header and bitmaps
* @rgd: the struct gfs2_rgrpd describing the RG to read in
*
* Read in all of a Resource Group's header and bitmap blocks.
@@ -703,8 +736,9 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
* Returns: errno
*/
-int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
+int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
{
+ struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
struct gfs2_sbd *sdp = rgd->rd_sbd;
struct gfs2_glock *gl = rgd->rd_gl;
unsigned int length = rgd->rd_length;
@@ -712,17 +746,6 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
unsigned int x, y;
int error;
- mutex_lock(&rgd->rd_mutex);
-
- spin_lock(&sdp->sd_rindex_spin);
- if (rgd->rd_bh_count) {
- rgd->rd_bh_count++;
- spin_unlock(&sdp->sd_rindex_spin);
- mutex_unlock(&rgd->rd_mutex);
- return 0;
- }
- spin_unlock(&sdp->sd_rindex_spin);
-
for (x = 0; x < length; x++) {
bi = rgd->rd_bits + x;
error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh);
@@ -747,15 +770,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
+ rgd->rd_free_clone = rgd->rd_free;
}
- spin_lock(&sdp->sd_rindex_spin);
- rgd->rd_free_clone = rgd->rd_free;
- rgd->rd_bh_count++;
- spin_unlock(&sdp->sd_rindex_spin);
-
- mutex_unlock(&rgd->rd_mutex);
-
return 0;
fail:
@@ -765,52 +782,32 @@ fail:
bi->bi_bh = NULL;
gfs2_assert_warn(sdp, !bi->bi_clone);
}
- mutex_unlock(&rgd->rd_mutex);
return error;
}
-void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
-{
- struct gfs2_sbd *sdp = rgd->rd_sbd;
-
- spin_lock(&sdp->sd_rindex_spin);
- gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
- rgd->rd_bh_count++;
- spin_unlock(&sdp->sd_rindex_spin);
-}
-
/**
- * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
+ * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get()
* @rgd: the struct gfs2_rgrpd describing the RG to read in
*
*/
-void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
+void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
{
- struct gfs2_sbd *sdp = rgd->rd_sbd;
+ struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
int x, length = rgd->rd_length;
- spin_lock(&sdp->sd_rindex_spin);
- gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
- if (--rgd->rd_bh_count) {
- spin_unlock(&sdp->sd_rindex_spin);
- return;
- }
-
for (x = 0; x < length; x++) {
struct gfs2_bitmap *bi = rgd->rd_bits + x;
- kfree(bi->bi_clone);
- bi->bi_clone = NULL;
brelse(bi->bi_bh);
bi->bi_bh = NULL;
}
- spin_unlock(&sdp->sd_rindex_spin);
}
-static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
- const struct gfs2_bitmap *bi)
+void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+ struct buffer_head *bh,
+ const struct gfs2_bitmap *bi)
{
struct super_block *sb = sdp->sd_vfs;
struct block_device *bdev = sb->s_bdev;
@@ -823,7 +820,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
unsigned int x;
for (x = 0; x < bi->bi_len; x++) {
- const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x;
+ const u8 *orig = bh->b_data + bi->bi_offset + x;
const u8 *clone = bi->bi_clone + bi->bi_offset + x;
u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
diff &= 0x55;
@@ -862,28 +859,6 @@ fail:
sdp->sd_args.ar_discard = 0;
}
-void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
-{
- struct gfs2_sbd *sdp = rgd->rd_sbd;
- unsigned int length = rgd->rd_length;
- unsigned int x;
-
- for (x = 0; x < length; x++) {
- struct gfs2_bitmap *bi = rgd->rd_bits + x;
- if (!bi->bi_clone)
- continue;
- if (sdp->sd_args.ar_discard)
- gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
- clear_bit(GBF_FULL, &bi->bi_flags);
- memcpy(bi->bi_clone + bi->bi_offset,
- bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
- }
-
- spin_lock(&sdp->sd_rindex_spin);
- rgd->rd_free_clone = rgd->rd_free;
- spin_unlock(&sdp->sd_rindex_spin);
-}
-
/**
* gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
* @ip: the incore GFS2 inode structure
@@ -893,38 +868,35 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ int error;
BUG_ON(ip->i_alloc != NULL);
ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS);
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ fs_warn(sdp, "rindex update returns %d\n", error);
return ip->i_alloc;
}
/**
* try_rgrp_fit - See if a given reservation will fit in a given RG
* @rgd: the RG data
- * @al: the struct gfs2_alloc structure describing the reservation
+ * @ip: the inode
*
* If there's room for the requested blocks to be allocated from the RG:
- * Sets the $al_rgd field in @al.
*
* Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
*/
-static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
+static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
{
- struct gfs2_sbd *sdp = rgd->rd_sbd;
- int ret = 0;
+ const struct gfs2_alloc *al = ip->i_alloc;
if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
return 0;
-
- spin_lock(&sdp->sd_rindex_spin);
- if (rgd->rd_free_clone >= al->al_requested) {
- al->al_rgd = rgd;
- ret = 1;
- }
- spin_unlock(&sdp->sd_rindex_spin);
-
- return ret;
+ if (rgd->rd_free_clone >= al->al_requested)
+ return 1;
+ return 0;
}
/**
@@ -992,76 +964,6 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
}
/**
- * recent_rgrp_next - get next RG from "recent" list
- * @cur_rgd: current rgrp
- *
- * Returns: The next rgrp in the recent list
- */
-
-static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd)
-{
- struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
- struct list_head *head;
- struct gfs2_rgrpd *rgd;
-
- spin_lock(&sdp->sd_rindex_spin);
- head = &sdp->sd_rindex_mru_list;
- if (unlikely(cur_rgd->rd_list_mru.next == head)) {
- spin_unlock(&sdp->sd_rindex_spin);
- return NULL;
- }
- rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru);
- spin_unlock(&sdp->sd_rindex_spin);
- return rgd;
-}
-
-/**
- * forward_rgrp_get - get an rgrp to try next from full list
- * @sdp: The GFS2 superblock
- *
- * Returns: The rgrp to try next
- */
-
-static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
-{
- struct gfs2_rgrpd *rgd;
- unsigned int journals = gfs2_jindex_size(sdp);
- unsigned int rg = 0, x;
-
- spin_lock(&sdp->sd_rindex_spin);
-
- rgd = sdp->sd_rindex_forward;
- if (!rgd) {
- if (sdp->sd_rgrps >= journals)
- rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
-
- for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg;
- x++, rgd = gfs2_rgrpd_get_next(rgd))
- /* Do Nothing */;
-
- sdp->sd_rindex_forward = rgd;
- }
-
- spin_unlock(&sdp->sd_rindex_spin);
-
- return rgd;
-}
-
-/**
- * forward_rgrp_set - set the forward rgrp pointer
- * @sdp: the filesystem
- * @rgd: The new forward rgrp
- *
- */
-
-static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
-{
- spin_lock(&sdp->sd_rindex_spin);
- sdp->sd_rindex_forward = rgd;
- spin_unlock(&sdp->sd_rindex_spin);
-}
-
-/**
* get_local_rgrp - Choose and lock a rgrp for allocation
* @ip: the inode to reserve space for
* @rgp: the chosen and locked rgrp
@@ -1076,14 +978,18 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *rgd, *begin = NULL;
struct gfs2_alloc *al = ip->i_alloc;
- int flags = LM_FLAG_TRY;
- int skipped = 0;
- int loops = 0;
int error, rg_locked;
+ int loops = 0;
+
+ if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
+ rgd = begin = ip->i_rgd;
+ else
+ rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal);
- rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
+ if (rgd == NULL)
+ return -EBADSLT;
- while (rgd) {
+ while (loops < 3) {
rg_locked = 0;
if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
@@ -1095,92 +1001,36 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
}
switch (error) {
case 0:
- if (try_rgrp_fit(rgd, al))
- goto out;
+ if (try_rgrp_fit(rgd, ip)) {
+ ip->i_rgd = rgd;
+ return 0;
+ }
if (rgd->rd_flags & GFS2_RDF_CHECK)
try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
if (!rg_locked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
/* fall through */
case GLR_TRYFAILED:
- rgd = recent_rgrp_next(rgd);
- break;
-
- default:
- return error;
- }
- }
-
- /* Go through full list of rgrps */
-
- begin = rgd = forward_rgrp_get(sdp);
-
- for (;;) {
- rg_locked = 0;
-
- if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
- rg_locked = 1;
- error = 0;
- } else {
- error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
- &al->al_rgd_gh);
- }
- switch (error) {
- case 0:
- if (try_rgrp_fit(rgd, al))
- goto out;
- if (rgd->rd_flags & GFS2_RDF_CHECK)
- try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
- if (!rg_locked)
- gfs2_glock_dq_uninit(&al->al_rgd_gh);
- break;
-
- case GLR_TRYFAILED:
- skipped++;
+ rgd = gfs2_rgrpd_get_next(rgd);
+ if (rgd == begin)
+ loops++;
break;
-
default:
return error;
}
-
- rgd = gfs2_rgrpd_get_next(rgd);
- if (!rgd)
- rgd = gfs2_rgrpd_get_first(sdp);
-
- if (rgd == begin) {
- if (++loops >= 3)
- return -ENOSPC;
- if (!skipped)
- loops++;
- flags = 0;
- if (loops == 2)
- gfs2_log_flush(sdp, NULL);
- }
}
-out:
- if (begin) {
- spin_lock(&sdp->sd_rindex_spin);
- list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
- spin_unlock(&sdp->sd_rindex_spin);
- rgd = gfs2_rgrpd_get_next(rgd);
- if (!rgd)
- rgd = gfs2_rgrpd_get_first(sdp);
- forward_rgrp_set(sdp, rgd);
- }
-
- return 0;
+ return -ENOSPC;
}
/**
- * gfs2_inplace_reserve_i - Reserve space in the filesystem
+ * gfs2_inplace_reserve - Reserve space in the filesystem
* @ip: the inode to reserve space for
*
* Returns: errno
*/
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
- char *file, unsigned int line)
+int gfs2_inplace_reserve(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_alloc *al = ip->i_alloc;
@@ -1191,45 +1041,22 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
if (gfs2_assert_warn(sdp, al->al_requested))
return -EINVAL;
- if (hold_rindex) {
- /* We need to hold the rindex unless the inode we're using is
- the rindex itself, in which case it's already held. */
- if (ip != GFS2_I(sdp->sd_rindex))
- error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
- else if (!sdp->sd_rgrps) /* We may not have the rindex read
- in, so: */
- error = gfs2_ri_update(ip);
- if (error)
- return error;
- }
-
-try_again:
do {
error = get_local_rgrp(ip, &last_unlinked);
- /* If there is no space, flushing the log may release some */
- if (error) {
- if (ip == GFS2_I(sdp->sd_rindex) &&
- !sdp->sd_rindex_uptodate) {
- error = gfs2_ri_update(ip);
- if (error)
- return error;
- goto try_again;
- }
- gfs2_log_flush(sdp, NULL);
+ if (error != -ENOSPC)
+ break;
+ /* Check that fs hasn't grown if writing to rindex */
+ if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
+ error = gfs2_ri_update(ip);
+ if (error)
+ break;
+ continue;
}
- } while (error && tries++ < 3);
-
- if (error) {
- if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
- gfs2_glock_dq_uninit(&al->al_ri_gh);
- return error;
- }
-
- /* no error, so we have the rgrp set in the inode's allocation. */
- al->al_file = file;
- al->al_line = line;
+ /* Flushing the log may release space */
+ gfs2_log_flush(sdp, NULL);
+ } while (tries++ < 3);
- return 0;
+ return error;
}
/**
@@ -1241,20 +1068,10 @@ try_again:
void gfs2_inplace_release(struct gfs2_inode *ip)
{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_alloc *al = ip->i_alloc;
- if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
- fs_warn(sdp, "al_alloced = %u, al_requested = %u "
- "al_file = %s, al_line = %u\n",
- al->al_alloced, al->al_requested, al->al_file,
- al->al_line);
-
- al->al_rgd = NULL;
if (al->al_rgd_gh.gh_gl)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
- if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
- gfs2_glock_dq_uninit(&al->al_ri_gh);
}
/**
@@ -1352,6 +1169,7 @@ do_search:
/* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
bitmaps, so we must search the originals for that. */
buffer = bi->bi_bh->b_data + bi->bi_offset;
+ WARN_ON(!buffer_uptodate(bi->bi_bh));
if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
buffer = bi->bi_clone + bi->bi_offset;
@@ -1371,6 +1189,7 @@ skip:
if (blk == BFITNOENT)
return blk;
+
*n = 1;
if (old_state == new_state)
goto out;
@@ -1503,7 +1322,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
if (al == NULL)
return -ECANCELED;
- rgd = al->al_rgd;
+ rgd = ip->i_rgd;
if (rgrp_contains_block(rgd, ip->i_goal))
goal = ip->i_goal - rgd->rd_data0;
@@ -1518,7 +1337,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
rgd->rd_last_alloc = blk;
block = rgd->rd_data0 + blk;
- ip->i_goal = block;
+ ip->i_goal = block + *n - 1;
error = gfs2_meta_inode_buffer(ip, &dibh);
if (error == 0) {
struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
@@ -1539,9 +1358,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
gfs2_statfs_change(sdp, 0, -(s64)*n, 0);
gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid);
- spin_lock(&sdp->sd_rindex_spin);
rgd->rd_free_clone -= *n;
- spin_unlock(&sdp->sd_rindex_spin);
trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED);
*bn = block;
return 0;
@@ -1564,7 +1381,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_alloc *al = dip->i_alloc;
- struct gfs2_rgrpd *rgd = al->al_rgd;
+ struct gfs2_rgrpd *rgd = dip->i_rgd;
u32 blk;
u64 block;
unsigned int n = 1;
@@ -1594,9 +1411,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
gfs2_statfs_change(sdp, 0, -1, +1);
gfs2_trans_add_unrevoke(sdp, block, 1);
- spin_lock(&sdp->sd_rindex_spin);
rgd->rd_free_clone--;
- spin_unlock(&sdp->sd_rindex_spin);
trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
*bn = block;
return 0;
@@ -1629,8 +1444,6 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
- gfs2_trans_add_rg(rgd);
-
/* Directories keep their data in the metadata address space */
if (meta || ip->i_depth)
gfs2_meta_wipe(ip, bstart, blen);
@@ -1666,7 +1479,6 @@ void gfs2_unlink_di(struct inode *inode)
trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED);
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
- gfs2_trans_add_rg(rgd);
}
static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
@@ -1688,7 +1500,6 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
gfs2_statfs_change(sdp, 0, +1, -1);
- gfs2_trans_add_rg(rgd);
}
@@ -1714,41 +1525,33 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
{
struct gfs2_rgrpd *rgd;
- struct gfs2_holder ri_gh, rgd_gh;
- struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
- int ri_locked = 0;
+ struct gfs2_holder rgd_gh;
int error;
- if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
- error = gfs2_rindex_hold(sdp, &ri_gh);
- if (error)
- goto fail;
- ri_locked = 1;
- }
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
error = -EINVAL;
rgd = gfs2_blk2rgrpd(sdp, no_addr);
if (!rgd)
- goto fail_rindex;
+ goto fail;
error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
if (error)
- goto fail_rindex;
+ goto fail;
if (gfs2_get_block_type(rgd, no_addr) != type)
error = -ESTALE;
gfs2_glock_dq_uninit(&rgd_gh);
-fail_rindex:
- if (ri_locked)
- gfs2_glock_dq_uninit(&ri_gh);
fail:
return error;
}
/**
* gfs2_rlist_add - add a RG to a list of RGs
- * @sdp: the filesystem
+ * @ip: the inode
* @rlist: the list of resource groups
* @block: the block
*
@@ -1758,9 +1561,10 @@ fail:
*
*/
-void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
u64 block)
{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *rgd;
struct gfs2_rgrpd **tmp;
unsigned int new_space;
@@ -1769,12 +1573,15 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
return;
- rgd = gfs2_blk2rgrpd(sdp, block);
+ if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block))
+ rgd = ip->i_rgd;
+ else
+ rgd = gfs2_blk2rgrpd(sdp, block);
if (!rgd) {
- if (gfs2_consist(sdp))
- fs_err(sdp, "block = %llu\n", (unsigned long long)block);
+ fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block);
return;
}
+ ip->i_rgd = rgd;
for (x = 0; x < rlist->rl_rgrps; x++)
if (rlist->rl_rgd[x] == rgd)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index d253f9a8c70..cf5c5018019 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -18,18 +18,15 @@ struct gfs2_holder;
extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
-struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
-struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
-struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
+extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
+extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
+extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
-extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
-
-extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
-extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
-extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
-
-extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
+extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
+extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
+extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
+extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
static inline void gfs2_alloc_put(struct gfs2_inode *ip)
@@ -39,16 +36,9 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
ip->i_alloc = NULL;
}
-extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
- char *file, unsigned int line);
-#define gfs2_inplace_reserve(ip) \
- gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
-#define gfs2_inplace_reserve_ri(ip) \
- gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
-
+extern int gfs2_inplace_reserve(struct gfs2_inode *ip);
extern void gfs2_inplace_release(struct gfs2_inode *ip);
-extern int gfs2_ri_update(struct gfs2_inode *ip);
extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
@@ -66,11 +56,14 @@ struct gfs2_rgrp_list {
struct gfs2_holder *rl_ghs;
};
-extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
u64 block);
extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
+extern void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+ struct buffer_head *bh,
+ const struct gfs2_bitmap *bi);
#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b7beadd9ba4..71e420989f7 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -752,51 +752,77 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
struct backing_dev_info *bdi = metamapping->backing_dev_info;
- struct gfs2_holder gh;
+ int ret = 0;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+ if (bdi->dirty_exceeded)
+ gfs2_ail1_flush(sdp, wbc);
+ else
+ filemap_fdatawrite(metamapping);
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ ret = filemap_fdatawait(metamapping);
+ if (ret)
+ mark_inode_dirty_sync(inode);
+ return ret;
+}
+
+/**
+ * gfs2_dirty_inode - check for atime updates
+ * @inode: The inode in question
+ * @flags: The type of dirty
+ *
+ * Unfortunately it can be called under any combination of inode
+ * glock and transaction lock, so we have to check carefully.
+ *
+ * At the moment this deals only with atime - it should be possible
+ * to expand that role in future, once a review of the locking has
+ * been carried out.
+ */
+
+static void gfs2_dirty_inode(struct inode *inode, int flags)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
struct buffer_head *bh;
- struct timespec atime;
- struct gfs2_dinode *di;
- int ret = -EAGAIN;
- int unlock_required = 0;
-
- /* Skip timestamp update, if this is from a memalloc */
- if (current->flags & PF_MEMALLOC)
- goto do_flush;
+ struct gfs2_holder gh;
+ int need_unlock = 0;
+ int need_endtrans = 0;
+ int ret;
+
+ if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC)))
+ return;
+
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
- if (ret)
- goto do_flush;
- unlock_required = 1;
+ if (ret) {
+ fs_err(sdp, "dirty_inode: glock %d\n", ret);
+ return;
+ }
+ need_unlock = 1;
}
- ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
- if (ret)
- goto do_unlock;
+
+ if (current->journal_info == NULL) {
+ ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
+ if (ret) {
+ fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret);
+ goto out;
+ }
+ need_endtrans = 1;
+ }
+
ret = gfs2_meta_inode_buffer(ip, &bh);
if (ret == 0) {
- di = (struct gfs2_dinode *)bh->b_data;
- atime.tv_sec = be64_to_cpu(di->di_atime);
- atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
- if (timespec_compare(&inode->i_atime, &atime) > 0) {
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
- gfs2_dinode_out(ip, bh->b_data);
- }
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_dinode_out(ip, bh->b_data);
brelse(bh);
}
- gfs2_trans_end(sdp);
-do_unlock:
- if (unlock_required)
+
+ if (need_endtrans)
+ gfs2_trans_end(sdp);
+out:
+ if (need_unlock)
gfs2_glock_dq_uninit(&gh);
-do_flush:
- if (wbc->sync_mode == WB_SYNC_ALL)
- gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
- filemap_fdatawrite(metamapping);
- if (bdi->dirty_exceeded)
- gfs2_ail1_flush(sdp, wbc);
- if (!ret && (wbc->sync_mode == WB_SYNC_ALL))
- ret = filemap_fdatawait(metamapping);
- if (ret)
- mark_inode_dirty_sync(inode);
- return ret;
}
/**
@@ -1011,7 +1037,6 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
{
- struct gfs2_holder ri_gh;
struct gfs2_rgrpd *rgd_next;
struct gfs2_holder *gha, *gh;
unsigned int slots = 64;
@@ -1024,10 +1049,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host
if (!gha)
return -ENOMEM;
- error = gfs2_rindex_hold(sdp, &ri_gh);
- if (error)
- goto out;
-
rgd_next = gfs2_rgrpd_get_first(sdp);
for (;;) {
@@ -1070,9 +1091,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host
yield();
}
- gfs2_glock_dq_uninit(&ri_gh);
-
-out:
kfree(gha);
return error;
}
@@ -1124,6 +1142,10 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
struct gfs2_statfs_change_host sc;
int error;
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
+
if (gfs2_tune_get(sdp, gt_statfs_slow))
error = gfs2_statfs_slow(sdp, &sc);
else
@@ -1394,21 +1416,17 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
if (error)
goto out;
- error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
- if (error)
- goto out_qs;
-
rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
if (!rgd) {
gfs2_consist_inode(ip);
error = -EIO;
- goto out_rindex_relse;
+ goto out_qs;
}
error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
&al->al_rgd_gh);
if (error)
- goto out_rindex_relse;
+ goto out_qs;
error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA,
sdp->sd_jdesc->jd_blocks);
@@ -1423,8 +1441,6 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
out_rg_gunlock:
gfs2_glock_dq_uninit(&al->al_rgd_gh);
-out_rindex_relse:
- gfs2_glock_dq_uninit(&al->al_ri_gh);
out_qs:
gfs2_quota_unhold(ip);
out:
@@ -1471,9 +1487,11 @@ static void gfs2_evict_inode(struct inode *inode)
goto out;
}
- error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
- if (error)
- goto out_truncate;
+ if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
+ error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
+ if (error)
+ goto out_truncate;
+ }
if (test_bit(GIF_INVALID, &ip->i_flags)) {
error = gfs2_inode_refresh(ip);
@@ -1513,6 +1531,10 @@ static void gfs2_evict_inode(struct inode *inode)
goto out_unlock;
out_truncate:
+ gfs2_log_flush(sdp, ip->i_gl);
+ write_inode_now(inode, 1);
+ gfs2_ail_flush(ip->i_gl, 0);
+
/* Case 2 starts here */
error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
if (error)
@@ -1552,6 +1574,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
if (ip) {
ip->i_flags = 0;
ip->i_gl = NULL;
+ ip->i_rgd = NULL;
}
return &ip->i_inode;
}
@@ -1572,6 +1595,7 @@ const struct super_operations gfs2_super_ops = {
.alloc_inode = gfs2_alloc_inode,
.destroy_inode = gfs2_destroy_inode,
.write_inode = gfs2_write_inode,
+ .dirty_inode = gfs2_dirty_inode,
.evict_inode = gfs2_evict_inode,
.put_super = gfs2_put_super,
.sync_fs = gfs2_sync_fs,
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 9ec73a85411..86ac75d99d3 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -185,8 +185,3 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
gfs2_log_unlock(sdp);
}
-void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
-{
- lops_add(rgd->rd_sbd, &rgd->rd_le);
-}
-
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index fb56b783e02..f8f101ef600 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -28,20 +28,20 @@ struct gfs2_glock;
/* reserve either the number of blocks to be allocated plus the rg header
* block, or all of the blocks in the rg, whichever is smaller */
-static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip)
{
- return (al->al_requested < al->al_rgd->rd_length)?
- al->al_requested + 1 : al->al_rgd->rd_length;
+ const struct gfs2_alloc *al = ip->i_alloc;
+ if (al->al_requested < ip->i_rgd->rd_length)
+ return al->al_requested + 1;
+ return ip->i_rgd->rd_length;
}
-int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
- unsigned int revokes);
+extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+ unsigned int revokes);
-void gfs2_trans_end(struct gfs2_sbd *sdp);
-
-void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
-void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
-void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
+extern void gfs2_trans_end(struct gfs2_sbd *sdp);
+extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
+extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 439b61c0326..71d7bf830c0 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -332,15 +332,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
if (error)
goto out_alloc;
- error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
- if (error)
- goto out_quota;
-
error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
- gfs2_glock_dq_uninit(&al->al_ri_gh);
-
-out_quota:
gfs2_quota_unhold(ip);
out_alloc:
gfs2_alloc_put(ip);
@@ -734,7 +727,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
goto out_gunlock_q;
error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
- blks + gfs2_rg_blocks(al) +
+ blks + gfs2_rg_blocks(ip) +
RES_DINODE + RES_STATFS + RES_QUOTA, 0);
if (error)
goto out_ipres;
@@ -1296,7 +1289,8 @@ fail:
int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct inode *inode = &ip->i_inode;
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_ea_location el;
int error;
@@ -1319,7 +1313,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
if (error)
return error;
- error = gfs2_setattr_simple(ip, attr);
+ error = gfs2_setattr_simple(inode, attr);
gfs2_trans_end(sdp);
return error;
}
@@ -1362,14 +1356,14 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
blen++;
else {
if (bstart)
- gfs2_rlist_add(sdp, &rlist, bstart);
+ gfs2_rlist_add(ip, &rlist, bstart);
bstart = bn;
blen = 1;
}
blks++;
}
if (bstart)
- gfs2_rlist_add(sdp, &rlist, bstart);
+ gfs2_rlist_add(ip, &rlist, bstart);
else
goto out;
@@ -1501,24 +1495,18 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
if (error)
goto out_alloc;
- error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
- if (error)
- goto out_quota;
-
error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
if (error)
- goto out_rindex;
+ goto out_quota;
if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
error = ea_dealloc_indirect(ip);
if (error)
- goto out_rindex;
+ goto out_quota;
}
error = ea_dealloc_block(ip);
-out_rindex:
- gfs2_glock_dq_uninit(&al->al_ri_gh);
out_quota:
gfs2_quota_unhold(ip);
out_alloc:
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 3ebc437736f..1cbdeea1db4 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -46,11 +46,26 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
case HFS_EXT_CNID:
hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize,
mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz));
+ if (HFS_I(tree->inode)->alloc_blocks >
+ HFS_I(tree->inode)->first_blocks) {
+ printk(KERN_ERR "hfs: invalid btree extent records\n");
+ unlock_new_inode(tree->inode);
+ goto free_inode;
+ }
+
tree->inode->i_mapping->a_ops = &hfs_btree_aops;
break;
case HFS_CAT_CNID:
hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize,
mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz));
+
+ if (!HFS_I(tree->inode)->first_blocks) {
+ printk(KERN_ERR "hfs: invalid btree extent records "
+ "(0 size).\n");
+ unlock_new_inode(tree->inode);
+ goto free_inode;
+ }
+
tree->inode->i_mapping->a_ops = &hfs_btree_aops;
break;
default:
@@ -59,11 +74,6 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
}
unlock_new_inode(tree->inode);
- if (!HFS_I(tree->inode)->first_blocks) {
- printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n");
- goto free_inode;
- }
-
mapping = tree->inode->i_mapping;
page = read_mapping_page(mapping, 0, NULL);
if (IS_ERR(page))
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b4d70b13be9..bce4eef91a0 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -198,7 +198,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, int mode,
res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
if (res) {
- inode->i_nlink = 0;
+ clear_nlink(inode);
hfs_delete_inode(inode);
iput(inode);
return res;
@@ -227,7 +227,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
if (res) {
- inode->i_nlink = 0;
+ clear_nlink(inode);
hfs_delete_inode(inode);
iput(inode);
return res;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 96a1b625fc7..a1a9fdcd2a0 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -183,7 +183,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
HFS_I(inode)->flags = 0;
HFS_I(inode)->rsrc_inode = NULL;
@@ -313,7 +313,7 @@ static int hfs_read_inode(struct inode *inode, void *data)
/* Initialize the inode */
inode->i_uid = hsb->s_uid;
inode->i_gid = hsb->s_gid;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
if (idata->key)
HFS_I(inode)->cat_key = *idata->key;
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c
index e673a88b8ae..b1ce4c7ad3f 100644
--- a/fs/hfs/trans.c
+++ b/fs/hfs/trans.c
@@ -40,6 +40,8 @@ int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in)
src = in->name;
srclen = in->len;
+ if (srclen > HFS_NAMELEN)
+ srclen = HFS_NAMELEN;
dst = out;
dstlen = HFS_MAX_NAMELEN;
if (nls_io) {
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 25b2443a004..4536cd3f15a 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -415,7 +415,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
goto out;
out_err:
- inode->i_nlink = 0;
+ clear_nlink(inode);
hfsplus_delete_inode(inode);
iput(inode);
out:
@@ -440,7 +440,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
if (res) {
- inode->i_nlink = 0;
+ clear_nlink(inode);
hfsplus_delete_inode(inode);
iput(inode);
goto out;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 4cc1e3a36ec..40e1413be4c 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -391,7 +391,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
hip = HFSPLUS_I(inode);
@@ -512,7 +512,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
sizeof(struct hfsplus_cat_folder));
hfsplus_get_perms(inode, &folder->permissions, 1);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_size = 2 + be32_to_cpu(folder->valence);
inode->i_atime = hfsp_mt2ut(folder->access_date);
inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
@@ -532,11 +532,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ?
&file->rsrc_fork : &file->data_fork);
hfsplus_get_perms(inode, &file->permissions, 0);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
if (S_ISREG(inode->i_mode)) {
if (file->permissions.dev)
- inode->i_nlink =
- be32_to_cpu(file->permissions.dev);
+ set_nlink(inode,
+ be32_to_cpu(file->permissions.dev));
inode->i_op = &hfsplus_file_inode_operations;
inode->i_fop = &hfsplus_file_operations;
inode->i_mapping->a_ops = &hfsplus_aops;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 0d22afdd461..2f72da5ae68 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -541,7 +541,7 @@ static int read_name(struct inode *ino, char *name)
ino->i_ino = st.ino;
ino->i_mode = st.mode;
- ino->i_nlink = st.nlink;
+ set_nlink(ino, st.nlink);
ino->i_uid = st.uid;
ino->i_gid = st.gid;
ino->i_atime = st.atime;
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index d51a98384bc..dd7bc38a382 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -16,7 +16,6 @@
#include <sys/vfs.h>
#include "hostfs.h"
#include "os.h"
-#include "user.h"
#include <utime.h>
static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 96a8ed91ced..2fa0089a02a 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -247,7 +247,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
result->i_mode &= ~0111;
result->i_op = &hpfs_file_iops;
result->i_fop = &hpfs_file_ops;
- result->i_nlink = 1;
+ set_nlink(result, 1);
}
unlock_new_inode(result);
}
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 331b5e234ef..de946170ebb 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -311,8 +311,8 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb)
/* super.c */
-void hpfs_error(struct super_block *, const char *, ...)
- __attribute__((format (printf, 2, 3)));
+__printf(2, 3)
+void hpfs_error(struct super_block *, const char *, ...);
int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *);
unsigned hpfs_count_one_bitmap(struct super_block *, secno);
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 338cd836845..3b2cec29972 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -53,7 +53,7 @@ void hpfs_read_inode(struct inode *i)
i->i_mode &= ~0111;
i->i_op = &hpfs_file_iops;
i->i_fop = &hpfs_file_ops;
- i->i_nlink = 0;*/
+ clear_nlink(i);*/
make_bad_inode(i);
return;
}
@@ -77,7 +77,7 @@ void hpfs_read_inode(struct inode *i)
i->i_mode = S_IFLNK | 0777;
i->i_op = &page_symlink_inode_operations;
i->i_data.a_ops = &hpfs_symlink_aops;
- i->i_nlink = 1;
+ set_nlink(i, 1);
i->i_size = ea_size;
i->i_blocks = 1;
brelse(bh);
@@ -101,7 +101,7 @@ void hpfs_read_inode(struct inode *i)
}
if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) {
brelse(bh);
- i->i_nlink = 1;
+ set_nlink(i, 1);
i->i_size = 0;
i->i_blocks = 1;
init_special_inode(i, mode,
@@ -125,13 +125,13 @@ void hpfs_read_inode(struct inode *i)
hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL);
i->i_blocks = 4 * n_dnodes;
i->i_size = 2048 * n_dnodes;
- i->i_nlink = 2 + n_subdirs;
+ set_nlink(i, 2 + n_subdirs);
} else {
i->i_mode |= S_IFREG;
if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111;
i->i_op = &hpfs_file_iops;
i->i_fop = &hpfs_file_ops;
- i->i_nlink = 1;
+ set_nlink(i, 1);
i->i_size = le32_to_cpu(fnode->file_size);
i->i_blocks = ((i->i_size + 511) >> 9) + 1;
i->i_data.a_ops = &hpfs_aops;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 2df69e2f07c..ea91fcb0ef9 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -56,7 +56,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
result->i_fop = &hpfs_dir_ops;
result->i_blocks = 4;
result->i_size = 2048;
- result->i_nlink = 2;
+ set_nlink(result, 2);
if (dee.read_only)
result->i_mode &= ~0222;
@@ -150,7 +150,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
result->i_mode &= ~0111;
result->i_op = &hpfs_file_iops;
result->i_fop = &hpfs_file_ops;
- result->i_nlink = 1;
+ set_nlink(result, 1);
hpfs_i(result)->i_parent_dir = dir->i_ino;
result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
result->i_ctime.tv_nsec = 0;
@@ -242,7 +242,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
hpfs_i(result)->i_ea_size = 0;
result->i_uid = current_fsuid();
result->i_gid = current_fsgid();
- result->i_nlink = 1;
+ set_nlink(result, 1);
result->i_size = 0;
result->i_blocks = 1;
init_special_inode(result, mode, rdev);
@@ -318,7 +318,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
result->i_uid = current_fsuid();
result->i_gid = current_fsgid();
result->i_blocks = 1;
- result->i_nlink = 1;
+ set_nlink(result, 1);
result->i_size = strlen(symlink);
result->i_op = &page_symlink_inode_operations;
result->i_data.a_ops = &hpfs_symlink_aops;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 970ea987b3f..f590b1160c6 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -702,7 +702,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
inode->i_ctime = proc_ino->i_ctime;
inode->i_ino = proc_ino->i_ino;
inode->i_mode = proc_ino->i_mode;
- inode->i_nlink = proc_ino->i_nlink;
+ set_nlink(inode, proc_ino->i_nlink);
inode->i_size = proc_ino->i_size;
inode->i_blocks = proc_ino->i_blocks;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ec889538e5a..0be5a78598d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -970,7 +970,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
d_instantiate(path.dentry, inode);
inode->i_size = size;
- inode->i_nlink = 0;
+ clear_nlink(inode);
error = -ENFILE;
file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
diff --git a/fs/inode.c b/fs/inode.c
index ec7924696a1..ee4e66b998f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -142,7 +142,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
inode->i_fop = &empty_fops;
- inode->i_nlink = 1;
+ inode->__i_nlink = 1;
inode->i_opflags = 0;
inode->i_uid = 0;
inode->i_gid = 0;
@@ -634,7 +634,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
* inode to the back of the list so we don't spin on it.
*/
if (!spin_trylock(&inode->i_lock)) {
- list_move(&inode->i_lru, &sb->s_inode_lru);
+ list_move_tail(&inode->i_lru, &sb->s_inode_lru);
continue;
}
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 7da2a06508e..f79dab83e17 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -21,6 +21,7 @@
*/
#include <linux/gfp.h>
#include <linux/kernel.h>
+#include <linux/export.h>
#include <linux/ioprio.h>
#include <linux/blkdev.h>
#include <linux/capability.h>
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index a5d03672d04..f950059525f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -20,6 +20,7 @@
#include <linux/statfs.h>
#include <linux/cdrom.h>
#include <linux/parser.h>
+#include <linux/mpage.h>
#include "isofs.h"
#include "zisofs.h"
@@ -1148,7 +1149,13 @@ struct buffer_head *isofs_bread(struct inode *inode, sector_t block)
static int isofs_readpage(struct file *file, struct page *page)
{
- return block_read_full_page(page,isofs_get_block);
+ return mpage_readpage(page, isofs_get_block);
+}
+
+static int isofs_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, isofs_get_block);
}
static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
@@ -1158,6 +1165,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
static const struct address_space_operations isofs_aops = {
.readpage = isofs_readpage,
+ .readpages = isofs_readpages,
.bmap = _isofs_bmap
};
@@ -1319,7 +1327,7 @@ static int isofs_read_inode(struct inode *inode)
inode->i_mode = S_IFDIR | sbi->s_dmode;
else
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
- inode->i_nlink = 1; /*
+ set_nlink(inode, 1); /*
* Set to 1. We know there are 2, but
* the find utility tries to optimize
* if it is 2, and it screws up. It is
@@ -1337,7 +1345,7 @@ static int isofs_read_inode(struct inode *inode)
*/
inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO;
}
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
inode->i_uid = sbi->s_uid;
inode->i_gid = sbi->s_gid;
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 1fbc7de88f5..70e79d0c756 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -363,7 +363,7 @@ repeat:
break;
case SIG('P', 'X'):
inode->i_mode = isonum_733(rr->u.PX.mode);
- inode->i_nlink = isonum_733(rr->u.PX.n_links);
+ set_nlink(inode, isonum_733(rr->u.PX.n_links));
inode->i_uid = isonum_733(rr->u.PX.uid);
inode->i_gid = isonum_733(rr->u.PX.gid);
break;
@@ -496,7 +496,7 @@ repeat:
goto out;
}
inode->i_mode = reloc->i_mode;
- inode->i_nlink = reloc->i_nlink;
+ set_nlink(inode, reloc->i_nlink);
inode->i_uid = reloc->i_uid;
inode->i_gid = reloc->i_gid;
inode->i_rdev = reloc->i_rdev;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 9fe061fb877..fea8dd661d2 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1135,6 +1135,14 @@ static int journal_get_superblock(journal_t *journal)
goto out;
}
+ if (be32_to_cpu(sb->s_first) == 0 ||
+ be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
+ printk(KERN_WARNING
+ "JBD: Invalid start block of journal: %u\n",
+ be32_to_cpu(sb->s_first));
+ goto out;
+ }
+
return 0;
out:
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index eef6979821a..68d704db787 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -352,7 +352,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(commit_transaction->t_state == T_RUNNING);
trace_jbd2_start_commit(journal, commit_transaction);
- jbd_debug(1, "JBD: starting commit of transaction %d\n",
+ jbd_debug(1, "JBD2: starting commit of transaction %d\n",
commit_transaction->t_tid);
write_lock(&journal->j_state_lock);
@@ -427,7 +427,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
__jbd2_journal_clean_checkpoint_list(journal);
spin_unlock(&journal->j_list_lock);
- jbd_debug (3, "JBD: commit phase 1\n");
+ jbd_debug(3, "JBD2: commit phase 1\n");
/*
* Switch to a new revoke table.
@@ -447,7 +447,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
wake_up(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
- jbd_debug (3, "JBD: commit phase 2\n");
+ jbd_debug(3, "JBD2: commit phase 2\n");
/*
* Now start flushing things to disk, in the order they appear
@@ -462,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
WRITE_SYNC);
blk_finish_plug(&plug);
- jbd_debug(3, "JBD: commit phase 2\n");
+ jbd_debug(3, "JBD2: commit phase 2\n");
/*
* Way to go: we have now written out all of the data for a
@@ -522,7 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT (bufs == 0);
- jbd_debug(4, "JBD: get descriptor\n");
+ jbd_debug(4, "JBD2: get descriptor\n");
descriptor = jbd2_journal_get_descriptor_buffer(journal);
if (!descriptor) {
@@ -531,7 +531,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
}
bh = jh2bh(descriptor);
- jbd_debug(4, "JBD: got buffer %llu (%p)\n",
+ jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
(unsigned long long)bh->b_blocknr, bh->b_data);
header = (journal_header_t *)&bh->b_data[0];
header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
@@ -625,7 +625,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
commit_transaction->t_buffers == NULL ||
space_left < tag_bytes + 16) {
- jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
+ jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
/* Write an end-of-descriptor marker before
submitting the IOs. "tag" still points to
@@ -707,7 +707,7 @@ start_journal_io:
so we incur less scheduling load.
*/
- jbd_debug(3, "JBD: commit phase 3\n");
+ jbd_debug(3, "JBD2: commit phase 3\n");
/*
* akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -771,7 +771,7 @@ wait_for_iobuf:
J_ASSERT (commit_transaction->t_shadow_list == NULL);
- jbd_debug(3, "JBD: commit phase 4\n");
+ jbd_debug(3, "JBD2: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
@@ -801,7 +801,7 @@ wait_for_iobuf:
if (err)
jbd2_journal_abort(journal, err);
- jbd_debug(3, "JBD: commit phase 5\n");
+ jbd_debug(3, "JBD2: commit phase 5\n");
write_lock(&journal->j_state_lock);
J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
commit_transaction->t_state = T_COMMIT_JFLUSH;
@@ -830,7 +830,7 @@ wait_for_iobuf:
transaction can be removed from any checkpoint list it was on
before. */
- jbd_debug(3, "JBD: commit phase 6\n");
+ jbd_debug(3, "JBD2: commit phase 6\n");
J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
@@ -964,7 +964,7 @@ restart_loop:
/* Done with this transaction! */
- jbd_debug(3, "JBD: commit phase 7\n");
+ jbd_debug(3, "JBD2: commit phase 7\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
@@ -1039,7 +1039,7 @@ restart_loop:
journal->j_commit_callback(journal, commit_transaction);
trace_jbd2_end_commit(journal, commit_transaction);
- jbd_debug(1, "JBD: commit %d complete, head %d\n",
+ jbd_debug(1, "JBD2: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
if (to_free)
kfree(commit_transaction);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f24df13adc4..0fa0123151d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -491,7 +491,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
*/
journal->j_commit_request = target;
- jbd_debug(1, "JBD: requesting commit %d/%d\n",
+ jbd_debug(1, "JBD2: requesting commit %d/%d\n",
journal->j_commit_request,
journal->j_commit_sequence);
wake_up(&journal->j_wait_commit);
@@ -500,7 +500,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
/* This should never happen, but if it does, preserve
the evidence before kjournald goes into a loop and
increments j_commit_sequence beyond all recognition. */
- WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+ WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
journal->j_commit_request,
journal->j_commit_sequence,
target, journal->j_running_transaction ?
@@ -645,7 +645,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
}
#endif
while (tid_gt(tid, journal->j_commit_sequence)) {
- jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
+ jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
tid, journal->j_commit_sequence);
wake_up(&journal->j_wait_commit);
read_unlock(&journal->j_state_lock);
@@ -1093,7 +1093,7 @@ static int journal_reset(journal_t *journal)
first = be32_to_cpu(sb->s_first);
last = be32_to_cpu(sb->s_maxlen);
if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
- printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
+ printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
first, last);
journal_fail_superblock(journal);
return -EINVAL;
@@ -1139,7 +1139,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
*/
if (sb->s_start == 0 && journal->j_tail_sequence ==
journal->j_transaction_sequence) {
- jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
+ jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
"(start %ld, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence,
journal->j_errno);
@@ -1163,7 +1163,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
}
read_lock(&journal->j_state_lock);
- jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+ jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence, journal->j_errno);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1216,8 +1216,8 @@ static int journal_get_superblock(journal_t *journal)
ll_rw_block(READ, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- printk (KERN_ERR
- "JBD: IO error reading journal superblock\n");
+ printk(KERN_ERR
+ "JBD2: IO error reading journal superblock\n");
goto out;
}
}
@@ -1228,7 +1228,7 @@ static int journal_get_superblock(journal_t *journal)
if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
- printk(KERN_WARNING "JBD: no valid journal superblock found\n");
+ printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
goto out;
}
@@ -1240,14 +1240,22 @@ static int journal_get_superblock(journal_t *journal)
journal->j_format_version = 2;
break;
default:
- printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
+ printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
goto out;
}
if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
- printk (KERN_WARNING "JBD: journal file too short\n");
+ printk(KERN_WARNING "JBD2: journal file too short\n");
+ goto out;
+ }
+
+ if (be32_to_cpu(sb->s_first) == 0 ||
+ be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
+ printk(KERN_WARNING
+ "JBD2: Invalid start block of journal: %u\n",
+ be32_to_cpu(sb->s_first));
goto out;
}
@@ -1310,8 +1318,8 @@ int jbd2_journal_load(journal_t *journal)
~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
(sb->s_feature_incompat &
~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
- printk (KERN_WARNING
- "JBD: Unrecognised features on journal\n");
+ printk(KERN_WARNING
+ "JBD2: Unrecognised features on journal\n");
return -EINVAL;
}
}
@@ -1346,7 +1354,7 @@ int jbd2_journal_load(journal_t *journal)
return 0;
recovery_error:
- printk (KERN_WARNING "JBD: recovery failed\n");
+ printk(KERN_WARNING "JBD2: recovery failed\n");
return -EIO;
}
@@ -1577,7 +1585,7 @@ static int journal_convert_superblock_v1(journal_t *journal,
struct buffer_head *bh;
printk(KERN_WARNING
- "JBD: Converting superblock from version 1 to 2.\n");
+ "JBD2: Converting superblock from version 1 to 2.\n");
/* Pre-initialise new fields to zero */
offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
@@ -1694,7 +1702,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
if (!journal->j_tail)
goto no_recovery;
- printk (KERN_WARNING "JBD: %s recovery information on journal\n",
+ printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
write ? "Clearing" : "Ignoring");
err = jbd2_journal_skip_recovery(journal);
@@ -2020,7 +2028,7 @@ static int journal_init_jbd2_journal_head_cache(void)
retval = 0;
if (!jbd2_journal_head_cache) {
retval = -ENOMEM;
- printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
+ printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
}
return retval;
}
@@ -2383,7 +2391,7 @@ static void __exit journal_exit(void)
#ifdef CONFIG_JBD2_DEBUG
int n = atomic_read(&nr_journal_heads);
if (n)
- printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
+ printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n);
#endif
jbd2_remove_debugfs_entry();
jbd2_remove_jbd_stats_proc_entry();
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 1cad869494f..da6d7baf139 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -89,7 +89,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
err = jbd2_journal_bmap(journal, next, &blocknr);
if (err) {
- printk (KERN_ERR "JBD: bad block at offset %u\n",
+ printk(KERN_ERR "JBD2: bad block at offset %u\n",
next);
goto failed;
}
@@ -138,14 +138,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
*bhp = NULL;
if (offset >= journal->j_maxlen) {
- printk(KERN_ERR "JBD: corrupted journal superblock\n");
+ printk(KERN_ERR "JBD2: corrupted journal superblock\n");
return -EIO;
}
err = jbd2_journal_bmap(journal, offset, &blocknr);
if (err) {
- printk (KERN_ERR "JBD: bad block at offset %u\n",
+ printk(KERN_ERR "JBD2: bad block at offset %u\n",
offset);
return err;
}
@@ -163,7 +163,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
}
if (!buffer_uptodate(bh)) {
- printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
+ printk(KERN_ERR "JBD2: Failed to read block at offset %u\n",
offset);
brelse(bh);
return -EIO;
@@ -251,10 +251,10 @@ int jbd2_journal_recover(journal_t *journal)
if (!err)
err = do_one_pass(journal, &info, PASS_REPLAY);
- jbd_debug(1, "JBD: recovery, exit status %d, "
+ jbd_debug(1, "JBD2: recovery, exit status %d, "
"recovered transactions %u to %u\n",
err, info.start_transaction, info.end_transaction);
- jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n",
+ jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
/* Restart the log at the next transaction ID, thus invalidating
@@ -293,14 +293,14 @@ int jbd2_journal_skip_recovery(journal_t *journal)
err = do_one_pass(journal, &info, PASS_SCAN);
if (err) {
- printk(KERN_ERR "JBD: error %d scanning journal\n", err);
+ printk(KERN_ERR "JBD2: error %d scanning journal\n", err);
++journal->j_transaction_sequence;
} else {
#ifdef CONFIG_JBD2_DEBUG
int dropped = info.end_transaction -
be32_to_cpu(journal->j_superblock->s_sequence);
jbd_debug(1,
- "JBD: ignoring %d transaction%s from the journal.\n",
+ "JBD2: ignoring %d transaction%s from the journal.\n",
dropped, (dropped == 1) ? "" : "s");
#endif
journal->j_transaction_sequence = ++info.end_transaction;
@@ -338,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
wrap(journal, *next_log_block);
err = jread(&obh, journal, io_block);
if (err) {
- printk(KERN_ERR "JBD: IO error %d recovering block "
+ printk(KERN_ERR "JBD2: IO error %d recovering block "
"%lu in log\n", err, io_block);
return 1;
} else {
@@ -411,7 +411,7 @@ static int do_one_pass(journal_t *journal,
* either the next descriptor block or the final commit
* record. */
- jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+ jbd_debug(3, "JBD2: checking block %ld\n", next_log_block);
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
@@ -491,8 +491,8 @@ static int do_one_pass(journal_t *journal,
/* Recover what we can, but
* report failure at the end. */
success = err;
- printk (KERN_ERR
- "JBD: IO error %d recovering "
+ printk(KERN_ERR
+ "JBD2: IO error %d recovering "
"block %ld in log\n",
err, io_block);
} else {
@@ -520,7 +520,7 @@ static int do_one_pass(journal_t *journal,
journal->j_blocksize);
if (nbh == NULL) {
printk(KERN_ERR
- "JBD: Out of memory "
+ "JBD2: Out of memory "
"during recovery.\n");
err = -ENOMEM;
brelse(bh);
@@ -689,7 +689,7 @@ static int do_one_pass(journal_t *journal,
/* It's really bad news if different passes end up at
* different places (but possible due to IO errors). */
if (info->end_transaction != next_commit_ID) {
- printk (KERN_ERR "JBD: recovery pass %d ended at "
+ printk(KERN_ERR "JBD2: recovery pass %d ended at "
"transaction %u, expected %u\n",
pass, next_commit_ID, info->end_transaction);
if (!success)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 2d7109414cd..a0e41a4c080 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -27,6 +27,7 @@
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/backing-dev.h>
+#include <linux/bug.h>
#include <linux/module.h>
static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
@@ -115,7 +116,7 @@ static inline void update_t_max_wait(transaction_t *transaction,
*/
static int start_this_handle(journal_t *journal, handle_t *handle,
- int gfp_mask)
+ gfp_t gfp_mask)
{
transaction_t *transaction, *new_transaction = NULL;
tid_t tid;
@@ -124,7 +125,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
unsigned long ts = jiffies;
if (nblocks > journal->j_max_transaction_buffers) {
- printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
+ printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
current->comm, nblocks,
journal->j_max_transaction_buffers);
return -ENOSPC;
@@ -320,7 +321,7 @@ static handle_t *new_handle(int nblocks)
* Return a pointer to a newly allocated handle, or an ERR_PTR() value
* on failure.
*/
-handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask)
{
handle_t *handle = journal_current_handle();
int err;
@@ -443,7 +444,7 @@ out:
* transaction capabable of guaranteeing the requested number of
* credits.
*/
-int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
+int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
@@ -563,7 +564,7 @@ static void warn_dirty_buffer(struct buffer_head *bh)
char b[BDEVNAME_SIZE];
printk(KERN_WARNING
- "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+ "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
"There's a risk of filesystem corruption in case of system "
"crash.\n",
bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
@@ -1049,6 +1050,10 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
* mark dirty metadata which needs to be journaled as part of the current
* transaction.
*
+ * The buffer must have previously had jbd2_journal_get_write_access()
+ * called so that it has a valid journal_head attached to the buffer
+ * head.
+ *
* The buffer is placed on the transaction's metadata list and is marked
* as belonging to the transaction.
*
@@ -1065,11 +1070,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
struct journal_head *jh = bh2jh(bh);
+ int ret = 0;
jbd_debug(5, "journal_head %p\n", jh);
JBUFFER_TRACE(jh, "entry");
if (is_handle_aborted(handle))
goto out;
+ if (!buffer_jbd(bh)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
jbd_lock_bh_state(bh);
@@ -1093,8 +1103,20 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
*/
if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
JBUFFER_TRACE(jh, "fastpath");
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_running_transaction);
+ if (unlikely(jh->b_transaction !=
+ journal->j_running_transaction)) {
+ printk(KERN_EMERG "JBD: %s: "
+ "jh->b_transaction (%llu, %p, %u) != "
+ "journal->j_running_transaction (%p, %u)",
+ journal->j_devname,
+ (unsigned long long) bh->b_blocknr,
+ jh->b_transaction,
+ jh->b_transaction ? jh->b_transaction->t_tid : 0,
+ journal->j_running_transaction,
+ journal->j_running_transaction ?
+ journal->j_running_transaction->t_tid : 0);
+ ret = -EINVAL;
+ }
goto out_unlock_bh;
}
@@ -1108,9 +1130,32 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
*/
if (jh->b_transaction != transaction) {
JBUFFER_TRACE(jh, "already on other transaction");
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
- J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
+ if (unlikely(jh->b_transaction !=
+ journal->j_committing_transaction)) {
+ printk(KERN_EMERG "JBD: %s: "
+ "jh->b_transaction (%llu, %p, %u) != "
+ "journal->j_committing_transaction (%p, %u)",
+ journal->j_devname,
+ (unsigned long long) bh->b_blocknr,
+ jh->b_transaction,
+ jh->b_transaction ? jh->b_transaction->t_tid : 0,
+ journal->j_committing_transaction,
+ journal->j_committing_transaction ?
+ journal->j_committing_transaction->t_tid : 0);
+ ret = -EINVAL;
+ }
+ if (unlikely(jh->b_next_transaction != transaction)) {
+ printk(KERN_EMERG "JBD: %s: "
+ "jh->b_next_transaction (%llu, %p, %u) != "
+ "transaction (%p, %u)",
+ journal->j_devname,
+ (unsigned long long) bh->b_blocknr,
+ jh->b_next_transaction,
+ jh->b_next_transaction ?
+ jh->b_next_transaction->t_tid : 0,
+ transaction, transaction->t_tid);
+ ret = -EINVAL;
+ }
/* And this case is illegal: we can't reuse another
* transaction's data buffer, ever. */
goto out_unlock_bh;
@@ -1127,7 +1172,8 @@ out_unlock_bh:
jbd_unlock_bh_state(bh);
out:
JBUFFER_TRACE(jh, "exit");
- return 0;
+ WARN_ON(ret); /* All errors are bugs, so dump the stack */
+ return ret;
}
/*
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index de4247021d2..5b6c9d1a2fb 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -53,6 +53,78 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this,
return 0;
}
+/*
+ * jffs2_selected_compress:
+ * @compr: Explicit compression type to use (ie, JFFS2_COMPR_ZLIB).
+ * If 0, just take the first available compression mode.
+ * @data_in: Pointer to uncompressed data
+ * @cpage_out: Pointer to returned pointer to buffer for compressed data
+ * @datalen: On entry, holds the amount of data available for compression.
+ * On exit, expected to hold the amount of data actually compressed.
+ * @cdatalen: On entry, holds the amount of space available for compressed
+ * data. On exit, expected to hold the actual size of the compressed
+ * data.
+ *
+ * Returns: the compression type used. Zero is used to show that the data
+ * could not be compressed; probably because we couldn't find the requested
+ * compression mode.
+ */
+static int jffs2_selected_compress(u8 compr, unsigned char *data_in,
+ unsigned char **cpage_out, u32 *datalen, u32 *cdatalen)
+{
+ struct jffs2_compressor *this;
+ int err, ret = JFFS2_COMPR_NONE;
+ uint32_t orig_slen, orig_dlen;
+ char *output_buf;
+
+ output_buf = kmalloc(*cdatalen, GFP_KERNEL);
+ if (!output_buf) {
+ printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n");
+ return ret;
+ }
+ orig_slen = *datalen;
+ orig_dlen = *cdatalen;
+ spin_lock(&jffs2_compressor_list_lock);
+ list_for_each_entry(this, &jffs2_compressor_list, list) {
+ /* Skip decompress-only and disabled modules */
+ if (!this->compress || this->disabled)
+ continue;
+
+ /* Skip if not the desired compression type */
+ if (compr && (compr != this->compr))
+ continue;
+
+ /*
+ * Either compression type was unspecified, or we found our
+ * compressor; either way, we're good to go.
+ */
+ this->usecount++;
+ spin_unlock(&jffs2_compressor_list_lock);
+
+ *datalen = orig_slen;
+ *cdatalen = orig_dlen;
+ err = this->compress(data_in, output_buf, datalen, cdatalen);
+
+ spin_lock(&jffs2_compressor_list_lock);
+ this->usecount--;
+ if (!err) {
+ /* Success */
+ ret = this->compr;
+ this->stat_compr_blocks++;
+ this->stat_compr_orig_size += *datalen;
+ this->stat_compr_new_size += *cdatalen;
+ break;
+ }
+ }
+ spin_unlock(&jffs2_compressor_list_lock);
+ if (ret == JFFS2_COMPR_NONE)
+ kfree(output_buf);
+ else
+ *cpage_out = output_buf;
+
+ return ret;
+}
+
/* jffs2_compress:
* @data_in: Pointer to uncompressed data
* @cpage_out: Pointer to returned pointer to buffer for compressed data
@@ -76,47 +148,23 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
uint32_t *datalen, uint32_t *cdatalen)
{
int ret = JFFS2_COMPR_NONE;
- int compr_ret;
+ int mode, compr_ret;
struct jffs2_compressor *this, *best=NULL;
unsigned char *output_buf = NULL, *tmp_buf;
uint32_t orig_slen, orig_dlen;
uint32_t best_slen=0, best_dlen=0;
- switch (jffs2_compression_mode) {
+ if (c->mount_opts.override_compr)
+ mode = c->mount_opts.compr;
+ else
+ mode = jffs2_compression_mode;
+
+ switch (mode) {
case JFFS2_COMPR_MODE_NONE:
break;
case JFFS2_COMPR_MODE_PRIORITY:
- output_buf = kmalloc(*cdatalen,GFP_KERNEL);
- if (!output_buf) {
- printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n");
- goto out;
- }
- orig_slen = *datalen;
- orig_dlen = *cdatalen;
- spin_lock(&jffs2_compressor_list_lock);
- list_for_each_entry(this, &jffs2_compressor_list, list) {
- /* Skip decompress-only backwards-compatibility and disabled modules */
- if ((!this->compress)||(this->disabled))
- continue;
-
- this->usecount++;
- spin_unlock(&jffs2_compressor_list_lock);
- *datalen = orig_slen;
- *cdatalen = orig_dlen;
- compr_ret = this->compress(data_in, output_buf, datalen, cdatalen);
- spin_lock(&jffs2_compressor_list_lock);
- this->usecount--;
- if (!compr_ret) {
- ret = this->compr;
- this->stat_compr_blocks++;
- this->stat_compr_orig_size += *datalen;
- this->stat_compr_new_size += *cdatalen;
- break;
- }
- }
- spin_unlock(&jffs2_compressor_list_lock);
- if (ret == JFFS2_COMPR_NONE)
- kfree(output_buf);
+ ret = jffs2_selected_compress(0, data_in, cpage_out, datalen,
+ cdatalen);
break;
case JFFS2_COMPR_MODE_SIZE:
case JFFS2_COMPR_MODE_FAVOURLZO:
@@ -174,22 +222,28 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
best->stat_compr_orig_size += best_slen;
best->stat_compr_new_size += best_dlen;
ret = best->compr;
+ *cpage_out = output_buf;
}
spin_unlock(&jffs2_compressor_list_lock);
break;
+ case JFFS2_COMPR_MODE_FORCELZO:
+ ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in,
+ cpage_out, datalen, cdatalen);
+ break;
+ case JFFS2_COMPR_MODE_FORCEZLIB:
+ ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in,
+ cpage_out, datalen, cdatalen);
+ break;
default:
printk(KERN_ERR "JFFS2: unknown compression mode.\n");
}
- out:
+
if (ret == JFFS2_COMPR_NONE) {
*cpage_out = data_in;
*datalen = *cdatalen;
none_stat_compr_blocks++;
none_stat_compr_size += *datalen;
}
- else {
- *cpage_out = output_buf;
- }
return ret;
}
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 13bb7597ab3..5e91d578f4e 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -40,6 +40,8 @@
#define JFFS2_COMPR_MODE_PRIORITY 1
#define JFFS2_COMPR_MODE_SIZE 2
#define JFFS2_COMPR_MODE_FAVOURLZO 3
+#define JFFS2_COMPR_MODE_FORCELZO 4
+#define JFFS2_COMPR_MODE_FORCEZLIB 5
#define FAVOUR_LZO_PERCENT 80
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 9659b7c0046..be6169bd8ac 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -245,7 +245,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name,
dentry->d_name.len, dead_f, now);
if (dead_f->inocache)
- dentry->d_inode->i_nlink = dead_f->inocache->pino_nlink;
+ set_nlink(dentry->d_inode, dead_f->inocache->pino_nlink);
if (!ret)
dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
return ret;
@@ -278,7 +278,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
if (!ret) {
mutex_lock(&f->sem);
- old_dentry->d_inode->i_nlink = ++f->inocache->pino_nlink;
+ set_nlink(old_dentry->d_inode, ++f->inocache->pino_nlink);
mutex_unlock(&f->sem);
d_instantiate(dentry, old_dentry->d_inode);
dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
@@ -497,7 +497,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
f = JFFS2_INODE_INFO(inode);
/* Directories get nlink 2 at start */
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
/* but ic->pino_nlink is the parent ino# */
f->inocache->pino_nlink = dir_i->i_ino;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index bbcb9755dd2..4b8afe39a87 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -278,7 +278,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime));
inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime));
- inode->i_nlink = f->inocache->pino_nlink;
+ set_nlink(inode, f->inocache->pino_nlink);
inode->i_blocks = (inode->i_size + 511) >> 9;
@@ -291,7 +291,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
case S_IFDIR:
{
struct jffs2_full_dirent *fd;
- inode->i_nlink = 2; /* parent and '.' */
+ set_nlink(inode, 2); /* parent and '.' */
for (fd=f->dents; fd; fd = fd->next) {
if (fd->type == DT_DIR && fd->ino)
@@ -379,7 +379,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
jffs2_do_setattr(inode, &iattr);
}
-int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
+int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data)
{
struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
@@ -453,7 +453,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
iput(inode);
return ERR_PTR(ret);
}
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_ino = je32_to_cpu(ri->ino);
inode->i_mode = jemode_to_cpu(ri->mode);
inode->i_gid = je16_to_cpu(ri->gid);
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 0bc6a6c80a5..55a0c1dcead 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -29,6 +29,11 @@
struct jffs2_inodirty;
+struct jffs2_mount_opts {
+ bool override_compr;
+ unsigned int compr;
+};
+
/* A struct for the overall file system control. Pointers to
jffs2_sb_info structs are named `c' in the source code.
Nee jffs_control
@@ -126,6 +131,7 @@ struct jffs2_sb_info {
#endif
struct jffs2_summary *summary; /* Summary information */
+ struct jffs2_mount_opts mount_opts;
#ifdef CONFIG_JFFS2_FS_XATTR
#define XATTRINDEX_HASHSIZE (57)
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 6c1755c59c0..ab65ee3ec85 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -176,7 +176,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags);
struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode,
struct jffs2_raw_inode *ri);
int jffs2_statfs (struct dentry *, struct kstatfs *);
-int jffs2_remount_fs (struct super_block *, int *, char *);
+int jffs2_do_remount_fs(struct super_block *, int *, char *);
int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
void jffs2_gc_release_inode(struct jffs2_sb_info *c,
struct jffs2_inode_info *f);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 8d8cd3419d0..28107ca136e 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -275,9 +275,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
else
c->mtd->unpoint(c->mtd, 0, c->mtd->size);
#endif
- if (s)
- kfree(s);
-
+ kfree(s);
return ret;
}
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index cfeb7164b08..0f20208df60 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -22,26 +22,29 @@
#include <linux/security.h>
#include "nodelist.h"
-/* ---- Initial Security Label Attachment -------------- */
-int jffs2_init_security(struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+/* ---- Initial Security Label(s) Attachment callback --- */
+int jffs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int rc;
- size_t len;
- void *value;
- char *name;
+ const struct xattr *xattr;
+ int err = 0;
- rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
- if (rc) {
- if (rc == -EOPNOTSUPP)
- return 0;
- return rc;
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, 0);
+ if (err < 0)
+ break;
}
- rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0);
+ return err;
+}
- kfree(name);
- kfree(value);
- return rc;
+/* ---- Initial Security Label(s) Attachment ----------- */
+int jffs2_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &jffs2_initxattrs, NULL);
}
/* ---- XATTR Handler for "security.*" ----------------- */
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 853b8e30008..e7e97445411 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -17,11 +17,13 @@
#include <linux/fs.h>
#include <linux/err.h>
#include <linux/mount.h>
+#include <linux/parser.h>
#include <linux/jffs2.h>
#include <linux/pagemap.h>
#include <linux/mtd/super.h>
#include <linux/ctype.h>
#include <linux/namei.h>
+#include <linux/seq_file.h>
#include <linux/exportfs.h>
#include "compr.h"
#include "nodelist.h"
@@ -75,6 +77,37 @@ static void jffs2_write_super(struct super_block *sb)
unlock_super(sb);
}
+static const char *jffs2_compr_name(unsigned int compr)
+{
+ switch (compr) {
+ case JFFS2_COMPR_MODE_NONE:
+ return "none";
+#ifdef CONFIG_JFFS2_LZO
+ case JFFS2_COMPR_MODE_FORCELZO:
+ return "lzo";
+#endif
+#ifdef CONFIG_JFFS2_ZLIB
+ case JFFS2_COMPR_MODE_FORCEZLIB:
+ return "zlib";
+#endif
+ default:
+ /* should never happen; programmer error */
+ WARN_ON(1);
+ return "";
+ }
+}
+
+static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+{
+ struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb);
+ struct jffs2_mount_opts *opts = &c->mount_opts;
+
+ if (opts->override_compr)
+ seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr));
+
+ return 0;
+}
+
static int jffs2_sync_fs(struct super_block *sb, int wait)
{
struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
@@ -133,6 +166,85 @@ static const struct export_operations jffs2_export_ops = {
.fh_to_parent = jffs2_fh_to_parent,
};
+/*
+ * JFFS2 mount options.
+ *
+ * Opt_override_compr: override default compressor
+ * Opt_err: just end of array marker
+ */
+enum {
+ Opt_override_compr,
+ Opt_err,
+};
+
+static const match_table_t tokens = {
+ {Opt_override_compr, "compr=%s"},
+ {Opt_err, NULL},
+};
+
+static int jffs2_parse_options(struct jffs2_sb_info *c, char *data)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *p, *name;
+
+ if (!data)
+ return 0;
+
+ while ((p = strsep(&data, ","))) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_override_compr:
+ name = match_strdup(&args[0]);
+
+ if (!name)
+ return -ENOMEM;
+ if (!strcmp(name, "none"))
+ c->mount_opts.compr = JFFS2_COMPR_MODE_NONE;
+#ifdef CONFIG_JFFS2_LZO
+ else if (!strcmp(name, "lzo"))
+ c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO;
+#endif
+#ifdef CONFIG_JFFS2_ZLIB
+ else if (!strcmp(name, "zlib"))
+ c->mount_opts.compr =
+ JFFS2_COMPR_MODE_FORCEZLIB;
+#endif
+ else {
+ printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"",
+ name);
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ c->mount_opts.override_compr = true;
+ break;
+ default:
+ printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n",
+ p);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+ struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+ int err;
+
+ err = jffs2_parse_options(c, data);
+ if (err)
+ return -EINVAL;
+
+ return jffs2_do_remount_fs(sb, flags, data);
+}
+
static const struct super_operations jffs2_super_operations =
{
.alloc_inode = jffs2_alloc_inode,
@@ -143,6 +255,7 @@ static const struct super_operations jffs2_super_operations =
.remount_fs = jffs2_remount_fs,
.evict_inode = jffs2_evict_inode,
.dirty_inode = jffs2_dirty_inode,
+ .show_options = jffs2_show_options,
.sync_fs = jffs2_sync_fs,
};
@@ -166,6 +279,12 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
c->os_priv = sb;
sb->s_fs_info = c;
+ ret = jffs2_parse_options(c, data);
+ if (ret) {
+ kfree(c);
+ return -EINVAL;
+ }
+
/* Initialize JFFS2 superblock locks, the further initialization will
* be done later */
mutex_init(&c->alloc_sem);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4515bea0268..b09e51d2f81 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -578,8 +578,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
if (!jffs2_is_writebuffered(c))
return 0;
- if (mutex_trylock(&c->alloc_sem)) {
- mutex_unlock(&c->alloc_sem);
+ if (!mutex_is_locked(&c->alloc_sem)) {
printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n");
BUG();
}
@@ -1026,7 +1025,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
struct mtd_oob_ops ops;
- ops.mode = MTD_OOB_AUTO;
+ ops.mode = MTD_OPS_AUTO_OOB;
ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
ops.oobbuf = c->oobbuf;
ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
@@ -1069,7 +1068,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
struct mtd_oob_ops ops;
int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
- ops.mode = MTD_OOB_AUTO;
+ ops.mode = MTD_OPS_AUTO_OOB;
ops.ooblen = cmlen;
ops.oobbuf = c->oobbuf;
ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
@@ -1095,7 +1094,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
struct mtd_oob_ops ops;
int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
- ops.mode = MTD_OOB_AUTO;
+ ops.mode = MTD_OPS_AUTO_OOB;
ops.ooblen = cmlen;
ops.oobbuf = (uint8_t *)&oob_cleanmarker;
ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index b78b2f978f0..1b6f15f191b 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -457,7 +457,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
/* read the page of fixed disk inode (AIT) in raw mode */
mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
if (mp == NULL) {
- ip->i_nlink = 1; /* Don't want iput() deleting it */
+ set_nlink(ip, 1); /* Don't want iput() deleting it */
iput(ip);
return (NULL);
}
@@ -469,7 +469,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
/* copy on-disk inode to in-memory inode */
if ((copy_from_dinode(dp, ip)) != 0) {
/* handle bad return by returning NULL for ip */
- ip->i_nlink = 1; /* Don't want iput() deleting it */
+ set_nlink(ip, 1); /* Don't want iput() deleting it */
iput(ip);
/* release the page */
release_metapage(mp);
@@ -3076,7 +3076,7 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
ip->i_mode |= 0001;
}
}
- ip->i_nlink = le32_to_cpu(dip->di_nlink);
+ set_nlink(ip, le32_to_cpu(dip->di_nlink));
jfs_ip->saved_uid = le32_to_cpu(dip->di_uid);
if (sbi->uid == -1)
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 2686531e235..c1a3e603279 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -157,7 +157,7 @@ fail_drop:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
fail_unlock:
- inode->i_nlink = 0;
+ clear_nlink(inode);
unlock_new_inode(inode);
fail_put:
iput(inode);
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 583636f745e..cc5f811ed38 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -67,6 +67,7 @@
#include <linux/buffer_head.h> /* for sync_blockdev() */
#include <linux/bio.h>
#include <linux/freezer.h>
+#include <linux/export.h>
#include <linux/delay.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index e17545e1566..a112ad96e47 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -172,7 +172,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
mutex_unlock(&JFS_IP(dip)->commit_mutex);
if (rc) {
free_ea_wmap(ip);
- ip->i_nlink = 0;
+ clear_nlink(ip);
unlock_new_inode(ip);
iput(ip);
} else {
@@ -292,7 +292,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
goto out3;
}
- ip->i_nlink = 2; /* for '.' */
+ set_nlink(ip, 2); /* for '.' */
ip->i_op = &jfs_dir_inode_operations;
ip->i_fop = &jfs_dir_operations;
@@ -311,7 +311,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
mutex_unlock(&JFS_IP(dip)->commit_mutex);
if (rc) {
free_ea_wmap(ip);
- ip->i_nlink = 0;
+ clear_nlink(ip);
unlock_new_inode(ip);
iput(ip);
} else {
@@ -844,7 +844,7 @@ static int jfs_link(struct dentry *old_dentry,
rc = txCommit(tid, 2, &iplist[0], 0);
if (rc) {
- ip->i_nlink--; /* never instantiated */
+ drop_nlink(ip); /* never instantiated */
iput(ip);
} else
d_instantiate(dentry, ip);
@@ -1048,7 +1048,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
mutex_unlock(&JFS_IP(dip)->commit_mutex);
if (rc) {
free_ea_wmap(ip);
- ip->i_nlink = 0;
+ clear_nlink(ip);
unlock_new_inode(ip);
iput(ip);
} else {
@@ -1433,7 +1433,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
mutex_unlock(&JFS_IP(dir)->commit_mutex);
if (rc) {
free_ea_wmap(ip);
- ip->i_nlink = 0;
+ clear_nlink(ip);
unlock_new_inode(ip);
iput(ip);
} else {
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 06c8a67cbe7..a44eff076c1 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -485,7 +485,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
goto out_unload;
}
inode->i_ino = 0;
- inode->i_nlink = 1;
inode->i_size = sb->s_bdev->bd_inode->i_size;
inode->i_mapping->a_ops = &jfs_metapage_aops;
insert_inode_hash(inode);
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index e87fedef23d..26683e15b3a 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -1089,38 +1089,37 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
}
#ifdef CONFIG_JFS_SECURITY
-int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int rc;
- size_t len;
- void *value;
- char *suffix;
+ const struct xattr *xattr;
+ tid_t *tid = fs_info;
char *name;
-
- rc = security_inode_init_security(inode, dir, qstr, &suffix, &value,
- &len);
- if (rc) {
- if (rc == -EOPNOTSUPP)
- return 0;
- return rc;
- }
- name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix),
- GFP_NOFS);
- if (!name) {
- rc = -ENOMEM;
- goto kmalloc_failed;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1, GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ break;
+ }
+ strcpy(name, XATTR_SECURITY_PREFIX);
+ strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+
+ err = __jfs_setxattr(*tid, inode, name,
+ xattr->value, xattr->value_len, 0);
+ kfree(name);
+ if (err < 0)
+ break;
}
- strcpy(name, XATTR_SECURITY_PREFIX);
- strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
-
- rc = __jfs_setxattr(tid, inode, name, value, len, 0);
-
- kfree(name);
-kmalloc_failed:
- kfree(suffix);
- kfree(value);
+ return err;
+}
- return rc;
+int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &jfs_initxattrs, &tid);
}
#endif
diff --git a/fs/libfs.c b/fs/libfs.c
index c18e9a1235b..f6d411eef1e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -490,7 +490,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
root = d_alloc_root(inode);
if (!root) {
iput(inode);
@@ -510,8 +510,10 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
if (!dentry)
goto out;
inode = new_inode(s);
- if (!inode)
+ if (!inode) {
+ dput(dentry);
goto out;
+ }
inode->i_mode = S_IFREG | files->mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_fop = files->ops;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index b7c99bfb3da..6f29836ec0c 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -316,14 +316,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
struct hlist_node *pos;
struct nlm_host *host = NULL;
struct nsm_handle *nsm = NULL;
- struct sockaddr_in sin = {
- .sin_family = AF_INET,
- };
- struct sockaddr_in6 sin6 = {
- .sin6_family = AF_INET6,
- };
- struct sockaddr *src_sap;
- size_t src_len = rqstp->rq_addrlen;
+ struct sockaddr *src_sap = svc_daddr(rqstp);
+ size_t src_len = rqstp->rq_daddrlen;
struct nlm_lookup_host_info ni = {
.server = 1,
.sap = svc_addr(rqstp),
@@ -340,21 +334,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
mutex_lock(&nlm_host_mutex);
- switch (ni.sap->sa_family) {
- case AF_INET:
- sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
- src_sap = (struct sockaddr *)&sin;
- break;
- case AF_INET6:
- ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
- src_sap = (struct sockaddr *)&sin6;
- break;
- default:
- dprintk("lockd: %s failed; unrecognized address family\n",
- __func__);
- goto out;
- }
-
if (time_after_eq(jiffies, next_gc))
nlm_gc_hosts();
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index abfff9d7979..c061b9aa7dd 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -282,7 +282,7 @@ int lockd_up(void)
/*
* Create the kernel thread and wait for it to start.
*/
- nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
+ nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
if (IS_ERR(nlmsvc_rqst)) {
error = PTR_ERR(nlmsvc_rqst);
nlmsvc_rqst = NULL;
diff --git a/fs/locks.c b/fs/locks.c
index 703f545097d..637694bf3a0 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -60,7 +60,7 @@
*
* Initial implementation of mandatory locks. SunOS turned out to be
* a rotten model, so I implemented the "obvious" semantics.
- * See 'Documentation/mandatory.txt' for details.
+ * See 'Documentation/filesystems/mandatory-locking.txt' for details.
* Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
*
* Don't allow mandatory locks on mmap()'ed files. Added simple functions to
@@ -133,6 +133,20 @@
#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
#define IS_LEASE(fl) (fl->fl_flags & FL_LEASE)
+static bool lease_breaking(struct file_lock *fl)
+{
+ return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
+}
+
+static int target_leasetype(struct file_lock *fl)
+{
+ if (fl->fl_flags & FL_UNLOCK_PENDING)
+ return F_UNLCK;
+ if (fl->fl_flags & FL_DOWNGRADE_PENDING)
+ return F_RDLCK;
+ return fl->fl_type;
+}
+
int leases_enable = 1;
int lease_break_time = 45;
@@ -1119,6 +1133,17 @@ int locks_mandatory_area(int read_write, struct inode *inode,
EXPORT_SYMBOL(locks_mandatory_area);
+static void lease_clear_pending(struct file_lock *fl, int arg)
+{
+ switch (arg) {
+ case F_UNLCK:
+ fl->fl_flags &= ~FL_UNLOCK_PENDING;
+ /* fall through: */
+ case F_RDLCK:
+ fl->fl_flags &= ~FL_DOWNGRADE_PENDING;
+ }
+}
+
/* We already had a lease on this file; just change its type */
int lease_modify(struct file_lock **before, int arg)
{
@@ -1127,6 +1152,7 @@ int lease_modify(struct file_lock **before, int arg)
if (error)
return error;
+ lease_clear_pending(fl, arg);
locks_wake_up_blocks(fl);
if (arg == F_UNLCK)
locks_delete_lock(before);
@@ -1135,19 +1161,25 @@ int lease_modify(struct file_lock **before, int arg)
EXPORT_SYMBOL(lease_modify);
+static bool past_time(unsigned long then)
+{
+ if (!then)
+ /* 0 is a special value meaning "this never expires": */
+ return false;
+ return time_after(jiffies, then);
+}
+
static void time_out_leases(struct inode *inode)
{
struct file_lock **before;
struct file_lock *fl;
before = &inode->i_flock;
- while ((fl = *before) && IS_LEASE(fl) && (fl->fl_type & F_INPROGRESS)) {
- if ((fl->fl_break_time == 0)
- || time_before(jiffies, fl->fl_break_time)) {
- before = &fl->fl_next;
- continue;
- }
- lease_modify(before, fl->fl_type & ~F_INPROGRESS);
+ while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
+ if (past_time(fl->fl_downgrade_time))
+ lease_modify(before, F_RDLCK);
+ if (past_time(fl->fl_break_time))
+ lease_modify(before, F_UNLCK);
if (fl == *before) /* lease_modify may have freed fl */
before = &fl->fl_next;
}
@@ -1165,7 +1197,7 @@ static void time_out_leases(struct inode *inode)
*/
int __break_lease(struct inode *inode, unsigned int mode)
{
- int error = 0, future;
+ int error = 0;
struct file_lock *new_fl, *flock;
struct file_lock *fl;
unsigned long break_time;
@@ -1173,6 +1205,8 @@ int __break_lease(struct inode *inode, unsigned int mode)
int want_write = (mode & O_ACCMODE) != O_RDONLY;
new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
+ if (IS_ERR(new_fl))
+ return PTR_ERR(new_fl);
lock_flocks();
@@ -1182,30 +1216,13 @@ int __break_lease(struct inode *inode, unsigned int mode)
if ((flock == NULL) || !IS_LEASE(flock))
goto out;
+ if (!locks_conflict(flock, new_fl))
+ goto out;
+
for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next)
if (fl->fl_owner == current->files)
i_have_this_lease = 1;
- if (want_write) {
- /* If we want write access, we have to revoke any lease. */
- future = F_UNLCK | F_INPROGRESS;
- } else if (flock->fl_type & F_INPROGRESS) {
- /* If the lease is already being broken, we just leave it */
- future = flock->fl_type;
- } else if (flock->fl_type & F_WRLCK) {
- /* Downgrade the exclusive lease to a read-only lease. */
- future = F_RDLCK | F_INPROGRESS;
- } else {
- /* the existing lease was read-only, so we can read too. */
- goto out;
- }
-
- if (IS_ERR(new_fl) && !i_have_this_lease
- && ((mode & O_NONBLOCK) == 0)) {
- error = PTR_ERR(new_fl);
- goto out;
- }
-
break_time = 0;
if (lease_break_time > 0) {
break_time = jiffies + lease_break_time * HZ;
@@ -1214,12 +1231,18 @@ int __break_lease(struct inode *inode, unsigned int mode)
}
for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) {
- if (fl->fl_type != future) {
- fl->fl_type = future;
+ if (want_write) {
+ if (fl->fl_flags & FL_UNLOCK_PENDING)
+ continue;
+ fl->fl_flags |= FL_UNLOCK_PENDING;
fl->fl_break_time = break_time;
- /* lease must have lmops break callback */
- fl->fl_lmops->lm_break(fl);
+ } else {
+ if (lease_breaking(flock))
+ continue;
+ fl->fl_flags |= FL_DOWNGRADE_PENDING;
+ fl->fl_downgrade_time = break_time;
}
+ fl->fl_lmops->lm_break(fl);
}
if (i_have_this_lease || (mode & O_NONBLOCK)) {
@@ -1243,10 +1266,13 @@ restart:
if (error >= 0) {
if (error == 0)
time_out_leases(inode);
- /* Wait for the next lease that has not been broken yet */
+ /*
+ * Wait for the next conflicting lease that has not been
+ * broken yet
+ */
for (flock = inode->i_flock; flock && IS_LEASE(flock);
flock = flock->fl_next) {
- if (flock->fl_type & F_INPROGRESS)
+ if (locks_conflict(new_fl, flock))
goto restart;
}
error = 0;
@@ -1254,8 +1280,7 @@ restart:
out:
unlock_flocks();
- if (!IS_ERR(new_fl))
- locks_free_lock(new_fl);
+ locks_free_lock(new_fl);
return error;
}
@@ -1314,7 +1339,7 @@ int fcntl_getlease(struct file *filp)
for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
fl = fl->fl_next) {
if (fl->fl_file == filp) {
- type = fl->fl_type & ~F_INPROGRESS;
+ type = target_leasetype(fl);
break;
}
}
@@ -1322,50 +1347,23 @@ int fcntl_getlease(struct file *filp)
return type;
}
-/**
- * generic_setlease - sets a lease on an open file
- * @filp: file pointer
- * @arg: type of lease to obtain
- * @flp: input - file_lock to use, output - file_lock inserted
- *
- * The (input) flp->fl_lmops->lm_break function is required
- * by break_lease().
- *
- * Called with file_lock_lock held.
- */
-int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
+int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
{
struct file_lock *fl, **before, **my_before = NULL, *lease;
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
- int error, rdlease_count = 0, wrlease_count = 0;
+ int error;
lease = *flp;
- error = -EACCES;
- if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
- goto out;
- error = -EINVAL;
- if (!S_ISREG(inode->i_mode))
+ error = -EAGAIN;
+ if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
goto out;
- error = security_file_lock(filp, arg);
- if (error)
+ if ((arg == F_WRLCK)
+ && ((dentry->d_count > 1)
+ || (atomic_read(&inode->i_count) > 1)))
goto out;
- time_out_leases(inode);
-
- BUG_ON(!(*flp)->fl_lmops->lm_break);
-
- if (arg != F_UNLCK) {
- error = -EAGAIN;
- if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
- goto out;
- if ((arg == F_WRLCK)
- && ((dentry->d_count > 1)
- || (atomic_read(&inode->i_count) > 1)))
- goto out;
- }
-
/*
* At this point, we know that if there is an exclusive
* lease on this file, then we hold it on this filp
@@ -1374,27 +1372,28 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
* then the file is not open by anyone (including us)
* except for this filp.
*/
+ error = -EAGAIN;
for (before = &inode->i_flock;
((fl = *before) != NULL) && IS_LEASE(fl);
before = &fl->fl_next) {
- if (fl->fl_file == filp)
+ if (fl->fl_file == filp) {
my_before = before;
- else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
- /*
- * Someone is in the process of opening this
- * file for writing so we may not take an
- * exclusive lease on it.
- */
- wrlease_count++;
- else
- rdlease_count++;
+ continue;
+ }
+ /*
+ * No exclusive leases if someone else has a lease on
+ * this file:
+ */
+ if (arg == F_WRLCK)
+ goto out;
+ /*
+ * Modifying our existing lease is OK, but no getting a
+ * new lease if someone else is opening for write:
+ */
+ if (fl->fl_flags & FL_UNLOCK_PENDING)
+ goto out;
}
- error = -EAGAIN;
- if ((arg == F_RDLCK && (wrlease_count > 0)) ||
- (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0)))
- goto out;
-
if (my_before != NULL) {
error = lease->fl_lmops->lm_change(my_before, arg);
if (!error)
@@ -1402,9 +1401,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
goto out;
}
- if (arg == F_UNLCK)
- goto out;
-
error = -EINVAL;
if (!leases_enable)
goto out;
@@ -1415,6 +1411,62 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
out:
return error;
}
+
+int generic_delete_lease(struct file *filp, struct file_lock **flp)
+{
+ struct file_lock *fl, **before;
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+
+ for (before = &inode->i_flock;
+ ((fl = *before) != NULL) && IS_LEASE(fl);
+ before = &fl->fl_next) {
+ if (fl->fl_file != filp)
+ continue;
+ return (*flp)->fl_lmops->lm_change(before, F_UNLCK);
+ }
+ return -EAGAIN;
+}
+
+/**
+ * generic_setlease - sets a lease on an open file
+ * @filp: file pointer
+ * @arg: type of lease to obtain
+ * @flp: input - file_lock to use, output - file_lock inserted
+ *
+ * The (input) flp->fl_lmops->lm_break function is required
+ * by break_lease().
+ *
+ * Called with file_lock_lock held.
+ */
+int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
+{
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ int error;
+
+ if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
+ return -EACCES;
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+ error = security_file_lock(filp, arg);
+ if (error)
+ return error;
+
+ time_out_leases(inode);
+
+ BUG_ON(!(*flp)->fl_lmops->lm_break);
+
+ switch (arg) {
+ case F_UNLCK:
+ return generic_delete_lease(filp, flp);
+ case F_RDLCK:
+ case F_WRLCK:
+ return generic_add_lease(filp, arg, flp);
+ default:
+ BUG();
+ }
+}
EXPORT_SYMBOL(generic_setlease);
static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
@@ -2126,7 +2178,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
}
} else if (IS_LEASE(fl)) {
seq_printf(f, "LEASE ");
- if (fl->fl_type & F_INPROGRESS)
+ if (lease_breaking(fl))
seq_printf(f, "BREAKING ");
else if (fl->fl_file)
seq_printf(f, "ACTIVE ");
@@ -2142,7 +2194,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
: (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
} else {
seq_printf(f, "%s ",
- (fl->fl_type & F_INPROGRESS)
+ (lease_breaking(fl))
? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ "
: (fl->fl_type & F_WRLCK) ? "WRITE" : "READ ");
}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index b3ff3d89416..b7d7f67cee5 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -197,7 +197,7 @@ static int logfs_remove_inode(struct inode *inode)
{
int ret;
- inode->i_nlink--;
+ drop_nlink(inode);
ret = write_inode(inode);
LOGFS_BUG_ON(ret, inode->i_sb);
return ret;
@@ -433,7 +433,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry,
ta = kzalloc(sizeof(*ta), GFP_KERNEL);
if (!ta) {
- inode->i_nlink--;
+ drop_nlink(inode);
iput(inode);
return -ENOMEM;
}
@@ -456,7 +456,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry,
abort_transaction(inode, ta);
li->li_flags |= LOGFS_IF_STILLBORN;
/* FIXME: truncate symlink */
- inode->i_nlink--;
+ drop_nlink(inode);
iput(inode);
goto out;
}
@@ -563,7 +563,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
ihold(inode);
- inode->i_nlink++;
+ inc_nlink(inode);
mark_inode_dirty_sync(inode);
return __logfs_create(dir, dentry, inode, NULL, 0);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index edfea7a3a74..7e441ad5f79 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -93,7 +93,7 @@ static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
/* inode->i_nlink == 0 can be true when called from
* block validator */
/* set i_nlink to 0 to prevent caching */
- inode->i_nlink = 0;
+ clear_nlink(inode);
logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
iget_failed(inode);
if (!err)
@@ -199,7 +199,6 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode)
inode->i_blocks = 0;
inode->i_ctime = CURRENT_TIME;
inode->i_mtime = CURRENT_TIME;
- inode->i_nlink = 1;
li->li_refcount = 1;
INIT_LIST_HEAD(&li->li_freeing_list);
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index f22d108bfa5..398ecff6e54 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -618,7 +618,6 @@ static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
void emergency_read_end(struct page *page);
void logfs_crash_dump(struct super_block *sb);
-void *memchr_inv(const void *s, int c, size_t n);
int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
int logfs_check_ds(struct logfs_disk_super *ds);
int logfs_write_sb(struct super_block *sb);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index d8d09380c7d..2ac4217b790 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -126,7 +126,7 @@ static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
inode->i_atime = be64_to_timespec(di->di_atime);
inode->i_ctime = be64_to_timespec(di->di_ctime);
inode->i_mtime = be64_to_timespec(di->di_mtime);
- inode->i_nlink = be32_to_cpu(di->di_refcount);
+ set_nlink(inode, be32_to_cpu(di->di_refcount));
inode->i_generation = be32_to_cpu(di->di_generation);
switch (inode->i_mode & S_IFMT) {
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index ce03a182c77..e795c234ea3 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -13,6 +13,7 @@
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
+#include <linux/module.h>
#include <linux/mtd/mtd.h>
#include <linux/statfs.h>
#include <linux/buffer_head.h>
@@ -91,28 +92,6 @@ void logfs_crash_dump(struct super_block *sb)
}
/*
- * TODO: move to lib/string.c
- */
-/**
- * memchr_inv - Find a character in an area of memory.
- * @s: The memory area
- * @c: The byte to search for
- * @n: The size of the area.
- *
- * returns the address of the first character other than @c, or %NULL
- * if the whole buffer contains just @c.
- */
-void *memchr_inv(const void *s, int c, size_t n)
-{
- const unsigned char *p = s;
- while (n-- != 0)
- if ((unsigned char)c != *p++)
- return (void *)(p - 1);
-
- return NULL;
-}
-
-/*
* FIXME: There should be a reserve for root, similar to ext2.
*/
int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 3f32bcb0d9b..ef175cb8cfd 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -16,38 +16,26 @@
#include <linux/bitops.h>
#include <linux/sched.h>
-static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 };
-
static DEFINE_SPINLOCK(bitmap_lock);
-static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits)
+/*
+ * bitmap consists of blocks filled with 16bit words
+ * bit set == busy, bit clear == free
+ * endianness is a mess, but for counting zero bits it really doesn't matter...
+ */
+static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits)
{
- unsigned i, j, sum = 0;
- struct buffer_head *bh;
-
- for (i=0; i<numblocks-1; i++) {
- if (!(bh=map[i]))
- return(0);
- for (j=0; j<bh->b_size; j++)
- sum += nibblemap[bh->b_data[j] & 0xf]
- + nibblemap[(bh->b_data[j]>>4) & 0xf];
- }
+ __u32 sum = 0;
+ unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8);
- if (numblocks==0 || !(bh=map[numblocks-1]))
- return(0);
- i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2;
- for (j=0; j<i; j++) {
- sum += nibblemap[bh->b_data[j] & 0xf]
- + nibblemap[(bh->b_data[j]>>4) & 0xf];
+ while (blocks--) {
+ unsigned words = blocksize / 2;
+ __u16 *p = (__u16 *)(*map++)->b_data;
+ while (words--)
+ sum += 16 - hweight16(*p++);
}
- i = numbits%16;
- if (i!=0) {
- i = *(__u16 *)(&bh->b_data[j]) | ~((1<<i) - 1);
- sum += nibblemap[i & 0xf] + nibblemap[(i>>4) & 0xf];
- sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf];
- }
- return(sum);
+ return sum;
}
void minix_free_block(struct inode *inode, unsigned long block)
@@ -105,10 +93,12 @@ int minix_new_block(struct inode * inode)
return 0;
}
-unsigned long minix_count_free_blocks(struct minix_sb_info *sbi)
+unsigned long minix_count_free_blocks(struct super_block *sb)
{
- return (count_free(sbi->s_zmap, sbi->s_zmap_blocks,
- sbi->s_nzones - sbi->s_firstdatazone + 1)
+ struct minix_sb_info *sbi = minix_sb(sb);
+ u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1);
+
+ return (count_free(sbi->s_zmap, sb->s_blocksize, bits)
<< sbi->s_log_zone_size);
}
@@ -273,7 +263,10 @@ struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
return inode;
}
-unsigned long minix_count_free_inodes(struct minix_sb_info *sbi)
+unsigned long minix_count_free_inodes(struct super_block *sb)
{
- return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1);
+ struct minix_sb_info *sbi = minix_sb(sb);
+ u32 bits = sbi->s_ninodes + 1;
+
+ return count_free(sbi->s_imap, sb->s_blocksize, bits);
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index e7d23e25bf1..4d46a6a5907 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -263,6 +263,26 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
goto out_no_root;
}
+ /* Apparently minix can create filesystems that allocate more blocks for
+ * the bitmaps than needed. We simply ignore that, but verify it didn't
+ * create one with not enough blocks and bail out if so.
+ */
+ block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize);
+ if (sbi->s_imap_blocks < block) {
+ printk("MINIX-fs: file system does not have enough "
+ "imap blocks allocated. Refusing to mount\n");
+ goto out_iput;
+ }
+
+ block = minix_blocks_needed(
+ (sbi->s_nzones - (sbi->s_firstdatazone + 1)),
+ s->s_blocksize);
+ if (sbi->s_zmap_blocks < block) {
+ printk("MINIX-fs: file system does not have enough "
+ "zmap blocks allocated. Refusing to mount.\n");
+ goto out_iput;
+ }
+
ret = -ENOMEM;
s->s_root = d_alloc_root(root_inode);
if (!s->s_root)
@@ -276,9 +296,10 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
if (!(sbi->s_mount_state & MINIX_VALID_FS))
printk("MINIX-fs: mounting unchecked file system, "
"running fsck is recommended\n");
- else if (sbi->s_mount_state & MINIX_ERROR_FS)
+ else if (sbi->s_mount_state & MINIX_ERROR_FS)
printk("MINIX-fs: mounting file system with errors, "
"running fsck is recommended\n");
+
return 0;
out_iput:
@@ -339,10 +360,10 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = sb->s_magic;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
- buf->f_bfree = minix_count_free_blocks(sbi);
+ buf->f_bfree = minix_count_free_blocks(sb);
buf->f_bavail = buf->f_bfree;
buf->f_files = sbi->s_ninodes;
- buf->f_ffree = minix_count_free_inodes(sbi);
+ buf->f_ffree = minix_count_free_inodes(sb);
buf->f_namelen = sbi->s_namelen;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
@@ -446,7 +467,7 @@ static struct inode *V1_minix_iget(struct inode *inode)
inode->i_mode = raw_inode->i_mode;
inode->i_uid = (uid_t)raw_inode->i_uid;
inode->i_gid = (gid_t)raw_inode->i_gid;
- inode->i_nlink = raw_inode->i_nlinks;
+ set_nlink(inode, raw_inode->i_nlinks);
inode->i_size = raw_inode->i_size;
inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time;
inode->i_mtime.tv_nsec = 0;
@@ -479,7 +500,7 @@ static struct inode *V2_minix_iget(struct inode *inode)
inode->i_mode = raw_inode->i_mode;
inode->i_uid = (uid_t)raw_inode->i_uid;
inode->i_gid = (gid_t)raw_inode->i_gid;
- inode->i_nlink = raw_inode->i_nlinks;
+ set_nlink(inode, raw_inode->i_nlinks);
inode->i_size = raw_inode->i_size;
inode->i_mtime.tv_sec = raw_inode->i_mtime;
inode->i_atime.tv_sec = raw_inode->i_atime;
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 341e2122879..26bbd55e82e 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -48,10 +48,10 @@ extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, stru
extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
extern struct inode * minix_new_inode(const struct inode *, int, int *);
extern void minix_free_inode(struct inode * inode);
-extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi);
+extern unsigned long minix_count_free_inodes(struct super_block *sb);
extern int minix_new_block(struct inode * inode);
extern void minix_free_block(struct inode *inode, unsigned long block);
-extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi);
+extern unsigned long minix_count_free_blocks(struct super_block *sb);
extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *);
extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
@@ -88,6 +88,11 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
return list_entry(inode, struct minix_inode_info, vfs_inode);
}
+static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize)
+{
+ return DIV_ROUND_UP(bits, blocksize * 8);
+}
+
#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \
defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
@@ -125,7 +130,7 @@ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size)
if (!size)
return 0;
- size = (size >> 4) + ((size & 15) > 0);
+ size >>= 4;
while (*p++ == 0xffff) {
if (--size == 0)
return (p - addr) << 4;
diff --git a/fs/namei.c b/fs/namei.c
index 0b3138de2a3..5008f01787f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -137,7 +137,7 @@ static int do_getname(const char __user *filename, char *page)
return retval;
}
-static char *getname_flags(const char __user * filename, int flags)
+static char *getname_flags(const char __user *filename, int flags, int *empty)
{
char *tmp, *result;
@@ -148,6 +148,8 @@ static char *getname_flags(const char __user * filename, int flags)
result = tmp;
if (retval < 0) {
+ if (retval == -ENOENT && empty)
+ *empty = 1;
if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
__putname(tmp);
result = ERR_PTR(retval);
@@ -160,7 +162,7 @@ static char *getname_flags(const char __user * filename, int flags)
char *getname(const char __user * filename)
{
- return getname_flags(filename, 0);
+ return getname_flags(filename, 0, 0);
}
#ifdef CONFIG_AUDITSYSCALL
@@ -221,14 +223,12 @@ static int check_acl(struct inode *inode, int mask)
}
/*
- * This does basic POSIX ACL permission checking
+ * This does the basic permission checking
*/
static int acl_permission_check(struct inode *inode, int mask)
{
unsigned int mode = inode->i_mode;
- mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK;
-
if (current_user_ns() != inode_userns(inode))
goto other_perms;
@@ -257,7 +257,7 @@ other_perms:
/**
* generic_permission - check for access rights on a Posix-like filesystem
* @inode: inode to check access rights for
- * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
*
* Used to check for read/write/execute permissions on a file.
* We use "fsuid" for this, letting us set arbitrary permissions
@@ -273,7 +273,7 @@ int generic_permission(struct inode *inode, int mask)
int ret;
/*
- * Do the basic POSIX ACL permission checks.
+ * Do the basic permission checks.
*/
ret = acl_permission_check(inode, mask);
if (ret != -EACCES)
@@ -331,12 +331,14 @@ static inline int do_inode_permission(struct inode *inode, int mask)
/**
* inode_permission - check for access rights to a given inode
* @inode: inode to check permission on
- * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
*
* Used to check for read/write/execute permissions on an inode.
* We use "fsuid" for this, letting us set arbitrary permissions
* for filesystem access without changing the "normal" uids which
* are used for other things.
+ *
+ * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
*/
int inode_permission(struct inode *inode, int mask)
{
@@ -850,7 +852,7 @@ static int follow_managed(struct path *path, unsigned flags)
mntput(path->mnt);
if (ret == -EISDIR)
ret = 0;
- return ret;
+ return ret < 0 ? ret : need_mntput;
}
int follow_down_one(struct path *path)
@@ -898,6 +900,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
break;
path->mnt = mounted;
path->dentry = mounted->mnt_root;
+ nd->flags |= LOOKUP_JUMPED;
nd->seq = read_seqcount_begin(&path->dentry->d_seq);
/*
* Update the inode too. We don't need to re-check the
@@ -1211,6 +1214,8 @@ retry:
path_put_conditional(path, nd);
return err;
}
+ if (err)
+ nd->flags |= LOOKUP_JUMPED;
*inode = path->dentry->d_inode;
return 0;
}
@@ -1798,11 +1803,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
return __lookup_hash(&this, base, NULL);
}
-int user_path_at(int dfd, const char __user *name, unsigned flags,
- struct path *path)
+int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
+ struct path *path, int *empty)
{
struct nameidata nd;
- char *tmp = getname_flags(name, flags);
+ char *tmp = getname_flags(name, flags, empty);
int err = PTR_ERR(tmp);
if (!IS_ERR(tmp)) {
@@ -1816,6 +1821,12 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
return err;
}
+int user_path_at(int dfd, const char __user *name, unsigned flags,
+ struct path *path)
+{
+ return user_path_at_empty(dfd, name, flags, path, 0);
+}
+
static int user_path_parent(int dfd, const char __user *path,
struct nameidata *nd, char **name)
{
@@ -2035,10 +2046,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
if (flag & O_NOATIME && !inode_owner_or_capable(inode))
return -EPERM;
- /*
- * Ensure there are no outstanding leases on the file.
- */
- return break_lease(inode, flag);
+ return 0;
}
static int handle_truncate(struct file *filp)
@@ -2141,6 +2149,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
}
/* create side of things */
+ /*
+ * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been
+ * cleared when we got to the last component we are about to look up
+ */
error = complete_walk(nd);
if (error)
return ERR_PTR(error);
@@ -2209,6 +2221,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
if (error < 0)
goto exit_dput;
+ if (error)
+ nd->flags |= LOOKUP_JUMPED;
+
error = -ENOENT;
if (!path->dentry->d_inode)
goto exit_dput;
@@ -2218,6 +2233,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
path_to_nameidata(path, nd);
nd->inode = path->dentry->d_inode;
+ /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
+ error = complete_walk(nd);
+ if (error)
+ goto exit;
error = -EISDIR;
if (S_ISDIR(nd->inode->i_mode))
goto exit;
diff --git a/fs/namespace.c b/fs/namespace.c
index b4febb29d3b..cfc6d4448aa 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1048,15 +1048,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
if (err)
goto out;
seq_putc(m, ' ');
- seq_path_root(m, &mnt_path, &root, " \t\n\\");
- if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) {
- /*
- * Mountpoint is outside root, discard that one. Ugly,
- * but less so than trying to do that in iterator in a
- * race-free way (due to renames).
- */
- return SEQ_SKIP;
- }
+
+ /* mountpoints outside of chroot jail will give SEQ_SKIP on this */
+ err = seq_path_root(m, &mnt_path, &root, " \t\n\\");
+ if (err)
+ goto out;
+
seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
show_mnt_opts(m, mnt);
@@ -1109,6 +1106,7 @@ static int show_vfsstat(struct seq_file *m, void *v)
/* device */
if (mnt->mnt_sb->s_op->show_devname) {
+ seq_puts(m, "device ");
err = mnt->mnt_sb->s_op->show_devname(m, mnt);
} else {
if (mnt->mnt_devname) {
@@ -2482,11 +2480,43 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
__mnt_make_longterm(mnt);
new_ns->root = mnt;
list_add(&new_ns->list, &new_ns->root->mnt_list);
+ } else {
+ mntput(mnt);
}
return new_ns;
}
EXPORT_SYMBOL(create_mnt_ns);
+struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
+{
+ struct mnt_namespace *ns;
+ struct super_block *s;
+ struct path path;
+ int err;
+
+ ns = create_mnt_ns(mnt);
+ if (IS_ERR(ns))
+ return ERR_CAST(ns);
+
+ err = vfs_path_lookup(mnt->mnt_root, mnt,
+ name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
+
+ put_mnt_ns(ns);
+
+ if (err)
+ return ERR_PTR(err);
+
+ /* trade a vfsmount reference for active sb one */
+ s = path.mnt->mnt_sb;
+ atomic_inc(&s->s_active);
+ mntput(path.mnt);
+ /* lock the sucker */
+ down_write(&s->s_umount);
+ /* ... and return the root of (sub)tree on it */
+ return path.dentry;
+}
+EXPORT_SYMBOL(mount_subtree);
+
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
char __user *, type, unsigned long, flags, void __user *, data)
{
@@ -2743,3 +2773,8 @@ void kern_unmount(struct vfsmount *mnt)
}
}
EXPORT_SYMBOL(kern_unmount);
+
+bool our_mnt(struct vfsmount *mnt)
+{
+ return check_mnt(mnt);
+}
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 202f370526a..cbd1a61c110 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -228,7 +228,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_uid = server->m.uid;
inode->i_gid = server->m.gid;
@@ -548,7 +548,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY);
if (error)
- goto out_bdi;
+ goto out_fput;
server->ncp_filp = ncp_filp;
server->ncp_sock = sock;
@@ -559,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
error = -EBADF;
server->info_filp = fget(data.info_fd);
if (!server->info_filp)
- goto out_fput;
+ goto out_bdi;
error = -ENOTSOCK;
sock_inode = server->info_filp->f_path.dentry->d_inode;
if (!S_ISSOCK(sock_inode->i_mode))
@@ -746,9 +746,9 @@ out_nls:
out_fput2:
if (server->info_filp)
fput(server->info_filp);
-out_fput:
- bdi_destroy(&server->bdi);
out_bdi:
+ bdi_destroy(&server->bdi);
+out_fput:
/* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
*
* The previously used put_filp(ncp_filp); was bogus, since
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9561c8fc8bd..281ae95932c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -176,17 +176,6 @@ retry:
return bio;
}
-static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
-{
- if (lseg->pls_range.iomode == IOMODE_RW) {
- dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
- set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
- } else {
- dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
- set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
- }
-}
-
/* This is basically copied from mpage_end_io_read */
static void bl_end_io_read(struct bio *bio, int err)
{
@@ -206,7 +195,7 @@ static void bl_end_io_read(struct bio *bio, int err)
if (!uptodate) {
if (!rdata->pnfs_error)
rdata->pnfs_error = -EIO;
- bl_set_lo_fail(rdata->lseg);
+ pnfs_set_lo_fail(rdata->lseg);
}
bio_put(bio);
put_parallel(par);
@@ -303,6 +292,7 @@ bl_read_pagelist(struct nfs_read_data *rdata)
bl_end_io_read, par);
if (IS_ERR(bio)) {
rdata->pnfs_error = PTR_ERR(bio);
+ bio = NULL;
goto out;
}
}
@@ -370,7 +360,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
if (!uptodate) {
if (!wdata->pnfs_error)
wdata->pnfs_error = -EIO;
- bl_set_lo_fail(wdata->lseg);
+ pnfs_set_lo_fail(wdata->lseg);
}
bio_put(bio);
put_parallel(par);
@@ -386,7 +376,7 @@ static void bl_end_io_write(struct bio *bio, int err)
if (!uptodate) {
if (!wdata->pnfs_error)
wdata->pnfs_error = -EIO;
- bl_set_lo_fail(wdata->lseg);
+ pnfs_set_lo_fail(wdata->lseg);
}
bio_put(bio);
put_parallel(par);
@@ -543,6 +533,11 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
fill_invalid_ext:
dprintk("%s need to zero %d pages\n", __func__, npg_zero);
for (;npg_zero > 0; npg_zero--) {
+ if (bl_is_sector_init(be->be_inval, isect)) {
+ dprintk("isect %llu already init\n",
+ (unsigned long long)isect);
+ goto next_page;
+ }
/* page ref released in bl_end_io_write_zero */
index = isect >> PAGE_CACHE_SECTOR_SHIFT;
dprintk("%s zero %dth page: index %lu isect %llu\n",
@@ -562,8 +557,7 @@ fill_invalid_ext:
* PageUptodate: It was read before
* sector_initialized: already written out
*/
- if (PageDirty(page) || PageWriteback(page) ||
- bl_is_sector_init(be->be_inval, isect)) {
+ if (PageDirty(page) || PageWriteback(page)) {
print_page(page);
unlock_page(page);
page_cache_release(page);
@@ -592,6 +586,7 @@ fill_invalid_ext:
bl_end_io_write_zero, par);
if (IS_ERR(bio)) {
wdata->pnfs_error = PTR_ERR(bio);
+ bio = NULL;
goto out;
}
/* FIXME: This should be done in bi_end_io */
@@ -640,6 +635,7 @@ next_page:
bl_end_io_write, par);
if (IS_ERR(bio)) {
wdata->pnfs_error = PTR_ERR(bio);
+ bio = NULL;
goto out;
}
isect += PAGE_CACHE_SECTORS;
@@ -805,7 +801,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
struct nfs4_deviceid *d_id)
{
struct pnfs_device *dev;
- struct pnfs_block_dev *rv = NULL;
+ struct pnfs_block_dev *rv;
u32 max_resp_sz;
int max_pages;
struct page **pages = NULL;
@@ -823,18 +819,20 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
dev = kmalloc(sizeof(*dev), GFP_NOFS);
if (!dev) {
dprintk("%s kmalloc failed\n", __func__);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
if (pages == NULL) {
kfree(dev);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
for (i = 0; i < max_pages; i++) {
pages[i] = alloc_page(GFP_NOFS);
- if (!pages[i])
+ if (!pages[i]) {
+ rv = ERR_PTR(-ENOMEM);
goto out_free;
+ }
}
memcpy(&dev->dev_id, d_id, sizeof(*d_id));
@@ -847,8 +845,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
rc = nfs4_proc_getdeviceinfo(server, dev);
dprintk("%s getdevice info returns %d\n", __func__, rc);
- if (rc)
+ if (rc) {
+ rv = ERR_PTR(rc);
goto out_free;
+ }
rv = nfs4_blk_decode_device(server, dev);
out_free:
@@ -866,7 +866,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
struct pnfs_devicelist *dlist = NULL;
struct pnfs_block_dev *bdev;
LIST_HEAD(block_disklist);
- int status = 0, i;
+ int status, i;
dprintk("%s enter\n", __func__);
@@ -898,8 +898,8 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
for (i = 0; i < dlist->num_devs; i++) {
bdev = nfs4_blk_get_deviceinfo(server, fh,
&dlist->dev_id[i]);
- if (!bdev) {
- status = -ENODEV;
+ if (IS_ERR(bdev)) {
+ status = PTR_ERR(bdev);
goto out_error;
}
spin_lock(&b_mt_id->bm_lock);
@@ -960,7 +960,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
};
static const struct rpc_pipe_ops bl_upcall_ops = {
- .upcall = bl_pipe_upcall,
+ .upcall = rpc_pipe_generic_upcall,
.downcall = bl_pipe_downcall,
.destroy_msg = bl_pipe_destroy_msg,
};
@@ -989,17 +989,20 @@ static int __init nfs4blocklayout_init(void)
mnt,
NFS_PIPE_DIRNAME, 0, &path);
if (ret)
- goto out_remove;
+ goto out_putrpc;
bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
&bl_upcall_ops, 0);
+ path_put(&path);
if (IS_ERR(bl_device_pipe)) {
ret = PTR_ERR(bl_device_pipe);
- goto out_remove;
+ goto out_putrpc;
}
out:
return ret;
+out_putrpc:
+ rpc_put_mount();
out_remove:
pnfs_unregister_layoutdriver(&blocklayout_type);
return ret;
@@ -1012,6 +1015,7 @@ static void __exit nfs4blocklayout_exit(void)
pnfs_unregister_layoutdriver(&blocklayout_type);
rpc_unlink(bl_device_pipe);
+ rpc_put_mount();
}
MODULE_ALIAS("nfs-layouttype4-3");
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index f27d827960a..42acf7ef599 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -150,7 +150,7 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
}
struct bl_dev_msg {
- int status;
+ int32_t status;
uint32_t major, minor;
};
@@ -169,8 +169,6 @@ extern wait_queue_head_t bl_wq;
#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
/* blocklayoutdev.c */
-ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
- char __user *, size_t);
ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
struct block_device *nfs4_blkdev_get(dev_t dev);
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index a83b393fb01..d08ba9107fd 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -79,28 +79,6 @@ int nfs4_blkdev_put(struct block_device *bdev)
return blkdev_put(bdev, FMODE_READ);
}
-/*
- * Shouldn't there be a rpc_generic_upcall() to do this for us?
- */
-ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
- char __user *dst, size_t buflen)
-{
- char *data = (char *)msg->data + msg->copied;
- size_t mlen = min(msg->len - msg->copied, buflen);
- unsigned long left;
-
- left = copy_to_user(dst, data, mlen);
- if (left == mlen) {
- msg->errno = -EFAULT;
- return -EFAULT;
- }
-
- mlen -= left;
- msg->copied += mlen;
- msg->errno = 0;
- return mlen;
-}
-
static struct bl_dev_msg bl_mount_reply;
ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
@@ -131,7 +109,7 @@ struct pnfs_block_dev *
nfs4_blk_decode_device(struct nfs_server *server,
struct pnfs_device *dev)
{
- struct pnfs_block_dev *rv = NULL;
+ struct pnfs_block_dev *rv;
struct block_device *bd = NULL;
struct rpc_pipe_msg msg;
struct bl_msg_hdr bl_msg = {
@@ -141,7 +119,7 @@ nfs4_blk_decode_device(struct nfs_server *server,
uint8_t *dataptr;
DECLARE_WAITQUEUE(wq, current);
struct bl_dev_msg *reply = &bl_mount_reply;
- int offset, len, i;
+ int offset, len, i, rc;
dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
@@ -168,8 +146,10 @@ nfs4_blk_decode_device(struct nfs_server *server,
dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
add_wait_queue(&bl_wq, &wq);
- if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
+ rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg);
+ if (rc < 0) {
remove_wait_queue(&bl_wq, &wq);
+ rv = ERR_PTR(rc);
goto out;
}
@@ -187,8 +167,9 @@ nfs4_blk_decode_device(struct nfs_server *server,
bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
if (IS_ERR(bd)) {
- dprintk("%s failed to open device : %ld\n",
- __func__, PTR_ERR(bd));
+ rc = PTR_ERR(bd);
+ dprintk("%s failed to open device : %d\n", __func__, rc);
+ rv = ERR_PTR(rc);
goto out;
}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index e3d29426905..516f3375e06 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -125,7 +125,7 @@ nfs4_callback_up(struct svc_serv *serv)
else
goto out_err;
- return svc_prepare_thread(serv, &serv->sv_pools[0]);
+ return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
out_err:
if (ret == 0)
@@ -199,7 +199,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
INIT_LIST_HEAD(&serv->sv_cb_list);
spin_lock_init(&serv->sv_cb_lock);
init_waitqueue_head(&serv->sv_cb_waitq);
- rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
+ rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
if (IS_ERR(rqstp)) {
svc_xprt_put(serv->sv_bc_xprt);
serv->sv_bc_xprt = NULL;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 918ad647afe..726e59a9e50 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
struct xdr_stream *xdr,
struct cb_recallanyargs *args)
{
- __be32 *p;
+ uint32_t bitmap[2];
+ __be32 *p, status;
args->craa_addr = svc_addr(rqstp);
p = read_buf(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
args->craa_objs_to_keep = ntohl(*p++);
- p = read_buf(xdr, 4);
- if (unlikely(p == NULL))
- return htonl(NFS4ERR_BADXDR);
- args->craa_type_mask = ntohl(*p);
+ status = decode_bitmap(xdr, bitmap);
+ if (unlikely(status))
+ return status;
+ args->craa_type_mask = bitmap[0];
return 0;
}
@@ -986,4 +987,5 @@ struct svc_version nfs4_callback_version4 = {
.vs_proc = nfs4_callback_procedures1,
.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
.vs_dispatch = NULL,
+ .vs_hidden = 1,
};
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 5833fbbf59b..873bf00d51a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -336,11 +336,12 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
- if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
- sin1->sin6_scope_id != sin2->sin6_scope_id)
+ if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
return 0;
+ else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ return sin1->sin6_scope_id == sin2->sin6_scope_id;
- return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
+ return 1;
}
#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
@@ -1867,6 +1868,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
/* display one transport per line on subsequent lines */
clp = list_entry(v, struct nfs_client, cl_share_link);
+ /* Check if the client is initialized */
+ if (clp->cl_cons_state != NFS_CS_READY)
+ return 0;
+
seq_printf(m, "v%u %s %s %3d %s\n",
clp->rpc_ops->version,
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 321a66bc384..7f265406980 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -240,7 +240,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
sizeof(delegation->stateid.data));
delegation->type = res->delegation_type;
delegation->maxsize = res->maxsize;
- delegation->change_attr = nfsi->change_attr;
+ delegation->change_attr = inode->i_version;
delegation->cred = get_rpccred(cred);
delegation->inode = inode;
delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b238d95ac48..ac289909814 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1468,12 +1468,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
res = NULL;
goto out;
/* This turned out not to be a regular file */
+ case -EISDIR:
case -ENOTDIR:
goto no_open;
case -ELOOP:
if (!(nd->intent.open.flags & O_NOFOLLOW))
goto no_open;
- /* case -EISDIR: */
/* case -EINVAL: */
default:
res = ERR_CAST(inode);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 28b8c3f3cda..606ef0f20ae 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -40,48 +40,8 @@
#define NFSDBG_FACILITY NFSDBG_FILE
-static int nfs_file_open(struct inode *, struct file *);
-static int nfs_file_release(struct inode *, struct file *);
-static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin);
-static int nfs_file_mmap(struct file *, struct vm_area_struct *);
-static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t count, unsigned int flags);
-static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
-static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
- struct file *filp, loff_t *ppos,
- size_t count, unsigned int flags);
-static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
-static int nfs_file_flush(struct file *, fl_owner_t id);
-static int nfs_file_fsync(struct file *, loff_t, loff_t, int datasync);
-static int nfs_check_flags(int flags);
-static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
-static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
-static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
-
static const struct vm_operations_struct nfs_file_vm_ops;
-const struct file_operations nfs_file_operations = {
- .llseek = nfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = nfs_file_read,
- .aio_write = nfs_file_write,
- .mmap = nfs_file_mmap,
- .open = nfs_file_open,
- .flush = nfs_file_flush,
- .release = nfs_file_release,
- .fsync = nfs_file_fsync,
- .lock = nfs_lock,
- .flock = nfs_flock,
- .splice_read = nfs_file_splice_read,
- .splice_write = nfs_file_splice_write,
- .check_flags = nfs_check_flags,
- .setlease = nfs_setlease,
-};
-
const struct inode_operations nfs_file_inode_operations = {
.permission = nfs_permission,
.getattr = nfs_getattr,
@@ -137,11 +97,9 @@ nfs_file_open(struct inode *inode, struct file *filp)
static int
nfs_file_release(struct inode *inode, struct file *filp)
{
- struct dentry *dentry = filp->f_path.dentry;
-
dprintk("NFS: release(%s/%s)\n",
- dentry->d_parent->d_name.name,
- dentry->d_name.name);
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name);
nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
return nfs_release(inode, filp);
@@ -180,8 +138,6 @@ force_reval:
static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
{
- loff_t loff;
-
dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
filp->f_path.dentry->d_parent->d_name.name,
filp->f_path.dentry->d_name.name,
@@ -191,19 +147,15 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
* origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
* the cached file length
*/
- if (origin != SEEK_SET || origin != SEEK_CUR) {
+ if (origin != SEEK_SET && origin != SEEK_CUR) {
struct inode *inode = filp->f_mapping->host;
int retval = nfs_revalidate_file_size(inode, filp);
if (retval < 0)
return (loff_t)retval;
+ }
- spin_lock(&inode->i_lock);
- loff = generic_file_llseek_unlocked(filp, offset, origin);
- spin_unlock(&inode->i_lock);
- } else
- loff = generic_file_llseek_unlocked(filp, offset, origin);
- return loff;
+ return generic_file_llseek(filp, offset, origin);
}
/*
@@ -234,14 +186,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
struct dentry * dentry = iocb->ki_filp->f_path.dentry;
struct inode * inode = dentry->d_inode;
ssize_t result;
- size_t count = iov_length(iov, nr_segs);
if (iocb->ki_filp->f_flags & O_DIRECT)
return nfs_file_direct_read(iocb, iov, nr_segs, pos);
dprintk("NFS: read(%s/%s, %lu@%lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- (unsigned long) count, (unsigned long) pos);
+ (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
if (!result) {
@@ -895,3 +846,54 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
file->f_path.dentry->d_name.name, arg);
return -EINVAL;
}
+
+const struct file_operations nfs_file_operations = {
+ .llseek = nfs_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = nfs_file_read,
+ .aio_write = nfs_file_write,
+ .mmap = nfs_file_mmap,
+ .open = nfs_file_open,
+ .flush = nfs_file_flush,
+ .release = nfs_file_release,
+ .fsync = nfs_file_fsync,
+ .lock = nfs_lock,
+ .flock = nfs_flock,
+ .splice_read = nfs_file_splice_read,
+ .splice_write = nfs_file_splice_write,
+ .check_flags = nfs_check_flags,
+ .setlease = nfs_setlease,
+};
+
+#ifdef CONFIG_NFS_V4
+static int
+nfs4_file_open(struct inode *inode, struct file *filp)
+{
+ /*
+ * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to
+ * this point, then something is very wrong
+ */
+ dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp);
+ return -ENOTDIR;
+}
+
+const struct file_operations nfs4_file_operations = {
+ .llseek = nfs_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = nfs_file_read,
+ .aio_write = nfs_file_write,
+ .mmap = nfs_file_mmap,
+ .open = nfs4_file_open,
+ .flush = nfs_file_flush,
+ .release = nfs_file_release,
+ .fsync = nfs_file_fsync,
+ .lock = nfs_lock,
+ .flock = nfs_flock,
+ .splice_read = nfs_file_splice_read,
+ .splice_write = nfs_file_splice_write,
+ .check_flags = nfs_check_flags,
+ .setlease = nfs_setlease,
+};
+#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 5b1006480bc..7cf2c4699b0 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -212,7 +212,7 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
auxdata.ctime = nfsi->vfs_inode.i_ctime;
if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
- auxdata.change_attr = nfsi->change_attr;
+ auxdata.change_attr = nfsi->vfs_inode.i_version;
if (bufmax > sizeof(auxdata))
bufmax = sizeof(auxdata);
@@ -244,7 +244,7 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
auxdata.ctime = nfsi->vfs_inode.i_ctime;
if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
- auxdata.change_attr = nfsi->change_attr;
+ auxdata.change_attr = nfsi->vfs_inode.i_version;
if (memcmp(data, &auxdata, datalen) != 0)
return FSCACHE_CHECKAUX_OBSOLETE;
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index f20801ae0a1..47d1c6ff2d8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -336,8 +336,6 @@ struct idmap {
struct idmap_hashtable idmap_group_hash;
};
-static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *,
- char __user *, size_t);
static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
size_t);
static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
@@ -345,7 +343,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
static unsigned int fnvhash32(const void *, size_t);
static const struct rpc_pipe_ops idmap_upcall_ops = {
- .upcall = idmap_pipe_upcall,
+ .upcall = rpc_pipe_generic_upcall,
.downcall = idmap_pipe_downcall,
.destroy_msg = idmap_pipe_destroy_msg,
};
@@ -595,27 +593,6 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
return ret;
}
-/* RPC pipefs upcall/downcall routines */
-static ssize_t
-idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
- char __user *dst, size_t buflen)
-{
- char *data = (char *)msg->data + msg->copied;
- size_t mlen = min(msg->len, buflen);
- unsigned long left;
-
- left = copy_to_user(dst, data, mlen);
- if (left == mlen) {
- msg->errno = -EFAULT;
- return -EFAULT;
- }
-
- mlen -= left;
- msg->copied += mlen;
- msg->errno = 0;
- return mlen;
-}
-
static ssize_t
idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
{
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index fe1203797b2..50a15fa8cf9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
*/
inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;
if (S_ISREG(inode->i_mode)) {
- inode->i_fop = &nfs_file_operations;
+ inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
inode->i_data.a_ops = &nfs_file_aops;
inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
} else if (S_ISDIR(inode->i_mode)) {
@@ -318,9 +318,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
memset(&inode->i_atime, 0, sizeof(inode->i_atime));
memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
- nfsi->change_attr = 0;
+ inode->i_version = 0;
inode->i_size = 0;
- inode->i_nlink = 0;
+ clear_nlink(inode);
inode->i_uid = -2;
inode->i_gid = -2;
inode->i_blocks = 0;
@@ -344,7 +344,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL;
if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
- nfsi->change_attr = fattr->change_attr;
+ inode->i_version = fattr->change_attr;
else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
nfsi->cache_validity |= NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_DATA;
@@ -355,7 +355,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
| NFS_INO_INVALID_DATA
| NFS_INO_REVAL_PAGECACHE;
if (fattr->valid & NFS_ATTR_FATTR_NLINK)
- inode->i_nlink = fattr->nlink;
+ set_nlink(inode, fattr->nlink);
else if (nfs_server_capable(inode, NFS_CAP_NLINK))
nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
if (fattr->valid & NFS_ATTR_FATTR_OWNER)
@@ -897,8 +897,8 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
&& (fattr->valid & NFS_ATTR_FATTR_CHANGE)
- && nfsi->change_attr == fattr->pre_change_attr) {
- nfsi->change_attr = fattr->change_attr;
+ && inode->i_version == fattr->pre_change_attr) {
+ inode->i_version = fattr->change_attr;
if (S_ISDIR(inode->i_mode))
nfsi->cache_validity |= NFS_INO_INVALID_DATA;
ret |= NFS_INO_INVALID_ATTR;
@@ -952,7 +952,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
return -EIO;
if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
- nfsi->change_attr != fattr->change_attr)
+ inode->i_version != fattr->change_attr)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
/* Verify a few of the more important attributes */
@@ -1163,7 +1163,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
}
if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
(fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
- fattr->pre_change_attr = NFS_I(inode)->change_attr;
+ fattr->pre_change_attr = inode->i_version;
fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
}
if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
@@ -1244,13 +1244,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* More cache consistency checks */
if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
- if (nfsi->change_attr != fattr->change_attr) {
+ if (inode->i_version != fattr->change_attr) {
dprintk("NFS: change_attr change on server for file %s/%ld\n",
inode->i_sb->s_id, inode->i_ino);
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
if (S_ISDIR(inode->i_mode))
nfs_force_lookup_revalidate(inode);
- nfsi->change_attr = fattr->change_attr;
+ inode->i_version = fattr->change_attr;
}
} else if (server->caps & NFS_CAP_CHANGE_ATTR)
invalid |= save_cache_validity;
@@ -1361,7 +1361,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_ATTR;
if (S_ISDIR(inode->i_mode))
invalid |= NFS_INO_INVALID_DATA;
- inode->i_nlink = fattr->nlink;
+ set_nlink(inode, fattr->nlink);
}
} else if (server->caps & NFS_CAP_NLINK)
invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ab12913dd47..3f4d95751d5 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -299,6 +299,8 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
struct list_head *head);
+extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode);
extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
extern void nfs_readdata_release(struct nfs_read_data *rdata);
@@ -457,13 +459,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
PAGE_SIZE - 1) >> PAGE_SHIFT;
}
-/*
- * Helper for restarting RPC calls in the possible presence of NFSv4.1
- * sessions.
- */
-static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
-{
- if (nfs4_has_session(clp))
- return rpc_restart_call_prepare(task);
- return rpc_restart_call(task);
-}
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 85f1690ca08..d4bc9ed9174 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -853,6 +853,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.dentry_ops = &nfs_dentry_operations,
.dir_inode_ops = &nfs3_dir_inode_operations,
.file_inode_ops = &nfs3_file_inode_operations,
+ .file_ops = &nfs_file_operations,
.getroot = nfs3_proc_get_root,
.getattr = nfs3_proc_getattr,
.setattr = nfs3_proc_setattr,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 3e93e9a1bee..693ae22f873 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -13,30 +13,6 @@
struct idmap;
-/*
- * In a seqid-mutating op, this macro controls which error return
- * values trigger incrementation of the seqid.
- *
- * from rfc 3010:
- * The client MUST monotonically increment the sequence number for the
- * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE
- * operations. This is true even in the event that the previous
- * operation that used the sequence number received an error. The only
- * exception to this rule is if the previous operation received one of
- * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID,
- * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR,
- * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE.
- *
- */
-#define seqid_mutating_err(err) \
-(((err) != NFSERR_STALE_CLIENTID) && \
- ((err) != NFSERR_STALE_STATEID) && \
- ((err) != NFSERR_BAD_STATEID) && \
- ((err) != NFSERR_BAD_SEQID) && \
- ((err) != NFSERR_BAD_XDR) && \
- ((err) != NFSERR_RESOURCE) && \
- ((err) != NFSERR_NOFILEHANDLE))
-
enum nfs4_client_state {
NFS4CLNT_MANAGER_RUNNING = 0,
NFS4CLNT_CHECK_LEASE,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index e8915d4840a..a62d36b9a99 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -31,6 +31,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
+#include <linux/module.h>
#include "internal.h"
#include "nfs4filelayout.h"
@@ -77,19 +78,6 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
BUG();
}
-/* For data server errors we don't recover from */
-static void
-filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
-{
- if (lseg->pls_range.iomode == IOMODE_RW) {
- dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
- set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
- } else {
- dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
- set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
- }
-}
-
static int filelayout_async_handle_error(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
@@ -135,7 +123,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
static int filelayout_read_done_cb(struct rpc_task *task,
struct nfs_read_data *data)
{
- struct nfs_client *clp = data->ds_clp;
int reset = 0;
dprintk("%s DS read\n", __func__);
@@ -145,11 +132,10 @@ static int filelayout_read_done_cb(struct rpc_task *task,
dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
__func__, data->ds_clp, data->ds_clp->cl_session);
if (reset) {
- filelayout_set_lo_fail(data->lseg);
+ pnfs_set_lo_fail(data->lseg);
nfs4_reset_read(task, data);
- clp = NFS_SERVER(data->inode)->nfs_client;
}
- nfs_restart_rpc(task, clp);
+ rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -216,17 +202,13 @@ static int filelayout_write_done_cb(struct rpc_task *task,
if (filelayout_async_handle_error(task, data->args.context->state,
data->ds_clp, &reset) == -EAGAIN) {
- struct nfs_client *clp;
-
dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
__func__, data->ds_clp, data->ds_clp->cl_session);
if (reset) {
- filelayout_set_lo_fail(data->lseg);
+ pnfs_set_lo_fail(data->lseg);
nfs4_reset_write(task, data);
- clp = NFS_SERVER(data->inode)->nfs_client;
- } else
- clp = data->ds_clp;
- nfs_restart_rpc(task, clp);
+ }
+ rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -256,9 +238,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
__func__, data->ds_clp, data->ds_clp->cl_session);
if (reset) {
prepare_to_resend_writes(data);
- filelayout_set_lo_fail(data->lseg);
+ pnfs_set_lo_fail(data->lseg);
} else
- nfs_restart_rpc(task, data->ds_clp);
+ rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -468,9 +450,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
fl->dsaddr = dsaddr;
- if (fl->first_stripe_index < 0 ||
- fl->first_stripe_index >= dsaddr->stripe_count) {
- dprintk("%s Bad first_stripe_index %d\n",
+ if (fl->first_stripe_index >= dsaddr->stripe_count) {
+ dprintk("%s Bad first_stripe_index %u\n",
__func__, fl->first_stripe_index);
goto out_put;
}
@@ -571,7 +552,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
/* Note that a zero value for num_fh is legal for STRIPE_SPARSE.
* Futher checking is done in filelayout_check_layout */
- if (fl->num_fh < 0 || fl->num_fh >
+ if (fl->num_fh >
max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT))
goto out_err;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4700fae1ada..d9f4d78c341 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -39,6 +39,8 @@
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/string.h>
+#include <linux/ratelimit.h>
+#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/gss_api.h>
@@ -73,9 +75,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data);
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
-static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir,
- const struct qstr *name, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr);
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
@@ -753,9 +752,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
spin_lock(&dir->i_lock);
nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
- if (!cinfo->atomic || cinfo->before != nfsi->change_attr)
+ if (!cinfo->atomic || cinfo->before != dir->i_version)
nfs_force_lookup_revalidate(dir);
- nfsi->change_attr = cinfo->after;
+ dir->i_version = cinfo->after;
spin_unlock(&dir->i_lock);
}
@@ -897,6 +896,8 @@ out:
static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
{
+ if (delegation == NULL)
+ return 0;
if ((delegation->type & fmode) != fmode)
return 0;
if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
@@ -1039,8 +1040,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
}
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
- if (delegation == NULL ||
- !can_open_delegated(delegation, fmode)) {
+ if (!can_open_delegated(delegation, fmode)) {
rcu_read_unlock();
break;
}
@@ -1094,7 +1094,12 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
if (delegation)
delegation_flags = delegation->flags;
rcu_read_unlock();
- if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
+ if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) {
+ pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
+ "returning a delegation for "
+ "OPEN(CLAIM_DELEGATE_CUR)\n",
+ NFS_CLIENT(inode)->cl_server);
+ } else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
nfs_inode_set_delegation(state->inode,
data->owner->so_cred,
&data->o_res);
@@ -1426,11 +1431,9 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
goto out_no_action;
rcu_read_lock();
delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
- if (delegation != NULL &&
- test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) {
- rcu_read_unlock();
- goto out_no_action;
- }
+ if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR &&
+ can_open_delegated(delegation, data->o_arg.fmode))
+ goto unlock_no_action;
rcu_read_unlock();
}
/* Update sequence id. */
@@ -1447,6 +1450,8 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
return;
rpc_call_start(task);
return;
+unlock_no_action:
+ rcu_read_unlock();
out_no_action:
task->tk_action = NULL;
@@ -1596,8 +1601,14 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
int status;
status = nfs4_run_open_task(data, 0);
- if (status != 0 || !data->rpc_done)
+ if (!data->rpc_done)
+ return status;
+ if (status != 0) {
+ if (status == -NFS4ERR_BADNAME &&
+ !(o_arg->open_flags & O_CREAT))
+ return -ENOENT;
return status;
+ }
if (o_arg->open_flags & O_CREAT) {
update_changeattr(dir, &o_res->cinfo);
@@ -2408,14 +2419,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
return status;
}
-static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server,
- const struct nfs_fh *dirfh, const struct qstr *name,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
+ const struct qstr *name, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr)
{
+ struct nfs_server *server = NFS_SERVER(dir);
int status;
struct nfs4_lookup_arg args = {
.bitmask = server->attr_bitmask,
- .dir_fh = dirfh,
+ .dir_fh = NFS_FH(dir),
.name = name,
};
struct nfs4_lookup_res res = {
@@ -2431,40 +2443,8 @@ static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server,
nfs_fattr_init(fattr);
- dprintk("NFS call lookupfh %s\n", name->name);
- status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0);
- dprintk("NFS reply lookupfh: %d\n", status);
- return status;
-}
-
-static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
- struct qstr *name, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
-{
- struct nfs4_exception exception = { };
- int err;
- do {
- err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr);
- /* FIXME: !!!! */
- if (err == -NFS4ERR_MOVED) {
- err = -EREMOTE;
- break;
- }
- err = nfs4_handle_exception(server, err, &exception);
- } while (exception.retry);
- return err;
-}
-
-static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
- const struct qstr *name, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
-{
- int status;
-
dprintk("NFS call lookup %s\n", name->name);
- status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr);
- if (status == -NFS4ERR_MOVED)
- status = nfs4_get_referral(dir, name, fattr, fhandle);
+ status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0);
dprintk("NFS reply lookup: %d\n", status);
return status;
}
@@ -2485,11 +2465,19 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst
struct nfs4_exception exception = { };
int err;
do {
- err = nfs4_handle_exception(NFS_SERVER(dir),
- _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr),
- &exception);
- if (err == -EPERM)
+ int status;
+
+ status = _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr);
+ switch (status) {
+ case -NFS4ERR_BADNAME:
+ return -ENOENT;
+ case -NFS4ERR_MOVED:
+ return nfs4_get_referral(dir, name, fattr, fhandle);
+ case -NFS4ERR_WRONGSEC:
nfs_fixup_secinfo_attributes(fattr, fhandle);
+ }
+ err = nfs4_handle_exception(NFS_SERVER(dir),
+ status, &exception);
} while (exception.retry);
return err;
}
@@ -3210,7 +3198,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
struct nfs_server *server = NFS_SERVER(data->inode);
if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
- nfs_restart_rpc(task, server->nfs_client);
+ rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -3260,7 +3248,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data
struct inode *inode = data->inode;
if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+ rpc_restart_call_prepare(task);
return -EAGAIN;
}
if (task->tk_status >= 0) {
@@ -3317,7 +3305,7 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *dat
struct inode *inode = data->inode;
if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+ rpc_restart_call_prepare(task);
return -EAGAIN;
}
nfs_refresh_inode(inode, data->res.fattr);
@@ -3857,7 +3845,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
default:
if (nfs4_async_handle_error(task, data->res.server, NULL) ==
-EAGAIN) {
- nfs_restart_rpc(task, data->res.server->nfs_client);
+ rpc_restart_call_prepare(task);
return;
}
}
@@ -4111,8 +4099,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
break;
default:
if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
- nfs_restart_rpc(task,
- calldata->server->nfs_client);
+ rpc_restart_call_prepare(task);
}
}
@@ -4945,7 +4932,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
task->tk_status = 0;
/* fall through */
case -NFS4ERR_RETRY_UNCACHED_REP:
- nfs_restart_rpc(task, data->clp);
+ rpc_restart_call_prepare(task);
return;
}
dprintk("<-- %s\n", __func__);
@@ -5786,7 +5773,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
server = NFS_SERVER(lrp->args.inode);
if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
- nfs_restart_rpc(task, lrp->clp);
+ rpc_restart_call_prepare(task);
return;
}
spin_lock(&lo->plh_inode->i_lock);
@@ -5957,7 +5944,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
}
if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
- nfs_restart_rpc(task, server->nfs_client);
+ rpc_restart_call_prepare(task);
return;
}
@@ -5970,6 +5957,7 @@ static void nfs4_layoutcommit_release(void *calldata)
{
struct nfs4_layoutcommit_data *data = calldata;
struct pnfs_layout_segment *lseg, *tmp;
+ unsigned long *bitlock = &NFS_I(data->args.inode)->flags;
pnfs_cleanup_layoutcommit(data);
/* Matched by references in pnfs_set_layoutcommit */
@@ -5979,6 +5967,11 @@ static void nfs4_layoutcommit_release(void *calldata)
&lseg->pls_flags))
put_lseg(lseg);
}
+
+ clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
+ smp_mb__after_clear_bit();
+ wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
+
put_rpccred(data->cred);
kfree(data);
}
@@ -6267,10 +6260,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.dentry_ops = &nfs4_dentry_operations,
.dir_inode_ops = &nfs4_dir_inode_operations,
.file_inode_ops = &nfs4_file_inode_operations,
+ .file_ops = &nfs4_file_operations,
.getroot = nfs4_proc_get_root,
.getattr = nfs4_proc_getattr,
.setattr = nfs4_proc_setattr,
- .lookupfh = nfs4_proc_lookupfh,
.lookup = nfs4_proc_lookup,
.access = nfs4_proc_access,
.readlink = nfs4_proc_readlink,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 39914be40b0..6a7107ae6b7 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1156,11 +1156,13 @@ restart:
if (status >= 0) {
status = nfs4_reclaim_locks(state, ops);
if (status >= 0) {
+ spin_lock(&state->state_lock);
list_for_each_entry(lock, &state->lock_states, ls_locks) {
if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
printk("%s: Lock reclaim failed!\n",
__func__);
}
+ spin_unlock(&state->state_lock);
nfs4_put_open_state(state);
goto restart;
}
@@ -1224,10 +1226,12 @@ static void nfs4_clear_open_state(struct nfs4_state *state)
clear_bit(NFS_O_RDONLY_STATE, &state->flags);
clear_bit(NFS_O_WRONLY_STATE, &state->flags);
clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ spin_lock(&state->state_lock);
list_for_each_entry(lock, &state->lock_states, ls_locks) {
lock->ls_seqid.flags = 0;
lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
}
+ spin_unlock(&state->state_lock);
}
static void nfs4_reset_seqids(struct nfs_server *server,
@@ -1350,12 +1354,14 @@ static void nfs4_warn_keyexpired(const char *s)
static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
{
switch (error) {
+ case 0:
+ break;
case -NFS4ERR_CB_PATH_DOWN:
nfs_handle_cb_pathdown(clp);
- return 0;
+ break;
case -NFS4ERR_NO_GRACE:
nfs4_state_end_reclaim_reboot(clp);
- return 0;
+ break;
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_LEASE_MOVED:
set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
@@ -1375,13 +1381,15 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
case -NFS4ERR_SEQ_MISORDERED:
set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
/* Zero session reset errors */
- return 0;
+ break;
case -EKEYEXPIRED:
/* Nothing we can do */
nfs4_warn_keyexpired(clp->cl_hostname);
- return 0;
+ break;
+ default:
+ return error;
}
- return error;
+ return 0;
}
static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
@@ -1428,7 +1436,7 @@ static int nfs4_check_lease(struct nfs_client *clp)
struct rpc_cred *cred;
const struct nfs4_state_maintenance_ops *ops =
clp->cl_mvops->state_renewal_ops;
- int status = -NFS4ERR_EXPIRED;
+ int status;
/* Is the client already known to have an expired lease? */
if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
@@ -1438,6 +1446,7 @@ static int nfs4_check_lease(struct nfs_client *clp)
spin_unlock(&clp->cl_lock);
if (cred == NULL) {
cred = nfs4_get_setclientid_cred(clp);
+ status = -ENOKEY;
if (cred == NULL)
goto out;
}
@@ -1525,16 +1534,16 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
{
if (!flags)
return;
- else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
+ if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
nfs41_handle_server_reboot(clp);
- else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
+ if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
SEQ4_STATUS_ADMIN_STATE_REVOKED |
SEQ4_STATUS_LEASE_MOVED))
nfs41_handle_state_revoked(clp);
- else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
+ if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
nfs41_handle_recallable_state_revoked(clp);
- else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
+ if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
SEQ4_STATUS_BACKCHANNEL_FAULT |
SEQ4_STATUS_CB_PATH_DOWN_SESSION))
nfs41_handle_cb_path_down(clp);
@@ -1662,10 +1671,10 @@ static void nfs4_state_manager(struct nfs_client *clp)
if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
status = nfs4_check_lease(clp);
+ if (status < 0)
+ goto out_error;
if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
continue;
- if (status < 0 && status != -NFS4ERR_CB_PATH_DOWN)
- goto out_error;
}
/* Initialize or reset the session */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1dce12f41a4..e6161b213ed 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6602,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp,
if (status)
goto out;
status = decode_secinfo(xdr, res);
- if (status)
- goto out;
out:
return status;
}
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index d0cda12fddc..c807ab93140 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -38,21 +38,15 @@
*/
#include <linux/module.h>
-#include <scsi/osd_initiator.h>
+#include <scsi/osd_ore.h>
#include "objlayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-#define _LLU(x) ((unsigned long long)x)
-
-enum { BIO_MAX_PAGES_KMALLOC =
- (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
-};
-
struct objio_dev_ent {
struct nfs4_deviceid_node id_node;
- struct osd_dev *od;
+ struct ore_dev od;
};
static void
@@ -60,8 +54,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
{
struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
- dprintk("%s: free od=%p\n", __func__, de->od);
- osduld_put_device(de->od);
+ dprintk("%s: free od=%p\n", __func__, de->od.od);
+ osduld_put_device(de->od.od);
kfree(de);
}
@@ -98,12 +92,12 @@ _dev_list_add(const struct nfs_server *nfss,
nfss->pnfs_curr_ld,
nfss->nfs_client,
d_id);
- de->od = od;
+ de->od.od = od;
d = nfs4_insert_deviceid_node(&de->id_node);
n = container_of(d, struct objio_dev_ent, id_node);
if (n != de) {
- dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
+ dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
objio_free_deviceid_node(&de->id_node);
de = n;
}
@@ -111,28 +105,11 @@ _dev_list_add(const struct nfs_server *nfss,
return de;
}
-struct caps_buffers {
- u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
- u8 creds[OSD_CAP_LEN];
-};
-
struct objio_segment {
struct pnfs_layout_segment lseg;
- struct pnfs_osd_object_cred *comps;
-
- unsigned mirrors_p1;
- unsigned stripe_unit;
- unsigned group_width; /* Data stripe_units without integrity comps */
- u64 group_depth;
- unsigned group_count;
-
- unsigned max_io_size;
-
- unsigned comps_index;
- unsigned num_comps;
- /* variable length */
- struct objio_dev_ent *ods[];
+ struct ore_layout layout;
+ struct ore_components oc;
};
static inline struct objio_segment *
@@ -141,59 +118,44 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
return container_of(lseg, struct objio_segment, lseg);
}
-struct objio_state;
-typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
-
struct objio_state {
/* Generic layer */
- struct objlayout_io_state ol_state;
-
- struct objio_segment *layout;
-
- struct kref kref;
- objio_done_fn done;
- void *private;
-
- unsigned long length;
- unsigned numdevs; /* Actually used devs in this IO */
- /* A per-device variable array of size numdevs */
- struct _objio_per_comp {
- struct bio *bio;
- struct osd_request *or;
- unsigned long length;
- u64 offset;
- unsigned dev;
- } per_dev[];
+ struct objlayout_io_res oir;
+
+ bool sync;
+ /*FIXME: Support for extra_bytes at ore_get_rw_state() */
+ struct ore_io_state *ios;
};
/* Send and wait for a get_device_info of devices in the layout,
then look them up with the osd_initiator library */
-static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
- struct objio_segment *objio_seg, unsigned comp,
- gfp_t gfp_flags)
+static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
+ struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
+ gfp_t gfp_flags)
{
struct pnfs_osd_deviceaddr *deviceaddr;
- struct nfs4_deviceid *d_id;
struct objio_dev_ent *ode;
struct osd_dev *od;
struct osd_dev_info odi;
int err;
- d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
-
ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
- if (ode)
- return ode;
+ if (ode) {
+ objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
+ return 0;
+ }
err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
if (unlikely(err)) {
dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
__func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
- return ERR_PTR(err);
+ return err;
}
odi.systemid_len = deviceaddr->oda_systemid.len;
if (odi.systemid_len > sizeof(odi.systemid)) {
+ dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
+ __func__, sizeof(odi.systemid));
err = -EINVAL;
goto out;
} else if (odi.systemid_len)
@@ -218,96 +180,53 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
gfp_flags);
-
+ objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
+ dprintk("Adding new dev_id(%llx:%llx)\n",
+ _DEVID_LO(d_id), _DEVID_HI(d_id));
out:
- dprintk("%s: return=%d\n", __func__, err);
objlayout_put_deviceinfo(deviceaddr);
- return err ? ERR_PTR(err) : ode;
+ return err;
}
-static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
- struct objio_segment *objio_seg,
- gfp_t gfp_flags)
+static void copy_single_comp(struct ore_components *oc, unsigned c,
+ struct pnfs_osd_object_cred *src_comp)
{
- unsigned i;
- int err;
+ struct ore_comp *ocomp = &oc->comps[c];
- /* lookup all devices */
- for (i = 0; i < objio_seg->num_comps; i++) {
- struct objio_dev_ent *ode;
+ WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
+ WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
- ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
- if (unlikely(IS_ERR(ode))) {
- err = PTR_ERR(ode);
- goto out;
- }
- objio_seg->ods[i] = ode;
- }
- err = 0;
+ ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
+ ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
-out:
- dprintk("%s: return=%d\n", __func__, err);
- return err;
+ memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
}
-static int _verify_data_map(struct pnfs_osd_layout *layout)
+int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
+ struct objio_segment **pseg)
{
- struct pnfs_osd_data_map *data_map = &layout->olo_map;
- u64 stripe_length;
- u32 group_width;
-
-/* FIXME: Only raid0 for now. if not go through MDS */
- if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
- printk(KERN_ERR "Only RAID_0 for now\n");
- return -ENOTSUPP;
- }
- if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
- printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
- data_map->odm_num_comps, data_map->odm_mirror_cnt);
- return -EINVAL;
- }
+ struct __alloc_objio_segment {
+ struct objio_segment olseg;
+ struct ore_dev *ods[numdevs];
+ struct ore_comp comps[numdevs];
+ } *aolseg;
- if (data_map->odm_group_width)
- group_width = data_map->odm_group_width;
- else
- group_width = data_map->odm_num_comps /
- (data_map->odm_mirror_cnt + 1);
-
- stripe_length = (u64)data_map->odm_stripe_unit * group_width;
- if (stripe_length >= (1ULL << 32)) {
- printk(KERN_ERR "Total Stripe length(0x%llx)"
- " >= 32bit is not supported\n", _LLU(stripe_length));
- return -ENOTSUPP;
+ aolseg = kzalloc(sizeof(*aolseg), gfp_flags);
+ if (unlikely(!aolseg)) {
+ dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__,
+ numdevs, sizeof(*aolseg));
+ return -ENOMEM;
}
- if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
- printk(KERN_ERR "Stripe Unit(0x%llx)"
- " must be Multples of PAGE_SIZE(0x%lx)\n",
- _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
- return -ENOTSUPP;
- }
+ aolseg->olseg.oc.numdevs = numdevs;
+ aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS;
+ aolseg->olseg.oc.comps = aolseg->comps;
+ aolseg->olseg.oc.ods = aolseg->ods;
+ *pseg = &aolseg->olseg;
return 0;
}
-static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
- struct pnfs_osd_object_cred *src_comp,
- struct caps_buffers *caps_p)
-{
- WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
- WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
-
- *cur_comp = *src_comp;
-
- memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
- sizeof(caps_p->caps_key));
- cur_comp->oc_cap_key.cred = caps_p->caps_key;
-
- memcpy(caps_p->creds, src_comp->oc_cap.cred,
- sizeof(caps_p->creds));
- cur_comp->oc_cap.cred = caps_p->creds;
-}
-
int objio_alloc_lseg(struct pnfs_layout_segment **outp,
struct pnfs_layout_hdr *pnfslay,
struct pnfs_layout_range *range,
@@ -317,59 +236,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
struct objio_segment *objio_seg;
struct pnfs_osd_xdr_decode_layout_iter iter;
struct pnfs_osd_layout layout;
- struct pnfs_osd_object_cred *cur_comp, src_comp;
- struct caps_buffers *caps_p;
+ struct pnfs_osd_object_cred src_comp;
+ unsigned cur_comp;
int err;
err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
if (unlikely(err))
return err;
- err = _verify_data_map(&layout);
+ err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
if (unlikely(err))
return err;
- objio_seg = kzalloc(sizeof(*objio_seg) +
- sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
- sizeof(*objio_seg->comps) * layout.olo_num_comps +
- sizeof(struct caps_buffers) * layout.olo_num_comps,
- gfp_flags);
- if (!objio_seg)
- return -ENOMEM;
+ objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
+ objio_seg->layout.group_width = layout.olo_map.odm_group_width;
+ objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
+ objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
+ objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
- objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
- cur_comp = objio_seg->comps;
- caps_p = (void *)(cur_comp + layout.olo_num_comps);
- while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
- copy_single_comp(cur_comp++, &src_comp, caps_p++);
+ err = ore_verify_layout(layout.olo_map.odm_num_comps,
+ &objio_seg->layout);
if (unlikely(err))
goto err;
- objio_seg->num_comps = layout.olo_num_comps;
- objio_seg->comps_index = layout.olo_comps_index;
- err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
- if (err)
- goto err;
-
- objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
- objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
- if (layout.olo_map.odm_group_width) {
- objio_seg->group_width = layout.olo_map.odm_group_width;
- objio_seg->group_depth = layout.olo_map.odm_group_depth;
- objio_seg->group_count = layout.olo_map.odm_num_comps /
- objio_seg->mirrors_p1 /
- objio_seg->group_width;
- } else {
- objio_seg->group_width = layout.olo_map.odm_num_comps /
- objio_seg->mirrors_p1;
- objio_seg->group_depth = -1;
- objio_seg->group_count = 1;
+ objio_seg->oc.first_dev = layout.olo_comps_index;
+ cur_comp = 0;
+ while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
+ copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
+ err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
+ &src_comp.oc_object_id.oid_device_id,
+ gfp_flags);
+ if (err)
+ goto err;
+ ++cur_comp;
}
-
- /* Cache this calculation it will hit for every page */
- objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
- objio_seg->stripe_unit) *
- objio_seg->group_width;
+ /* pnfs_osd_xdr_decode_layout_comp returns false on error */
+ if (unlikely(err))
+ goto err;
*outp = &objio_seg->lseg;
return 0;
@@ -386,43 +289,63 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
int i;
struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
- for (i = 0; i < objio_seg->num_comps; i++) {
- if (!objio_seg->ods[i])
+ for (i = 0; i < objio_seg->oc.numdevs; i++) {
+ struct ore_dev *od = objio_seg->oc.ods[i];
+ struct objio_dev_ent *ode;
+
+ if (!od)
break;
- nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
+ ode = container_of(od, typeof(*ode), od);
+ nfs4_put_deviceid_node(&ode->id_node);
}
kfree(objio_seg);
}
-int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
- struct objlayout_io_state **outp,
- gfp_t gfp_flags)
+static int
+objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
+ struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
+ loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
+ struct objio_state **outp)
{
struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
- struct objio_state *ios;
- const unsigned first_size = sizeof(*ios) +
- objio_seg->num_comps * sizeof(ios->per_dev[0]);
- const unsigned sec_size = objio_seg->num_comps *
- sizeof(ios->ol_state.ioerrs[0]);
-
- ios = kzalloc(first_size + sec_size, gfp_flags);
- if (unlikely(!ios))
+ struct ore_io_state *ios;
+ int ret;
+ struct __alloc_objio_state {
+ struct objio_state objios;
+ struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
+ } *aos;
+
+ aos = kzalloc(sizeof(*aos), gfp_flags);
+ if (unlikely(!aos))
return -ENOMEM;
- ios->layout = objio_seg;
- ios->ol_state.ioerrs = ((void *)ios) + first_size;
- ios->ol_state.num_comps = objio_seg->num_comps;
+ objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
+ aos->ioerrs, rpcdata, pnfs_layout_type);
- *outp = &ios->ol_state;
+ ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
+ offset, count, &ios);
+ if (unlikely(ret)) {
+ kfree(aos);
+ return ret;
+ }
+
+ ios->pages = pages;
+ ios->pgbase = pgbase;
+ ios->private = aos;
+ BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
+
+ aos->objios.sync = 0;
+ aos->objios.ios = ios;
+ *outp = &aos->objios;
return 0;
}
-void objio_free_io_state(struct objlayout_io_state *ol_state)
+void objio_free_result(struct objlayout_io_res *oir)
{
- struct objio_state *ios = container_of(ol_state, struct objio_state,
- ol_state);
+ struct objio_state *objios = container_of(oir, struct objio_state, oir);
- kfree(ios);
+ ore_put_io_state(objios->ios);
+ kfree(objios);
}
enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
@@ -455,539 +378,152 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
}
}
-static void _clear_bio(struct bio *bio)
+static void __on_dev_error(struct ore_io_state *ios,
+ struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
+ u64 dev_offset, u64 dev_len)
{
- struct bio_vec *bv;
- unsigned i;
-
- __bio_for_each_segment(bv, bio, i, 0) {
- unsigned this_count = bv->bv_len;
-
- if (likely(PAGE_SIZE == this_count))
- clear_highpage(bv->bv_page);
- else
- zero_user(bv->bv_page, bv->bv_offset, this_count);
- }
-}
-
-static int _io_check(struct objio_state *ios, bool is_write)
-{
- enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
- int lin_ret = 0;
- int i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_sense_info osi;
- struct osd_request *or = ios->per_dev[i].or;
- int ret;
-
- if (!or)
- continue;
+ struct objio_state *objios = ios->private;
+ struct pnfs_osd_objid pooid;
+ struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
+ /* FIXME: what to do with more-then-one-group layouts. We need to
+ * translate from ore_io_state index to oc->comps index
+ */
+ unsigned comp = dev_index;
- ret = osd_req_decode_sense(or, &osi);
- if (likely(!ret))
- continue;
+ pooid.oid_device_id = ode->id_node.deviceid;
+ pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
+ pooid.oid_object_id = ios->oc->comps[comp].obj.id;
- if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
- /* start read offset passed endof file */
- BUG_ON(is_write);
- _clear_bio(ios->per_dev[i].bio);
- dprintk("%s: start read offset passed end of file "
- "offset=0x%llx, length=0x%lx\n", __func__,
- _LLU(ios->per_dev[i].offset),
- ios->per_dev[i].length);
-
- continue; /* we recovered */
- }
- objlayout_io_set_result(&ios->ol_state, i,
- &ios->layout->comps[i].oc_object_id,
- osd_pri_2_pnfs_err(osi.osd_err_pri),
- ios->per_dev[i].offset,
- ios->per_dev[i].length,
- is_write);
-
- if (osi.osd_err_pri >= oep) {
- oep = osi.osd_err_pri;
- lin_ret = ret;
- }
- }
-
- return lin_ret;
-}
-
-/*
- * Common IO state helpers.
- */
-static void _io_free(struct objio_state *ios)
-{
- unsigned i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct _objio_per_comp *per_dev = &ios->per_dev[i];
-
- if (per_dev->or) {
- osd_end_request(per_dev->or);
- per_dev->or = NULL;
- }
-
- if (per_dev->bio) {
- bio_put(per_dev->bio);
- per_dev->bio = NULL;
- }
- }
-}
-
-struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
-{
- unsigned min_dev = ios->layout->comps_index;
- unsigned max_dev = min_dev + ios->layout->num_comps;
-
- BUG_ON(dev < min_dev || max_dev <= dev);
- return ios->layout->ods[dev - min_dev]->od;
-}
-
-struct _striping_info {
- u64 obj_offset;
- u64 group_length;
- unsigned dev;
- unsigned unit_off;
-};
-
-static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
- struct _striping_info *si)
-{
- u32 stripe_unit = ios->layout->stripe_unit;
- u32 group_width = ios->layout->group_width;
- u64 group_depth = ios->layout->group_depth;
- u32 U = stripe_unit * group_width;
-
- u64 T = U * group_depth;
- u64 S = T * ios->layout->group_count;
- u64 M = div64_u64(file_offset, S);
-
- /*
- G = (L - (M * S)) / T
- H = (L - (M * S)) % T
- */
- u64 LmodU = file_offset - M * S;
- u32 G = div64_u64(LmodU, T);
- u64 H = LmodU - G * T;
-
- u32 N = div_u64(H, U);
-
- div_u64_rem(file_offset, stripe_unit, &si->unit_off);
- si->obj_offset = si->unit_off + (N * stripe_unit) +
- (M * group_depth * stripe_unit);
-
- /* "H - (N * U)" is just "H % U" so it's bound to u32 */
- si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
- si->dev *= ios->layout->mirrors_p1;
-
- si->group_length = T - H;
-}
-
-static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
- unsigned pgbase, struct _objio_per_comp *per_dev, int len,
- gfp_t gfp_flags)
-{
- unsigned pg = *cur_pg;
- int cur_len = len;
- struct request_queue *q =
- osd_request_queue(_io_od(ios, per_dev->dev));
-
- if (per_dev->bio == NULL) {
- unsigned pages_in_stripe = ios->layout->group_width *
- (ios->layout->stripe_unit / PAGE_SIZE);
- unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
- ios->layout->group_width;
-
- if (BIO_MAX_PAGES_KMALLOC < bio_size)
- bio_size = BIO_MAX_PAGES_KMALLOC;
-
- per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
- if (unlikely(!per_dev->bio)) {
- dprintk("Faild to allocate BIO size=%u\n", bio_size);
- return -ENOMEM;
- }
- }
-
- while (cur_len > 0) {
- unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
- unsigned added_len;
-
- BUG_ON(ios->ol_state.nr_pages <= pg);
- cur_len -= pglen;
-
- added_len = bio_add_pc_page(q, per_dev->bio,
- ios->ol_state.pages[pg], pglen, pgbase);
- if (unlikely(pglen != added_len))
- return -ENOMEM;
- pgbase = 0;
- ++pg;
- }
- BUG_ON(cur_len);
-
- per_dev->length += len;
- *cur_pg = pg;
- return 0;
-}
-
-static int _prepare_one_group(struct objio_state *ios, u64 length,
- struct _striping_info *si, unsigned *last_pg,
- gfp_t gfp_flags)
-{
- unsigned stripe_unit = ios->layout->stripe_unit;
- unsigned mirrors_p1 = ios->layout->mirrors_p1;
- unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
- unsigned dev = si->dev;
- unsigned first_dev = dev - (dev % devs_in_group);
- unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
- unsigned cur_pg = *last_pg;
- int ret = 0;
-
- while (length) {
- struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
- unsigned cur_len, page_off = 0;
-
- if (!per_dev->length) {
- per_dev->dev = dev;
- if (dev < si->dev) {
- per_dev->offset = si->obj_offset + stripe_unit -
- si->unit_off;
- cur_len = stripe_unit;
- } else if (dev == si->dev) {
- per_dev->offset = si->obj_offset;
- cur_len = stripe_unit - si->unit_off;
- page_off = si->unit_off & ~PAGE_MASK;
- BUG_ON(page_off &&
- (page_off != ios->ol_state.pgbase));
- } else { /* dev > si->dev */
- per_dev->offset = si->obj_offset - si->unit_off;
- cur_len = stripe_unit;
- }
-
- if (max_comp < dev - first_dev)
- max_comp = dev - first_dev;
- } else {
- cur_len = stripe_unit;
- }
- if (cur_len >= length)
- cur_len = length;
-
- ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
- cur_len, gfp_flags);
- if (unlikely(ret))
- goto out;
-
- dev += mirrors_p1;
- dev = (dev % devs_in_group) + first_dev;
-
- length -= cur_len;
- ios->length += cur_len;
- }
-out:
- ios->numdevs = max_comp + mirrors_p1;
- *last_pg = cur_pg;
- return ret;
-}
-
-static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
-{
- u64 length = ios->ol_state.count;
- u64 offset = ios->ol_state.offset;
- struct _striping_info si;
- unsigned last_pg = 0;
- int ret = 0;
-
- while (length) {
- _calc_stripe_info(ios, offset, &si);
-
- if (length < si.group_length)
- si.group_length = length;
-
- ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
- if (unlikely(ret))
- goto out;
-
- offset += si.group_length;
- length -= si.group_length;
- }
-
-out:
- if (!ios->length)
- return ret;
-
- return 0;
-}
-
-static ssize_t _sync_done(struct objio_state *ios)
-{
- struct completion *waiting = ios->private;
-
- complete(waiting);
- return 0;
-}
-
-static void _last_io(struct kref *kref)
-{
- struct objio_state *ios = container_of(kref, struct objio_state, kref);
-
- ios->done(ios);
-}
-
-static void _done_io(struct osd_request *or, void *p)
-{
- struct objio_state *ios = p;
-
- kref_put(&ios->kref, _last_io);
-}
-
-static ssize_t _io_exec(struct objio_state *ios)
-{
- DECLARE_COMPLETION_ONSTACK(wait);
- ssize_t status = 0; /* sync status */
- unsigned i;
- objio_done_fn saved_done_fn = ios->done;
- bool sync = ios->ol_state.sync;
-
- if (sync) {
- ios->done = _sync_done;
- ios->private = &wait;
- }
-
- kref_init(&ios->kref);
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_request *or = ios->per_dev[i].or;
-
- if (!or)
- continue;
-
- kref_get(&ios->kref);
- osd_execute_request_async(or, _done_io, ios);
- }
-
- kref_put(&ios->kref, _last_io);
-
- if (sync) {
- wait_for_completion(&wait);
- status = saved_done_fn(ios);
- }
-
- return status;
+ objlayout_io_set_result(&objios->oir, comp,
+ &pooid, osd_pri_2_pnfs_err(oep),
+ dev_offset, dev_len, !ios->reading);
}
/*
* read
*/
-static ssize_t _read_done(struct objio_state *ios)
+static void _read_done(struct ore_io_state *ios, void *private)
{
+ struct objio_state *objios = private;
ssize_t status;
- int ret = _io_check(ios, false);
+ int ret = ore_check_io(ios, &__on_dev_error);
- _io_free(ios);
+ /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
if (likely(!ret))
status = ios->length;
else
status = ret;
- objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
- return status;
+ objlayout_read_done(&objios->oir, status, objios->sync);
}
-static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
+int objio_read_pagelist(struct nfs_read_data *rdata)
{
- struct osd_request *or = NULL;
- struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
- unsigned dev = per_dev->dev;
- struct pnfs_osd_object_cred *cred =
- &ios->layout->comps[cur_comp];
- struct osd_obj_id obj = {
- .partition = cred->oc_object_id.oid_partition_id,
- .id = cred->oc_object_id.oid_object_id,
- };
+ struct objio_state *objios;
int ret;
- or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
- if (unlikely(!or)) {
- ret = -ENOMEM;
- goto err;
- }
- per_dev->or = or;
-
- osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
-
- ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
- if (ret) {
- dprintk("%s: Faild to osd_finalize_request() => %d\n",
- __func__, ret);
- goto err;
- }
-
- dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
- __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
- per_dev->length);
-
-err:
- return ret;
-}
-
-static ssize_t _read_exec(struct objio_state *ios)
-{
- unsigned i;
- int ret;
-
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- if (!ios->per_dev[i].length)
- continue;
- ret = _read_mirrors(ios, i);
- if (unlikely(ret))
- goto err;
- }
-
- ios->done = _read_done;
- return _io_exec(ios); /* In sync mode exec returns the io status */
-
-err:
- _io_free(ios);
- return ret;
-}
-
-ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
-{
- struct objio_state *ios = container_of(ol_state, struct objio_state,
- ol_state);
- int ret;
-
- ret = _io_rw_pagelist(ios, GFP_KERNEL);
+ ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
+ rdata->lseg, rdata->args.pages, rdata->args.pgbase,
+ rdata->args.offset, rdata->args.count, rdata,
+ GFP_KERNEL, &objios);
if (unlikely(ret))
return ret;
- return _read_exec(ios);
+ objios->ios->done = _read_done;
+ dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
+ rdata->args.offset, rdata->args.count);
+ return ore_read(objios->ios);
}
/*
* write
*/
-static ssize_t _write_done(struct objio_state *ios)
+static void _write_done(struct ore_io_state *ios, void *private)
{
+ struct objio_state *objios = private;
ssize_t status;
- int ret = _io_check(ios, true);
+ int ret = ore_check_io(ios, &__on_dev_error);
- _io_free(ios);
+ /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
if (likely(!ret)) {
/* FIXME: should be based on the OSD's persistence model
* See OSD2r05 Section 4.13 Data persistence model */
- ios->ol_state.committed = NFS_FILE_SYNC;
+ objios->oir.committed = NFS_FILE_SYNC;
status = ios->length;
} else {
status = ret;
}
- objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
- return status;
+ objlayout_write_done(&objios->oir, status, objios->sync);
}
-static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
+static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
{
- struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
- unsigned dev = ios->per_dev[cur_comp].dev;
- unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
- int ret;
-
- for (; cur_comp < last_comp; ++cur_comp, ++dev) {
- struct osd_request *or = NULL;
- struct pnfs_osd_object_cred *cred =
- &ios->layout->comps[cur_comp];
- struct osd_obj_id obj = {
- .partition = cred->oc_object_id.oid_partition_id,
- .id = cred->oc_object_id.oid_object_id,
- };
- struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
- struct bio *bio;
-
- or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
- if (unlikely(!or)) {
- ret = -ENOMEM;
- goto err;
- }
- per_dev->or = or;
-
- if (per_dev != master_dev) {
- bio = bio_kmalloc(GFP_NOFS,
- master_dev->bio->bi_max_vecs);
- if (unlikely(!bio)) {
- dprintk("Faild to allocate BIO size=%u\n",
- master_dev->bio->bi_max_vecs);
- ret = -ENOMEM;
- goto err;
- }
-
- __bio_clone(bio, master_dev->bio);
- bio->bi_bdev = NULL;
- bio->bi_next = NULL;
- per_dev->bio = bio;
- per_dev->dev = dev;
- per_dev->length = master_dev->length;
- per_dev->offset = master_dev->offset;
- } else {
- bio = master_dev->bio;
- bio->bi_rw |= REQ_WRITE;
- }
-
- osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
+ struct objio_state *objios = priv;
+ struct nfs_write_data *wdata = objios->oir.rpcdata;
+ pgoff_t index = offset / PAGE_SIZE;
+ struct page *page = find_get_page(wdata->inode->i_mapping, index);
- ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
- if (ret) {
- dprintk("%s: Faild to osd_finalize_request() => %d\n",
- __func__, ret);
- goto err;
+ if (!page) {
+ page = find_or_create_page(wdata->inode->i_mapping,
+ index, GFP_NOFS);
+ if (unlikely(!page)) {
+ dprintk("%s: grab_cache_page Failed index=0x%lx\n",
+ __func__, index);
+ return NULL;
}
-
- dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
- __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
- per_dev->length);
+ unlock_page(page);
}
+ if (PageDirty(page) || PageWriteback(page))
+ *uptodate = true;
+ else
+ *uptodate = PageUptodate(page);
+ dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
+ return page;
+}
-err:
- return ret;
+static void __r4w_put_page(void *priv, struct page *page)
+{
+ dprintk("%s: index=0x%lx\n", __func__, page->index);
+ page_cache_release(page);
+ return;
}
-static ssize_t _write_exec(struct objio_state *ios)
+static const struct _ore_r4w_op _r4w_op = {
+ .get_page = &__r4w_get_page,
+ .put_page = &__r4w_put_page,
+};
+
+int objio_write_pagelist(struct nfs_write_data *wdata, int how)
{
- unsigned i;
+ struct objio_state *objios;
int ret;
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- if (!ios->per_dev[i].length)
- continue;
- ret = _write_mirrors(ios, i);
- if (unlikely(ret))
- goto err;
- }
-
- ios->done = _write_done;
- return _io_exec(ios); /* In sync mode exec returns the io->status */
+ ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false,
+ wdata->lseg, wdata->args.pages, wdata->args.pgbase,
+ wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
+ &objios);
+ if (unlikely(ret))
+ return ret;
-err:
- _io_free(ios);
- return ret;
-}
+ objios->sync = 0 != (how & FLUSH_SYNC);
+ objios->ios->r4w = &_r4w_op;
-ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
-{
- struct objio_state *ios = container_of(ol_state, struct objio_state,
- ol_state);
- int ret;
+ if (!objios->sync)
+ objios->ios->done = _write_done;
- /* TODO: ios->stable = stable; */
- ret = _io_rw_pagelist(ios, GFP_NOFS);
+ dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
+ wdata->args.offset, wdata->args.count);
+ ret = ore_write(objios->ios);
if (unlikely(ret))
return ret;
- return _write_exec(ios);
+ if (objios->sync)
+ _write_done(objios->ios, objios);
+
+ return 0;
}
static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
@@ -997,7 +533,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
return false;
return pgio->pg_count + req->wb_bytes <=
- OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
+ OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
}
static const struct nfs_pageio_ops objio_pg_read_ops = {
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 1d06f8e2ade..72074e3a04f 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -156,77 +156,39 @@ last_byte_offset(u64 start, u64 len)
return end > start ? end - 1 : NFS4_MAX_UINT64;
}
-static struct objlayout_io_state *
-objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
- struct page **pages,
- unsigned pgbase,
- loff_t offset,
- size_t count,
- struct pnfs_layout_segment *lseg,
- void *rpcdata,
- gfp_t gfp_flags)
+void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
+ struct page ***p_pages, unsigned *p_pgbase,
+ u64 offset, unsigned long count)
{
- struct objlayout_io_state *state;
u64 lseg_end_offset;
- dprintk("%s: allocating io_state\n", __func__);
- if (objio_alloc_io_state(lseg, &state, gfp_flags))
- return NULL;
-
BUG_ON(offset < lseg->pls_range.offset);
lseg_end_offset = end_offset(lseg->pls_range.offset,
lseg->pls_range.length);
BUG_ON(offset >= lseg_end_offset);
- if (offset + count > lseg_end_offset) {
- count = lseg->pls_range.length -
- (offset - lseg->pls_range.offset);
- dprintk("%s: truncated count %Zd\n", __func__, count);
- }
+ WARN_ON(offset + count > lseg_end_offset);
- if (pgbase > PAGE_SIZE) {
- pages += pgbase >> PAGE_SHIFT;
- pgbase &= ~PAGE_MASK;
+ if (*p_pgbase > PAGE_SIZE) {
+ dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase);
+ *p_pages += *p_pgbase >> PAGE_SHIFT;
+ *p_pgbase &= ~PAGE_MASK;
}
-
- INIT_LIST_HEAD(&state->err_list);
- state->lseg = lseg;
- state->rpcdata = rpcdata;
- state->pages = pages;
- state->pgbase = pgbase;
- state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
- state->offset = offset;
- state->count = count;
- state->sync = 0;
-
- return state;
-}
-
-static void
-objlayout_free_io_state(struct objlayout_io_state *state)
-{
- dprintk("%s: freeing io_state\n", __func__);
- if (unlikely(!state))
- return;
-
- objio_free_io_state(state);
}
/*
* I/O done common code
*/
static void
-objlayout_iodone(struct objlayout_io_state *state)
+objlayout_iodone(struct objlayout_io_res *oir)
{
- dprintk("%s: state %p status\n", __func__, state);
-
- if (likely(state->status >= 0)) {
- objlayout_free_io_state(state);
+ if (likely(oir->status >= 0)) {
+ objio_free_result(oir);
} else {
- struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
+ struct objlayout *objlay = oir->objlay;
spin_lock(&objlay->lock);
objlay->delta_space_valid = OBJ_DSU_INVALID;
- list_add(&objlay->err_list, &state->err_list);
+ list_add(&objlay->err_list, &oir->err_list);
spin_unlock(&objlay->lock);
}
}
@@ -238,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state)
* the error for later reporting at layout-return.
*/
void
-objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
+objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
struct pnfs_osd_objid *pooid, int osd_error,
u64 offset, u64 length, bool is_write)
{
- struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
+ struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index];
- BUG_ON(index >= state->num_comps);
+ BUG_ON(index >= oir->num_comps);
if (osd_error) {
ioerr->oer_component = *pooid;
ioerr->oer_comp_offset = offset;
@@ -285,21 +247,18 @@ static void _rpc_read_complete(struct work_struct *work)
}
void
-objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
+objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- int eof = state->eof;
- struct nfs_read_data *rdata;
+ struct nfs_read_data *rdata = oir->rpcdata;
- state->status = status;
- dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof);
- rdata = state->rpcdata;
- rdata->task.tk_status = status;
- if (status >= 0) {
+ oir->status = rdata->task.tk_status = status;
+ if (status >= 0)
rdata->res.count = status;
- rdata->res.eof = eof;
- }
- objlayout_iodone(state);
- /* must not use state after this point */
+ objlayout_iodone(oir);
+ /* must not use oir after this point */
+
+ dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
+ status, rdata->res.eof, sync);
if (sync)
pnfs_ld_read_done(rdata);
@@ -317,40 +276,36 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
{
loff_t offset = rdata->args.offset;
size_t count = rdata->args.count;
- struct objlayout_io_state *state;
- ssize_t status = 0;
+ int err;
loff_t eof;
- dprintk("%s: Begin inode %p offset %llu count %d\n",
- __func__, rdata->inode, offset, (int)count);
-
eof = i_size_read(rdata->inode);
if (unlikely(offset + count > eof)) {
if (offset >= eof) {
- status = 0;
+ err = 0;
rdata->res.count = 0;
rdata->res.eof = 1;
+ /*FIXME: do we need to call pnfs_ld_read_done() */
goto out;
}
count = eof - offset;
}
- state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
- rdata->args.pages, rdata->args.pgbase,
- offset, count,
- rdata->lseg, rdata,
- GFP_KERNEL);
- if (unlikely(!state)) {
- status = -ENOMEM;
- goto out;
- }
+ rdata->res.eof = (offset + count) >= eof;
+ _fix_verify_io_params(rdata->lseg, &rdata->args.pages,
+ &rdata->args.pgbase,
+ rdata->args.offset, rdata->args.count);
- state->eof = state->offset + state->count >= eof;
+ dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
+ __func__, rdata->inode->i_ino, offset, count, rdata->res.eof);
- status = objio_read_pagelist(state);
+ err = objio_read_pagelist(rdata);
out:
- dprintk("%s: Return status %Zd\n", __func__, status);
- rdata->pnfs_error = status;
+ if (unlikely(err)) {
+ rdata->pnfs_error = err;
+ dprintk("%s: Returned Error %d\n", __func__, err);
+ return PNFS_NOT_ATTEMPTED;
+ }
return PNFS_ATTEMPTED;
}
@@ -371,26 +326,20 @@ static void _rpc_write_complete(struct work_struct *work)
}
void
-objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
- bool sync)
+objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- struct nfs_write_data *wdata;
+ struct nfs_write_data *wdata = oir->rpcdata;
- dprintk("%s: Begin\n", __func__);
- wdata = state->rpcdata;
- state->status = status;
- wdata->task.tk_status = status;
+ oir->status = wdata->task.tk_status = status;
if (status >= 0) {
wdata->res.count = status;
- wdata->verf.committed = state->committed;
- dprintk("%s: Return status %d committed %d\n",
- __func__, wdata->task.tk_status,
- wdata->verf.committed);
- } else
- dprintk("%s: Return status %d\n",
- __func__, wdata->task.tk_status);
- objlayout_iodone(state);
- /* must not use state after this point */
+ wdata->verf.committed = oir->committed;
+ }
+ objlayout_iodone(oir);
+ /* must not use oir after this point */
+
+ dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
+ status, wdata->verf.committed, sync);
if (sync)
pnfs_ld_write_done(wdata);
@@ -407,30 +356,18 @@ enum pnfs_try_status
objlayout_write_pagelist(struct nfs_write_data *wdata,
int how)
{
- struct objlayout_io_state *state;
- ssize_t status;
-
- dprintk("%s: Begin inode %p offset %llu count %u\n",
- __func__, wdata->inode, wdata->args.offset, wdata->args.count);
-
- state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
- wdata->args.pages,
- wdata->args.pgbase,
- wdata->args.offset,
- wdata->args.count,
- wdata->lseg, wdata,
- GFP_NOFS);
- if (unlikely(!state)) {
- status = -ENOMEM;
- goto out;
- }
+ int err;
- state->sync = how & FLUSH_SYNC;
+ _fix_verify_io_params(wdata->lseg, &wdata->args.pages,
+ &wdata->args.pgbase,
+ wdata->args.offset, wdata->args.count);
- status = objio_write_pagelist(state, how & FLUSH_STABLE);
- out:
- dprintk("%s: Return status %Zd\n", __func__, status);
- wdata->pnfs_error = status;
+ err = objio_write_pagelist(wdata, how);
+ if (unlikely(err)) {
+ wdata->pnfs_error = err;
+ dprintk("%s: Returned Error %d\n", __func__, err);
+ return PNFS_NOT_ATTEMPTED;
+ }
return PNFS_ATTEMPTED;
}
@@ -537,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err,
static void
encode_accumulated_error(struct objlayout *objlay, __be32 *p)
{
- struct objlayout_io_state *state, *tmp;
+ struct objlayout_io_res *oir, *tmp;
struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
- list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+ list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
unsigned i;
- for (i = 0; i < state->num_comps; i++) {
- struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+ for (i = 0; i < oir->num_comps; i++) {
+ struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
if (!ioerr->oer_errno)
continue;
@@ -563,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
merge_ioerr(&accumulated_err, ioerr);
}
- list_del(&state->err_list);
- objlayout_free_io_state(state);
+ list_del(&oir->err_list);
+ objio_free_result(oir);
}
pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
@@ -576,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
const struct nfs4_layoutreturn_args *args)
{
struct objlayout *objlay = OBJLAYOUT(pnfslay);
- struct objlayout_io_state *state, *tmp;
+ struct objlayout_io_res *oir, *tmp;
__be32 *start;
dprintk("%s: Begin\n", __func__);
@@ -585,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
spin_lock(&objlay->lock);
- list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+ list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
__be32 *last_xdr = NULL, *p;
unsigned i;
int res = 0;
- for (i = 0; i < state->num_comps; i++) {
- struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+ for (i = 0; i < oir->num_comps; i++) {
+ struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
if (!ioerr->oer_errno)
continue;
@@ -615,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
}
last_xdr = p;
- pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]);
+ pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]);
}
/* TODO: use xdr_write_pages */
@@ -631,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
encode_accumulated_error(objlay, last_xdr);
goto loop_done;
}
- list_del(&state->err_list);
- objlayout_free_io_state(state);
+ list_del(&oir->err_list);
+ objio_free_result(oir);
}
loop_done:
spin_unlock(&objlay->lock);
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index a8244c8e042..8ec34727ed2 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -74,19 +74,11 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo)
* per-I/O operation state
* embedded in objects provider io_state data structure
*/
-struct objlayout_io_state {
- struct pnfs_layout_segment *lseg;
-
- struct page **pages;
- unsigned pgbase;
- unsigned nr_pages;
- unsigned long count;
- loff_t offset;
- bool sync;
+struct objlayout_io_res {
+ struct objlayout *objlay;
void *rpcdata;
int status; /* res */
- int eof; /* res */
int committed; /* res */
/* Error reporting (layout_return) */
@@ -100,6 +92,18 @@ struct objlayout_io_state {
struct pnfs_osd_ioerr *ioerrs;
};
+static inline
+void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps,
+ struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
+ struct pnfs_layout_hdr *pnfs_layout_type)
+{
+ oir->objlay = OBJLAYOUT(pnfs_layout_type);
+ oir->rpcdata = rpcdata;
+ INIT_LIST_HEAD(&oir->err_list);
+ oir->num_comps = num_comps;
+ oir->ioerrs = ioerrs;
+}
+
/*
* Raid engine I/O API
*/
@@ -110,28 +114,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
gfp_t gfp_flags);
extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
-extern int objio_alloc_io_state(
- struct pnfs_layout_segment *lseg,
- struct objlayout_io_state **outp,
- gfp_t gfp_flags);
-extern void objio_free_io_state(struct objlayout_io_state *state);
+/* objio_free_result will free these @oir structs recieved from
+ * objlayout_{read,write}_done
+ */
+extern void objio_free_result(struct objlayout_io_res *oir);
-extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
-extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
- bool stable);
+extern int objio_read_pagelist(struct nfs_read_data *rdata);
+extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);
/*
* callback API
*/
-extern void objlayout_io_set_result(struct objlayout_io_state *state,
+extern void objlayout_io_set_result(struct objlayout_io_res *oir,
unsigned index, struct pnfs_osd_objid *pooid,
int osd_error, u64 offset, u64 length, bool is_write);
static inline void
-objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
+objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
{
- struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
-
/* If one of the I/Os errored out and the delta_space_used was
* invalid we render the complete report as invalid. Protocol mandate
* the DSU be accurate or not reported.
@@ -144,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
spin_unlock(&objlay->lock);
}
-extern void objlayout_read_done(struct objlayout_io_state *state,
+extern void objlayout_read_done(struct objlayout_io_res *oir,
ssize_t status, bool sync);
-extern void objlayout_write_done(struct objlayout_io_state *state,
+extern void objlayout_write_done(struct objlayout_io_res *oir,
ssize_t status, bool sync);
extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index b60970cc7f1..5668f7c54c4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -18,6 +18,7 @@
#include <linux/nfs_page.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
+#include <linux/export.h>
#include "internal.h"
#include "pnfs.h"
@@ -41,7 +42,7 @@ nfs_page_free(struct nfs_page *p)
/**
* nfs_create_request - Create an NFS read/write request.
- * @file: file descriptor to use
+ * @ctx: open context to use
* @inode: inode to which the request is attached
* @page: page to write
* @offset: starting offset within the page for the write
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e550e8836c3..8e672a2b2d6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -29,6 +29,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
+#include <linux/module.h>
#include "internal.h"
#include "pnfs.h"
#include "iostat.h"
@@ -1168,23 +1169,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
/*
* Called by non rpc-based layout drivers
*/
-int
-pnfs_ld_write_done(struct nfs_write_data *data)
+void pnfs_ld_write_done(struct nfs_write_data *data)
{
- int status;
-
- if (!data->pnfs_error) {
+ if (likely(!data->pnfs_error)) {
pnfs_set_layoutcommit(data);
data->mds_ops->rpc_call_done(&data->task, data);
- data->mds_ops->rpc_release(data);
- return 0;
+ } else {
+ put_lseg(data->lseg);
+ data->lseg = NULL;
+ dprintk("pnfs write error = %d\n", data->pnfs_error);
}
-
- dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
- data->pnfs_error);
- status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
- data->mds_ops, NFS_FILE_SYNC);
- return status ? : -EAGAIN;
+ data->mds_ops->rpc_release(data);
}
EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
@@ -1265,26 +1260,36 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
+static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
+{
+ struct nfs_pageio_descriptor pgio;
+
+ put_lseg(data->lseg);
+ data->lseg = NULL;
+ dprintk("pnfs write error = %d\n", data->pnfs_error);
+
+ nfs_pageio_init_read_mds(&pgio, data->inode);
+
+ while (!list_empty(&data->pages)) {
+ struct nfs_page *req = nfs_list_entry(data->pages.next);
+
+ nfs_list_remove_request(req);
+ nfs_pageio_add_request(&pgio, req);
+ }
+ nfs_pageio_complete(&pgio);
+}
+
/*
* Called by non rpc-based layout drivers
*/
-int
-pnfs_ld_read_done(struct nfs_read_data *data)
+void pnfs_ld_read_done(struct nfs_read_data *data)
{
- int status;
-
- if (!data->pnfs_error) {
+ if (likely(!data->pnfs_error)) {
__nfs4_read_done_cb(data);
data->mds_ops->rpc_call_done(&data->task, data);
- data->mds_ops->rpc_release(data);
- return 0;
- }
-
- dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
- data->pnfs_error);
- status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
- data->mds_ops);
- return status ? : -EAGAIN;
+ } else
+ pnfs_ld_handle_read_error(data);
+ data->mds_ops->rpc_release(data);
}
EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
@@ -1381,6 +1386,18 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
}
}
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+ if (lseg->pls_range.iomode == IOMODE_RW) {
+ dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
+ set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+ } else {
+ dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
+ set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+ }
+}
+EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
+
void
pnfs_set_layoutcommit(struct nfs_write_data *wdata)
{
@@ -1443,17 +1460,31 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
data = kzalloc(sizeof(*data), GFP_NOFS);
if (!data) {
- mark_inode_dirty_sync(inode);
status = -ENOMEM;
goto out;
}
+ if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+ goto out_free;
+
+ if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
+ if (!sync) {
+ status = -EAGAIN;
+ goto out_free;
+ }
+ status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+ if (status)
+ goto out_free;
+ }
+
INIT_LIST_HEAD(&data->lseg_list);
spin_lock(&inode->i_lock);
if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+ clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
spin_unlock(&inode->i_lock);
- kfree(data);
- goto out;
+ wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
+ goto out_free;
}
pnfs_list_write_lseg(inode, &data->lseg_list);
@@ -1475,6 +1506,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
status = nfs4_proc_layoutcommit(data, sync);
out:
+ if (status)
+ mark_inode_dirty_sync(inode);
dprintk("<-- %s status %d\n", __func__, status);
return status;
+out_free:
+ kfree(data);
+ goto out;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 01cbfd54f3c..1509530cb11 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -178,6 +178,7 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *);
int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
@@ -200,8 +201,8 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
int _pnfs_return_layout(struct inode *);
-int pnfs_ld_write_done(struct nfs_write_data *);
-int pnfs_ld_read_done(struct nfs_read_data *);
+void pnfs_ld_write_done(struct nfs_write_data *);
+void pnfs_ld_read_done(struct nfs_read_data *);
struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
loff_t pos,
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6fda5228ef5..4f359d2a26e 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -28,6 +28,7 @@
* such damages.
*/
+#include <linux/export.h>
#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ac40b8535d7..f48125da198 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -710,6 +710,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.dentry_ops = &nfs_dentry_operations,
.dir_inode_ops = &nfs_dir_inode_operations,
.file_inode_ops = &nfs_file_inode_operations,
+ .file_ops = &nfs_file_operations,
.getroot = nfs_proc_get_root,
.getattr = nfs_proc_getattr,
.setattr = nfs_proc_setattr,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2171c043ab0..cfa175c223d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -35,16 +35,13 @@ static const struct rpc_call_ops nfs_read_partial_ops;
static const struct rpc_call_ops nfs_read_full_ops;
static struct kmem_cache *nfs_rdata_cachep;
-static mempool_t *nfs_rdata_mempool;
-
-#define MIN_POOL_READ (32)
struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
{
- struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL);
+ struct nfs_read_data *p;
+ p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
if (p) {
- memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&p->pages);
p->npages = pagecount;
if (pagecount <= ARRAY_SIZE(p->page_array))
@@ -52,7 +49,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
else {
p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
if (!p->pagevec) {
- mempool_free(p, nfs_rdata_mempool);
+ kmem_cache_free(nfs_rdata_cachep, p);
p = NULL;
}
}
@@ -64,7 +61,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
{
if (p && (p->pagevec != &p->page_array[0]))
kfree(p->pagevec);
- mempool_free(p, nfs_rdata_mempool);
+ kmem_cache_free(nfs_rdata_cachep, p);
}
void nfs_readdata_release(struct nfs_read_data *rdata)
@@ -112,7 +109,7 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
}
}
-static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
+void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
struct inode *inode)
{
nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
@@ -276,7 +273,6 @@ nfs_async_read_error(struct list_head *head)
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
- SetPageError(req->wb_page);
nfs_readpage_release(req);
}
}
@@ -322,7 +318,6 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head
offset += len;
} while(nbytes != 0);
atomic_set(&req->wb_complete, requests);
- ClearPageError(page);
desc->pg_rpc_callops = &nfs_read_partial_ops;
return ret;
out_bad:
@@ -331,7 +326,6 @@ out_bad:
list_del(&data->list);
nfs_readdata_free(data);
}
- SetPageError(page);
nfs_readpage_release(req);
return -ENOMEM;
}
@@ -357,7 +351,6 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
nfs_list_add_request(req, &data->pages);
- ClearPageError(req->wb_page);
*pages++ = req->wb_page;
}
req = nfs_list_entry(data->pages.next);
@@ -435,7 +428,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
argp->offset += resp->count;
argp->pgbase += resp->count;
argp->count -= resp->count;
- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
+ rpc_restart_call_prepare(task);
}
/*
@@ -462,10 +455,10 @@ static void nfs_readpage_release_partial(void *calldata)
int status = data->task.tk_status;
if (status < 0)
- SetPageError(page);
+ set_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags);
if (atomic_dec_and_test(&req->wb_complete)) {
- if (!PageError(page))
+ if (!test_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags))
SetPageUptodate(page);
nfs_readpage_release(req);
}
@@ -648,7 +641,6 @@ readpage_async_filler(void *data, struct page *page)
return 0;
out_error:
error = PTR_ERR(new);
- SetPageError(page);
out_unlock:
unlock_page(page);
return error;
@@ -711,16 +703,10 @@ int __init nfs_init_readpagecache(void)
if (nfs_rdata_cachep == NULL)
return -ENOMEM;
- nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ,
- nfs_rdata_cachep);
- if (nfs_rdata_mempool == NULL)
- return -ENOMEM;
-
return 0;
}
void nfs_destroy_readpagecache(void)
{
- mempool_destroy(nfs_rdata_mempool);
kmem_cache_destroy(nfs_rdata_cachep);
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 5b19b6aabe1..134777406ee 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -733,18 +733,22 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
return 0;
}
+
+#ifdef CONFIG_NFS_V4
#ifdef CONFIG_NFS_V4_1
-void show_sessions(struct seq_file *m, struct nfs_server *server)
+static void show_sessions(struct seq_file *m, struct nfs_server *server)
{
if (nfs4_has_session(server->nfs_client))
seq_printf(m, ",sessions");
}
#else
-void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+static void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+#endif
#endif
+#ifdef CONFIG_NFS_V4
#ifdef CONFIG_NFS_V4_1
-void show_pnfs(struct seq_file *m, struct nfs_server *server)
+static void show_pnfs(struct seq_file *m, struct nfs_server *server)
{
seq_printf(m, ",pnfs=");
if (server->pnfs_curr_ld)
@@ -752,9 +756,10 @@ void show_pnfs(struct seq_file *m, struct nfs_server *server)
else
seq_printf(m, "not configured");
}
-#else /* CONFIG_NFS_V4_1 */
-void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
-#endif /* CONFIG_NFS_V4_1 */
+#else
+static void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
+#endif
+#endif
static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
{
@@ -2782,43 +2787,18 @@ static void nfs_referral_loop_unprotect(void)
static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
const char *export_path)
{
- struct mnt_namespace *ns_private;
- struct super_block *s;
struct dentry *dentry;
- struct path path;
- int ret;
-
- ns_private = create_mnt_ns(root_mnt);
- ret = PTR_ERR(ns_private);
- if (IS_ERR(ns_private))
- goto out_mntput;
+ int ret = nfs_referral_loop_protect();
- ret = nfs_referral_loop_protect();
- if (ret != 0)
- goto out_put_mnt_ns;
-
- ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
- export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
+ if (ret) {
+ mntput(root_mnt);
+ return ERR_PTR(ret);
+ }
+ dentry = mount_subtree(root_mnt, export_path);
nfs_referral_loop_unprotect();
- put_mnt_ns(ns_private);
-
- if (ret != 0)
- goto out_err;
-
- s = path.mnt->mnt_sb;
- atomic_inc(&s->s_active);
- dentry = dget(path.dentry);
- path_put(&path);
- down_write(&s->s_umount);
return dentry;
-out_put_mnt_ns:
- put_mnt_ns(ns_private);
-out_mntput:
- mntput(root_mnt);
-out_err:
- return ERR_PTR(ret);
}
static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index b2fbbde58e4..4f9319a2e56 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -87,7 +87,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
struct inode *dir = data->dir;
if (!NFS_PROTO(dir)->unlink_done(task, dir))
- nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client);
+ rpc_restart_call_prepare(task);
}
/**
@@ -369,7 +369,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
struct dentry *new_dentry = data->new_dentry;
if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
- nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client);
+ rpc_restart_call_prepare(task);
return;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c9bd2a6b7d4..1dda78db6a7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -20,6 +20,7 @@
#include <linux/nfs_mount.h>
#include <linux/nfs_page.h>
#include <linux/backing-dev.h>
+#include <linux/export.h>
#include <asm/uaccess.h>
@@ -390,7 +391,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
BUG_ON(error);
if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
- nfsi->change_attr++;
+ inode->i_version++;
set_bit(PG_MAPPED, &req->wb_flags);
SetPagePrivate(req->wb_page);
set_page_private(req->wb_page, (unsigned long)req);
@@ -428,7 +429,6 @@ static void
nfs_mark_request_dirty(struct nfs_page *req)
{
__set_page_dirty_nobuffers(req->wb_page);
- __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
}
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -762,6 +762,8 @@ int nfs_updatepage(struct file *file, struct page *page,
status = nfs_writepage_setup(ctx, page, offset, count);
if (status < 0)
nfs_set_pageerror(page);
+ else
+ __set_page_dirty_nobuffers(page);
dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
status, (long long)i_size_read(inode));
@@ -1010,7 +1012,6 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *r
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
nfs_list_add_request(req, &data->pages);
- ClearPageError(req->wb_page);
*pages++ = req->wb_page;
}
req = nfs_list_entry(data->pages.next);
@@ -1165,7 +1166,13 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
static void nfs_writeback_release_full(void *calldata)
{
struct nfs_write_data *data = calldata;
- int status = data->task.tk_status;
+ int ret, status = data->task.tk_status;
+ struct nfs_pageio_descriptor pgio;
+
+ if (data->pnfs_error) {
+ nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
+ pgio.pg_recoalesce = 1;
+ }
/* Update attributes as result of writeback. */
while (!list_empty(&data->pages)) {
@@ -1181,6 +1188,11 @@ static void nfs_writeback_release_full(void *calldata)
req->wb_bytes,
(long long)req_offset(req));
+ if (data->pnfs_error) {
+ dprintk(", pnfs error = %d\n", data->pnfs_error);
+ goto next;
+ }
+
if (status < 0) {
nfs_set_pageerror(page);
nfs_context_set_write_error(req->wb_context, status);
@@ -1200,7 +1212,19 @@ remove_request:
next:
nfs_clear_page_tag_locked(req);
nfs_end_page_writeback(page);
+ if (data->pnfs_error) {
+ lock_page(page);
+ nfs_pageio_cond_complete(&pgio, page->index);
+ ret = nfs_page_async_flush(&pgio, page, 0);
+ if (ret) {
+ nfs_set_pageerror(page);
+ dprintk("rewrite to MDS error = %d\n", ret);
+ }
+ unlock_page(page);
+ }
}
+ if (data->pnfs_error)
+ nfs_pageio_complete(&pgio);
nfs_writedata_release(calldata);
}
@@ -1220,7 +1244,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
{
struct nfs_writeargs *argp = &data->args;
struct nfs_writeres *resp = &data->res;
- struct nfs_server *server = NFS_SERVER(data->inode);
int status;
dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
@@ -1254,7 +1277,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
if (time_before(complain, jiffies)) {
dprintk("NFS: faulty NFS server %s:"
" (committed = %d) != (stable = %d)\n",
- server->nfs_client->cl_hostname,
+ NFS_SERVER(data->inode)->nfs_client->cl_hostname,
resp->verf->committed, argp->stable);
complain = jiffies + 300 * HZ;
}
@@ -1281,7 +1304,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
*/
argp->stable = NFS_FILE_SYNC;
}
- nfs_restart_rpc(task, server->nfs_client);
+ rpc_restart_call_prepare(task);
return;
}
if (time_before(complain, jiffies)) {
@@ -1553,6 +1576,10 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
int flags = FLUSH_SYNC;
int ret = 0;
+ /* no commits means nothing needs to be done */
+ if (!nfsi->ncommit)
+ return ret;
+
if (wbc->sync_mode == WB_SYNC_NONE) {
/* Don't commit yet if this is a non-blocking flush and there
* are a lot of outstanding writes for this mapping.
@@ -1686,34 +1713,20 @@ out_error:
int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
struct page *page)
{
- struct nfs_page *req;
- int ret;
+ /*
+ * If PagePrivate is set, then the page is currently associated with
+ * an in-progress read or write request. Don't try to migrate it.
+ *
+ * FIXME: we could do this in principle, but we'll need a way to ensure
+ * that we can safely release the inode reference while holding
+ * the page lock.
+ */
+ if (PagePrivate(page))
+ return -EBUSY;
nfs_fscache_release_page(page, GFP_KERNEL);
- req = nfs_find_and_lock_request(page, false);
- ret = PTR_ERR(req);
- if (IS_ERR(req))
- goto out;
-
- ret = migrate_page(mapping, newpage, page);
- if (!req)
- goto out;
- if (ret)
- goto out_unlock;
- page_cache_get(newpage);
- spin_lock(&mapping->host->i_lock);
- req->wb_page = newpage;
- SetPagePrivate(newpage);
- set_page_private(newpage, (unsigned long)req);
- ClearPagePrivate(page);
- set_page_private(page, 0);
- spin_unlock(&mapping->host->i_lock);
- page_cache_release(page);
-out_unlock:
- nfs_clear_page_tag_locked(req);
-out:
- return ret;
+ return migrate_page(mapping, newpage, page);
}
#endif
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index f4cc1e2bfc5..62f3b9074e8 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -16,7 +16,6 @@
#include <linux/module.h>
#include <linux/exportfs.h>
-#include <linux/nfsd/syscall.h>
#include <net/ipv6.h>
#include "nfsd.h"
@@ -318,7 +317,6 @@ static void svc_export_put(struct kref *ref)
struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
path_put(&exp->ex_path);
auth_domain_put(exp->ex_client);
- kfree(exp->ex_pathname);
nfsd4_fslocs_free(&exp->ex_fslocs);
kfree(exp);
}
@@ -528,11 +526,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
exp.ex_client = dom;
- err = -ENOMEM;
- exp.ex_pathname = kstrdup(buf, GFP_KERNEL);
- if (!exp.ex_pathname)
- goto out2;
-
/* expiry */
err = -EINVAL;
exp.h.expiry_time = get_expiry(&mesg);
@@ -613,8 +606,6 @@ out4:
nfsd4_fslocs_free(&exp.ex_fslocs);
kfree(exp.ex_uuid);
out3:
- kfree(exp.ex_pathname);
-out2:
path_put(&exp.ex_path);
out1:
auth_domain_put(dom);
@@ -678,7 +669,6 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_client = item->ex_client;
new->ex_path.dentry = dget(item->ex_path.dentry);
new->ex_path.mnt = mntget(item->ex_path.mnt);
- new->ex_pathname = NULL;
new->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = 0;
@@ -696,8 +686,6 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
new->ex_fsid = item->ex_fsid;
new->ex_uuid = item->ex_uuid;
item->ex_uuid = NULL;
- new->ex_pathname = item->ex_pathname;
- item->ex_pathname = NULL;
new->ex_fslocs.locations = item->ex_fslocs.locations;
item->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = item->ex_fslocs.locations_count;
@@ -1010,7 +998,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
return exp;
}
-static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp)
+struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp)
{
u32 fsidv[2];
@@ -1030,7 +1018,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
struct svc_export *exp;
__be32 rv;
- exp = find_fsidzero_export(rqstp);
+ exp = rqst_find_fsidzero_export(rqstp);
if (IS_ERR(exp))
return nfserrno(PTR_ERR(exp));
rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index ad88f1c0a4c..9c51aff02ae 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,6 +36,7 @@
#include <linux/slab.h>
#include <linux/nfs_fs.h>
+#include <linux/export.h>
#include "acl.h"
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 02eb4edf0ec..7748d6a18d9 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -39,6 +39,8 @@
#define NFSDDBG_FACILITY NFSDDBG_PROC
+static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
+
#define NFSPROC4_CB_NULL 0
#define NFSPROC4_CB_COMPOUND 1
@@ -351,7 +353,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr,
__be32 *p;
encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
- encode_stateid4(xdr, &dp->dl_stateid);
+ encode_stateid4(xdr, &dp->dl_stid.sc_stateid);
p = xdr_reserve_space(xdr, 4);
*p++ = xdr_zero; /* truncate */
@@ -460,6 +462,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
*/
status = 0;
out:
+ if (status)
+ nfsd4_mark_cb_fault(cb->cb_clp, status);
return status;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -686,6 +690,12 @@ static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
warn_no_callback_path(clp, reason);
}
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
+{
+ clp->cl_cb_state = NFSD4_CB_FAULT;
+ warn_no_callback_path(clp, reason);
+}
+
static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
{
struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
@@ -787,7 +797,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
u32 minorversion = clp->cl_minorversion;
cb->cb_minorversion = minorversion;
@@ -809,7 +819,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
dprintk("%s: minorversion=%d\n", __func__,
clp->cl_minorversion);
@@ -832,7 +842,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
nfsd4_cb_done(task, calldata);
@@ -1006,7 +1016,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
void nfsd4_cb_recall(struct nfs4_delegation *dp)
{
struct nfsd4_callback *cb = &dp->dl_recall;
- struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
dp->dl_retries = 1;
cb->cb_op = dp;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e8077766661..fa383361bc6 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -35,6 +35,7 @@
#include <linux/file.h>
#include <linux/slab.h>
+#include "idmap.h"
#include "cache.h"
#include "xdr4.h"
#include "vfs.h"
@@ -156,6 +157,8 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
return nfserr_inval;
+ accmode |= NFSD_MAY_READ_IF_EXEC;
+
if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
accmode |= NFSD_MAY_READ;
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
@@ -168,12 +171,29 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
return status;
}
+static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
+{
+ umode_t mode = fh->fh_dentry->d_inode->i_mode;
+
+ if (S_ISREG(mode))
+ return nfs_ok;
+ if (S_ISDIR(mode))
+ return nfserr_isdir;
+ /*
+ * Using err_symlink as our catch-all case may look odd; but
+ * there's no other obvious error for this case in 4.0, and we
+ * happen to know that it will cause the linux v4 client to do
+ * the right thing on attempts to open something other than a
+ * regular file.
+ */
+ return nfserr_symlink;
+}
+
static __be32
do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
struct svc_fh resfh;
__be32 status;
- int created = 0;
fh_init(&resfh, NFS4_FHSIZE);
open->op_truncate = 0;
@@ -202,7 +222,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
open->op_fname.len, &open->op_iattr,
&resfh, open->op_createmode,
(u32 *)open->op_verf.data,
- &open->op_truncate, &created);
+ &open->op_truncate, &open->op_created);
/*
* Following rfc 3530 14.2.16, use the returned bitmask
@@ -216,6 +236,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
status = nfsd_lookup(rqstp, current_fh,
open->op_fname.data, open->op_fname.len, &resfh);
fh_unlock(current_fh);
+ if (status)
+ goto out;
+ status = nfsd_check_obj_isreg(&resfh);
}
if (status)
goto out;
@@ -227,9 +250,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
fh_dup2(current_fh, &resfh);
/* set reply cache */
- fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+ fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
&resfh.fh_handle);
- if (!created)
+ if (!open->op_created)
status = do_open_permission(rqstp, current_fh, open,
NFSD_MAY_NOP);
@@ -254,7 +277,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
/* set replay cache */
- fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+ fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
&current_fh->fh_handle);
open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
@@ -283,14 +306,18 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfsd4_compoundres *resp;
- dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
+ dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
(int)open->op_fname.len, open->op_fname.data,
- open->op_stateowner);
+ open->op_openowner);
/* This check required by spec. */
if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
return nfserr_inval;
+ /* We don't yet support WANT bits: */
+ open->op_share_access &= NFS4_SHARE_ACCESS_MASK;
+
+ open->op_created = 0;
/*
* RFC5661 18.51.3
* Before RECLAIM_COMPLETE done, server should deny new lock
@@ -309,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
resp = rqstp->rq_resp;
status = nfsd4_process_open1(&resp->cstate, open);
if (status == nfserr_replay_me) {
- struct nfs4_replay *rp = &open->op_stateowner->so_replay;
+ struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
fh_put(&cstate->current_fh);
fh_copy_shallow(&cstate->current_fh.fh_handle,
&rp->rp_openfh);
@@ -339,32 +366,23 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
case NFS4_OPEN_CLAIM_NULL:
- /*
- * (1) set CURRENT_FH to the file being opened,
- * creating it if necessary, (2) set open->op_cinfo,
- * (3) set open->op_truncate if the file is to be
- * truncated after opening, (4) do permission checking.
- */
status = do_open_lookup(rqstp, &cstate->current_fh,
open);
if (status)
goto out;
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
- open->op_stateowner->so_confirmed = 1;
- /*
- * The CURRENT_FH is already set to the file being
- * opened. (1) set open->op_cinfo, (2) set
- * open->op_truncate if the file is to be truncated
- * after opening, (3) do permission checking.
- */
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
status = do_open_fhandle(rqstp, &cstate->current_fh,
open);
if (status)
goto out;
break;
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
case NFS4_OPEN_CLAIM_DELEGATE_PREV:
- open->op_stateowner->so_confirmed = 1;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
dprintk("NFSD: unsupported OPEN claim type %d\n",
open->op_claim_type);
status = nfserr_notsupp;
@@ -381,12 +399,13 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* set, (2) sets open->op_stateid, (3) sets open->op_delegation.
*/
status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
+ WARN_ON(status && open->op_created);
out:
- if (open->op_stateowner) {
- nfs4_get_stateowner(open->op_stateowner);
- cstate->replay_owner = open->op_stateowner;
- }
- nfs4_unlock_state();
+ nfsd4_cleanup_open_state(open, status);
+ if (open->op_openowner)
+ cstate->replay_owner = &open->op_openowner->oo_owner;
+ else
+ nfs4_unlock_state();
return status;
}
@@ -467,17 +486,12 @@ static __be32
nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_commit *commit)
{
- __be32 status;
-
u32 *p = (u32 *)commit->co_verf.data;
*p++ = nfssvc_boot.tv_sec;
*p++ = nfssvc_boot.tv_usec;
- status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
+ return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
commit->co_count);
- if (status == nfserr_symlink)
- status = nfserr_inval;
- return status;
}
static __be32
@@ -492,8 +506,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR,
NFSD_MAY_CREATE);
- if (status == nfserr_symlink)
- status = nfserr_notdir;
if (status)
return status;
@@ -691,7 +703,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
- if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
+ if ((cookie == 1) || (cookie == 2) ||
(cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
return nfserr_bad_cookie;
@@ -719,8 +731,6 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_grace;
status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
remove->rm_name, remove->rm_namelen);
- if (status == nfserr_symlink)
- return nfserr_notdir;
if (!status) {
fh_unlock(&cstate->current_fh);
set_change_info(&remove->rm_cinfo, &cstate->current_fh);
@@ -751,8 +761,6 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
(S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) &&
S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode)))
status = nfserr_exist;
- else if (status == nfserr_symlink)
- status = nfserr_notdir;
if (!status) {
set_change_info(&rename->rn_sinfo, &cstate->current_fh);
@@ -892,8 +900,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
write->wr_bytes_written = cnt;
- if (status == nfserr_symlink)
- status = nfserr_inval;
return status;
}
@@ -930,7 +936,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
count = 4 + (verify->ve_attrlen >> 2);
buf = kmalloc(count << 2, GFP_KERNEL);
if (!buf)
- return nfserr_resource;
+ return nfserr_jukebox;
status = nfsd4_encode_fattr(&cstate->current_fh,
cstate->current_fh.fh_export,
@@ -994,6 +1000,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
void *);
+typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op);
+
enum nfsd4_op_flags {
ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
@@ -1001,13 +1009,15 @@ enum nfsd4_op_flags {
/* For rfc 5661 section 2.6.3.1.1: */
OP_HANDLES_WRONGSEC = 1 << 3,
OP_IS_PUTFH_LIKE = 1 << 4,
-};
-
-struct nfsd4_operation {
- nfsd4op_func op_func;
- u32 op_flags;
- char *op_name;
/*
+ * These are the ops whose result size we estimate before
+ * encoding, to avoid performing an op then not being able to
+ * respond or cache a response. This includes writes and setattrs
+ * as well as the operations usually called "nonidempotent":
+ */
+ OP_MODIFIES_SOMETHING = 1 << 5,
+ /*
+ * Cache compounds containing these ops in the xid-based drc:
* We use the DRC for compounds containing non-idempotent
* operations, *except* those that are 4.1-specific (since
* sessions provide their own EOS), and except for stateful
@@ -1015,7 +1025,15 @@ struct nfsd4_operation {
* (since sequence numbers provide EOS for open, lock, etc in
* the v4.0 case).
*/
- bool op_cacheresult;
+ OP_CACHEME = 1 << 6,
+};
+
+struct nfsd4_operation {
+ nfsd4op_func op_func;
+ u32 op_flags;
+ char *op_name;
+ /* Try to get response size before operation */
+ nfsd4op_rsize op_rsize_bop;
};
static struct nfsd4_operation nfsd4_ops[];
@@ -1062,7 +1080,7 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
bool nfsd4_cache_this_op(struct nfsd4_op *op)
{
- return OPDESC(op)->op_cacheresult;
+ return OPDESC(op)->op_flags & OP_CACHEME;
}
static bool need_wrongsec_check(struct svc_rqst *rqstp)
@@ -1110,6 +1128,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
struct nfsd4_operation *opdesc;
struct nfsd4_compound_state *cstate = &resp->cstate;
int slack_bytes;
+ u32 plen = 0;
__be32 status;
resp->xbuf = &rqstp->rq_res;
@@ -1188,6 +1207,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
goto encode_op;
}
+ /* If op is non-idempotent */
+ if (opdesc->op_flags & OP_MODIFIES_SOMETHING) {
+ plen = opdesc->op_rsize_bop(rqstp, op);
+ op->status = nfsd4_check_resp_size(resp, plen);
+ }
+
+ if (op->status)
+ goto encode_op;
+
if (opdesc->op_func)
op->status = opdesc->op_func(rqstp, cstate, &op->u);
else
@@ -1217,7 +1245,7 @@ encode_op:
be32_to_cpu(status));
if (cstate->replay_owner) {
- nfs4_put_stateowner(cstate->replay_owner);
+ nfs4_unlock_state();
cstate->replay_owner = NULL;
}
/* XXX Ugh, we need to get rid of this kind of special case: */
@@ -1238,6 +1266,144 @@ out:
return status;
}
+#define op_encode_hdr_size (2)
+#define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define op_encode_change_info_maxsz (5)
+#define nfs4_fattr_bitmap_maxsz (4)
+
+#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz)
+
+#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+
+#define op_encode_ace_maxsz (3 + nfs4_owner_maxsz)
+#define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \
+ op_encode_ace_maxsz)
+
+#define op_encode_channel_attrs_maxsz (6 + 1 + 1)
+
+static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
+}
+
+static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_change_info_maxsz
+ + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_change_info_maxsz)
+ * sizeof(__be32);
+}
+
+static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_lock_denied_maxsz)
+ * sizeof(__be32);
+}
+
+static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_stateid_maxsz
+ + op_encode_change_info_maxsz + 1
+ + nfs4_fattr_bitmap_maxsz
+ + op_encode_delegation_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ u32 maxcount = 0, rlen = 0;
+
+ maxcount = svc_max_payload(rqstp);
+ rlen = op->u.read.rd_length;
+
+ if (rlen > maxcount)
+ rlen = maxcount;
+
+ return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen;
+}
+
+static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ u32 rlen = op->u.readdir.rd_maxcount;
+
+ if (rlen > PAGE_SIZE)
+ rlen = PAGE_SIZE;
+
+ return (op_encode_hdr_size + op_encode_verifier_maxsz)
+ * sizeof(__be32) + rlen;
+}
+
+static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_change_info_maxsz)
+ * sizeof(__be32);
+}
+
+static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_change_info_maxsz
+ + op_encode_change_info_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
+ 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\
+ 2 + /*eir_server_owner.so_minor_id */\
+ /* eir_server_owner.so_major_id<> */\
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
+ /* eir_server_scope<> */\
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
+ 1 + /* eir_server_impl_id array length */\
+ 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\
+ 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\
+ 2 + /* csr_sequence, csr_flags */\
+ op_encode_channel_attrs_maxsz + \
+ op_encode_channel_attrs_maxsz) * sizeof(__be32);
+}
+
static struct nfsd4_operation nfsd4_ops[] = {
[OP_ACCESS] = {
.op_func = (nfsd4op_func)nfsd4_access,
@@ -1245,20 +1411,27 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_CLOSE] = {
.op_func = (nfsd4op_func)nfsd4_close,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_CLOSE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
},
[OP_COMMIT] = {
.op_func = (nfsd4op_func)nfsd4_commit,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_COMMIT",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize,
},
[OP_CREATE] = {
.op_func = (nfsd4op_func)nfsd4_create,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_CREATE",
- .op_cacheresult = true,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize,
},
[OP_DELEGRETURN] = {
.op_func = (nfsd4op_func)nfsd4_delegreturn,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_DELEGRETURN",
+ .op_rsize_bop = nfsd4_only_status_rsize,
},
[OP_GETATTR] = {
.op_func = (nfsd4op_func)nfsd4_getattr,
@@ -1271,12 +1444,16 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LINK] = {
.op_func = (nfsd4op_func)nfsd4_link,
+ .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING
+ | OP_CACHEME,
.op_name = "OP_LINK",
- .op_cacheresult = true,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize,
},
[OP_LOCK] = {
.op_func = (nfsd4op_func)nfsd4_lock,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_LOCK",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize,
},
[OP_LOCKT] = {
.op_func = (nfsd4op_func)nfsd4_lockt,
@@ -1284,7 +1461,9 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LOCKU] = {
.op_func = (nfsd4op_func)nfsd4_locku,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_LOCKU",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
},
[OP_LOOKUP] = {
.op_func = (nfsd4op_func)nfsd4_lookup,
@@ -1302,42 +1481,54 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_OPEN] = {
.op_func = (nfsd4op_func)nfsd4_open,
- .op_flags = OP_HANDLES_WRONGSEC,
+ .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
.op_name = "OP_OPEN",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize,
},
[OP_OPEN_CONFIRM] = {
.op_func = (nfsd4op_func)nfsd4_open_confirm,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_OPEN_CONFIRM",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
},
[OP_OPEN_DOWNGRADE] = {
.op_func = (nfsd4op_func)nfsd4_open_downgrade,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_OPEN_DOWNGRADE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
},
[OP_PUTFH] = {
.op_func = (nfsd4op_func)nfsd4_putfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
.op_name = "OP_PUTFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_PUTPUBFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
.op_name = "OP_PUTPUBFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_PUTROOTFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
.op_name = "OP_PUTROOTFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_READ] = {
.op_func = (nfsd4op_func)nfsd4_read,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_READ",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize,
},
[OP_READDIR] = {
.op_func = (nfsd4op_func)nfsd4_readdir,
+ .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_READDIR",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize,
},
[OP_READLINK] = {
.op_func = (nfsd4op_func)nfsd4_readlink,
@@ -1345,29 +1536,36 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_REMOVE] = {
.op_func = (nfsd4op_func)nfsd4_remove,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_REMOVE",
- .op_cacheresult = true,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize,
},
[OP_RENAME] = {
- .op_name = "OP_RENAME",
.op_func = (nfsd4op_func)nfsd4_rename,
- .op_cacheresult = true,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_RENAME",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize,
},
[OP_RENEW] = {
.op_func = (nfsd4op_func)nfsd4_renew,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_RENEW",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+
},
[OP_RESTOREFH] = {
.op_func = (nfsd4op_func)nfsd4_restorefh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE,
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
.op_name = "OP_RESTOREFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_SAVEFH] = {
.op_func = (nfsd4op_func)nfsd4_savefh,
- .op_flags = OP_HANDLES_WRONGSEC,
+ .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
.op_name = "OP_SAVEFH",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_SECINFO] = {
.op_func = (nfsd4op_func)nfsd4_secinfo,
@@ -1377,19 +1575,22 @@ static struct nfsd4_operation nfsd4_ops[] = {
[OP_SETATTR] = {
.op_func = (nfsd4op_func)nfsd4_setattr,
.op_name = "OP_SETATTR",
- .op_cacheresult = true,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize,
},
[OP_SETCLIENTID] = {
.op_func = (nfsd4op_func)nfsd4_setclientid,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_SETCLIENTID",
- .op_cacheresult = true,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize,
},
[OP_SETCLIENTID_CONFIRM] = {
.op_func = (nfsd4op_func)nfsd4_setclientid_confirm,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_SETCLIENTID_CONFIRM",
- .op_cacheresult = true,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_VERIFY] = {
.op_func = (nfsd4op_func)nfsd4_verify,
@@ -1397,35 +1598,46 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_WRITE] = {
.op_func = (nfsd4op_func)nfsd4_write,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_WRITE",
- .op_cacheresult = true,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
},
[OP_RELEASE_LOCKOWNER] = {
.op_func = (nfsd4op_func)nfsd4_release_lockowner,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_RELEASE_LOCKOWNER",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
/* NFSv4.1 operations */
[OP_EXCHANGE_ID] = {
.op_func = (nfsd4op_func)nfsd4_exchange_id,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_EXCHANGE_ID",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize,
},
[OP_BIND_CONN_TO_SESSION] = {
.op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_BIND_CONN_TO_SESSION",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize,
},
[OP_CREATE_SESSION] = {
.op_func = (nfsd4op_func)nfsd4_create_session,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_CREATE_SESSION",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize,
},
[OP_DESTROY_SESSION] = {
.op_func = (nfsd4op_func)nfsd4_destroy_session,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_DESTROY_SESSION",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_SEQUENCE] = {
.op_func = (nfsd4op_func)nfsd4_sequence,
@@ -1433,14 +1645,17 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_SEQUENCE",
},
[OP_DESTROY_CLIENTID] = {
- .op_func = NULL,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_func = (nfsd4op_func)nfsd4_destroy_clientid,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
.op_name = "OP_DESTROY_CLIENTID",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_RECLAIM_COMPLETE] = {
.op_func = (nfsd4op_func)nfsd4_reclaim_complete,
- .op_flags = ALLOWED_WITHOUT_FH,
+ .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
.op_name = "OP_RECLAIM_COMPLETE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_SECINFO_NO_NAME] = {
.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
@@ -1454,8 +1669,9 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_FREE_STATEID] = {
.op_func = (nfsd4op_func)nfsd4_free_stateid,
- .op_flags = ALLOWED_WITHOUT_FH,
+ .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
.op_name = "OP_FREE_STATEID",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
};
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 29d77f60585..ed083b9a731 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -45,6 +45,7 @@
/* Globals */
static struct file *rec_file;
+static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
static int
nfs4_save_creds(const struct cred **original_creds)
@@ -88,7 +89,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
struct xdr_netobj cksum;
struct hash_desc desc;
struct scatterlist sg;
- __be32 status = nfserr_resource;
+ __be32 status = nfserr_jukebox;
dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
clname->len, clname->data);
@@ -129,6 +130,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
if (!rec_file || clp->cl_firststate)
return 0;
+ clp->cl_firststate = 1;
status = nfs4_save_creds(&original_cred);
if (status < 0)
return status;
@@ -143,10 +145,8 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
goto out_unlock;
}
status = -EEXIST;
- if (dentry->d_inode) {
- dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
+ if (dentry->d_inode)
goto out_put;
- }
status = mnt_want_write(rec_file->f_path.mnt);
if (status)
goto out_put;
@@ -156,12 +156,14 @@ out_put:
dput(dentry);
out_unlock:
mutex_unlock(&dir->d_inode->i_mutex);
- if (status == 0) {
- clp->cl_firststate = 1;
+ if (status == 0)
vfs_fsync(rec_file, 0);
- }
+ else
+ printk(KERN_ERR "NFSD: failed to write recovery record"
+ " (err %d); please check that %s exists"
+ " and is writeable", status,
+ user_recovery_dirname);
nfs4_reset_creds(original_cred);
- dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
return status;
}
@@ -354,13 +356,13 @@ nfsd4_recdir_load(void) {
*/
void
-nfsd4_init_recdir(char *rec_dirname)
+nfsd4_init_recdir()
{
const struct cred *original_cred;
int status;
printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
- rec_dirname);
+ user_recovery_dirname);
BUG_ON(rec_file);
@@ -372,10 +374,10 @@ nfsd4_init_recdir(char *rec_dirname)
return;
}
- rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
+ rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
if (IS_ERR(rec_file)) {
printk("NFSD: unable to find recovery directory %s\n",
- rec_dirname);
+ user_recovery_dirname);
rec_file = NULL;
}
@@ -390,3 +392,30 @@ nfsd4_shutdown_recdir(void)
fput(rec_file);
rec_file = NULL;
}
+
+/*
+ * Change the NFSv4 recovery directory to recdir.
+ */
+int
+nfs4_reset_recoverydir(char *recdir)
+{
+ int status;
+ struct path path;
+
+ status = kern_path(recdir, LOOKUP_FOLLOW, &path);
+ if (status)
+ return status;
+ status = -ENOTDIR;
+ if (S_ISDIR(path.dentry->d_inode->i_mode)) {
+ strcpy(user_recovery_dirname, recdir);
+ status = 0;
+ }
+ path_put(&path);
+ return status;
+}
+
+char *
+nfs4_recoverydir(void)
+{
+ return user_recovery_dirname;
+}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3787ec11740..47e94e33a97 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,9 +49,6 @@
time_t nfsd4_lease = 90; /* default lease time */
time_t nfsd4_grace = 90;
static time_t boot_time;
-static u32 current_ownerid = 1;
-static u32 current_fileid = 1;
-static u32 current_delegid = 1;
static stateid_t zerostateid; /* bits all 0 */
static stateid_t onestateid; /* bits all 1 */
static u64 current_sessionid = 1;
@@ -60,13 +57,7 @@ static u64 current_sessionid = 1;
#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
/* forward declarations */
-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
-static struct nfs4_stateid * search_for_stateid(stateid_t *stid);
-static struct nfs4_delegation * search_for_delegation(stateid_t *stid);
-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
-static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
-static void nfs4_set_recdir(char *recdir);
-static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner);
+static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner);
/* Locking: */
@@ -80,7 +71,8 @@ static DEFINE_MUTEX(client_mutex);
*/
static DEFINE_SPINLOCK(recall_lock);
-static struct kmem_cache *stateowner_slab = NULL;
+static struct kmem_cache *openowner_slab = NULL;
+static struct kmem_cache *lockowner_slab = NULL;
static struct kmem_cache *file_slab = NULL;
static struct kmem_cache *stateid_slab = NULL;
static struct kmem_cache *deleg_slab = NULL;
@@ -112,6 +104,11 @@ opaque_hashval(const void *ptr, int nbytes)
static struct list_head del_recall_lru;
+static void nfsd4_free_file(struct nfs4_file *f)
+{
+ kmem_cache_free(file_slab, f);
+}
+
static inline void
put_nfs4_file(struct nfs4_file *fi)
{
@@ -119,7 +116,7 @@ put_nfs4_file(struct nfs4_file *fi)
list_del(&fi->fi_hash);
spin_unlock(&recall_lock);
iput(fi->fi_inode);
- kmem_cache_free(file_slab, fi);
+ nfsd4_free_file(fi);
}
}
@@ -136,35 +133,33 @@ unsigned int max_delegations;
* Open owner state (share locks)
*/
-/* hash tables for nfs4_stateowner */
-#define OWNER_HASH_BITS 8
-#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
-#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
+/* hash tables for open owners */
+#define OPEN_OWNER_HASH_BITS 8
+#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS)
+#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1)
-#define ownerid_hashval(id) \
- ((id) & OWNER_HASH_MASK)
-#define ownerstr_hashval(clientid, ownername) \
- (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK)
+static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
+{
+ unsigned int ret;
-static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE];
-static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
+ ret = opaque_hashval(ownername->data, ownername->len);
+ ret += clientid;
+ return ret & OPEN_OWNER_HASH_MASK;
+}
+
+static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE];
/* hash table for nfs4_file */
#define FILE_HASH_BITS 8
#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
-/* hash table for (open)nfs4_stateid */
-#define STATEID_HASH_BITS 10
-#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS)
-#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1)
-
-#define file_hashval(x) \
- hash_ptr(x, FILE_HASH_BITS)
-#define stateid_hashval(owner_id, file_id) \
- (((owner_id) + (file_id)) & STATEID_HASH_MASK)
+static unsigned int file_hashval(struct inode *ino)
+{
+ /* XXX: why are we hashing on inode pointer, anyway? */
+ return hash_ptr(ino, FILE_HASH_BITS);
+}
static struct list_head file_hashtbl[FILE_HASH_SIZE];
-static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
{
@@ -192,8 +187,15 @@ static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
{
if (atomic_dec_and_test(&fp->fi_access[oflag])) {
- nfs4_file_put_fd(fp, O_RDWR);
nfs4_file_put_fd(fp, oflag);
+ /*
+ * It's also safe to get rid of the RDWR open *if*
+ * we no longer have need of the other kind of access
+ * or if we already have the other kind of open:
+ */
+ if (fp->fi_fds[1-oflag]
+ || atomic_read(&fp->fi_access[1 - oflag]) == 0)
+ nfs4_file_put_fd(fp, O_RDWR);
}
}
@@ -206,8 +208,73 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
__nfs4_file_put_access(fp, oflag);
}
+static inline int get_new_stid(struct nfs4_stid *stid)
+{
+ static int min_stateid = 0;
+ struct idr *stateids = &stid->sc_client->cl_stateids;
+ int new_stid;
+ int error;
+
+ error = idr_get_new_above(stateids, stid, min_stateid, &new_stid);
+ /*
+ * Note: the necessary preallocation was done in
+ * nfs4_alloc_stateid(). The idr code caps the number of
+ * preallocations that can exist at a time, but the state lock
+ * prevents anyone from using ours before we get here:
+ */
+ BUG_ON(error);
+ /*
+ * It shouldn't be a problem to reuse an opaque stateid value.
+ * I don't think it is for 4.1. But with 4.0 I worry that, for
+ * example, a stray write retransmission could be accepted by
+ * the server when it should have been rejected. Therefore,
+ * adopt a trick from the sctp code to attempt to maximize the
+ * amount of time until an id is reused, by ensuring they always
+ * "increase" (mod INT_MAX):
+ */
+
+ min_stateid = new_stid+1;
+ if (min_stateid == INT_MAX)
+ min_stateid = 0;
+ return new_stid;
+}
+
+static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type)
+{
+ stateid_t *s = &stid->sc_stateid;
+ int new_id;
+
+ stid->sc_type = type;
+ stid->sc_client = cl;
+ s->si_opaque.so_clid = cl->cl_clientid;
+ new_id = get_new_stid(stid);
+ s->si_opaque.so_id = (u32)new_id;
+ /* Will be incremented before return to client: */
+ s->si_generation = 0;
+}
+
+static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab)
+{
+ struct idr *stateids = &cl->cl_stateids;
+
+ if (!idr_pre_get(stateids, GFP_KERNEL))
+ return NULL;
+ /*
+ * Note: if we fail here (or any time between now and the time
+ * we actually get the new idr), we won't need to undo the idr
+ * preallocation, since the idr code caps the number of
+ * preallocated entries.
+ */
+ return kmem_cache_alloc(slab, GFP_KERNEL);
+}
+
+static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
+{
+ return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
+}
+
static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
+alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type)
{
struct nfs4_delegation *dp;
struct nfs4_file *fp = stp->st_file;
@@ -224,21 +291,23 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
return NULL;
if (num_delegations > max_delegations)
return NULL;
- dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL);
+ dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
if (dp == NULL)
return dp;
+ init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID);
+ /*
+ * delegation seqid's are never incremented. The 4.1 special
+ * meaning of seqid 0 isn't meaningful, really, but let's avoid
+ * 0 anyway just for consistency and use 1:
+ */
+ dp->dl_stid.sc_stateid.si_generation = 1;
num_delegations++;
INIT_LIST_HEAD(&dp->dl_perfile);
INIT_LIST_HEAD(&dp->dl_perclnt);
INIT_LIST_HEAD(&dp->dl_recall_lru);
- dp->dl_client = clp;
get_nfs4_file(fp);
dp->dl_file = fp;
dp->dl_type = type;
- dp->dl_stateid.si_boot = boot_time;
- dp->dl_stateid.si_stateownerid = current_delegid++;
- dp->dl_stateid.si_fileid = 0;
- dp->dl_stateid.si_generation = 0;
fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
dp->dl_time = 0;
atomic_set(&dp->dl_count, 1);
@@ -267,10 +336,18 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
}
}
+static void unhash_stid(struct nfs4_stid *s)
+{
+ struct idr *stateids = &s->sc_client->cl_stateids;
+
+ idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
+}
+
/* Called under the state lock. */
static void
unhash_delegation(struct nfs4_delegation *dp)
{
+ unhash_stid(&dp->dl_stid);
list_del_init(&dp->dl_perclnt);
spin_lock(&recall_lock);
list_del_init(&dp->dl_perfile);
@@ -292,10 +369,16 @@ static DEFINE_SPINLOCK(client_lock);
#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
-#define clientid_hashval(id) \
- ((id) & CLIENT_HASH_MASK)
-#define clientstr_hashval(name) \
- (opaque_hashval((name), 8) & CLIENT_HASH_MASK)
+static unsigned int clientid_hashval(u32 id)
+{
+ return id & CLIENT_HASH_MASK;
+}
+
+static unsigned int clientstr_hashval(const char *name)
+{
+ return opaque_hashval(name, 8) & CLIENT_HASH_MASK;
+}
+
/*
* reclaim_str_hashtbl[] holds known client info from previous reset/reboot
* used in reboot/reset lease grace period processing
@@ -362,7 +445,7 @@ set_deny(unsigned int *deny, unsigned long bmap) {
}
static int
-test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
+test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
unsigned int access, deny;
set_access(&access, stp->st_access_bmap);
@@ -385,14 +468,13 @@ static int nfs4_access_to_omode(u32 access)
BUG();
}
-static void unhash_generic_stateid(struct nfs4_stateid *stp)
+static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
{
- list_del(&stp->st_hash);
list_del(&stp->st_perfile);
list_del(&stp->st_perstateowner);
}
-static void free_generic_stateid(struct nfs4_stateid *stp)
+static void close_generic_stateid(struct nfs4_ol_stateid *stp)
{
int i;
@@ -401,84 +483,106 @@ static void free_generic_stateid(struct nfs4_stateid *stp)
if (test_bit(i, &stp->st_access_bmap))
nfs4_file_put_access(stp->st_file,
nfs4_access_to_omode(i));
+ __clear_bit(i, &stp->st_access_bmap);
}
}
put_nfs4_file(stp->st_file);
+ stp->st_file = NULL;
+}
+
+static void free_generic_stateid(struct nfs4_ol_stateid *stp)
+{
kmem_cache_free(stateid_slab, stp);
}
-static void release_lock_stateid(struct nfs4_stateid *stp)
+static void release_lock_stateid(struct nfs4_ol_stateid *stp)
{
struct file *file;
unhash_generic_stateid(stp);
+ unhash_stid(&stp->st_stid);
file = find_any_file(stp->st_file);
if (file)
- locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
+ locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner));
+ close_generic_stateid(stp);
free_generic_stateid(stp);
}
-static void unhash_lockowner(struct nfs4_stateowner *sop)
+static void unhash_lockowner(struct nfs4_lockowner *lo)
{
- struct nfs4_stateid *stp;
+ struct nfs4_ol_stateid *stp;
- list_del(&sop->so_idhash);
- list_del(&sop->so_strhash);
- list_del(&sop->so_perstateid);
- while (!list_empty(&sop->so_stateids)) {
- stp = list_first_entry(&sop->so_stateids,
- struct nfs4_stateid, st_perstateowner);
+ list_del(&lo->lo_owner.so_strhash);
+ list_del(&lo->lo_perstateid);
+ while (!list_empty(&lo->lo_owner.so_stateids)) {
+ stp = list_first_entry(&lo->lo_owner.so_stateids,
+ struct nfs4_ol_stateid, st_perstateowner);
release_lock_stateid(stp);
}
}
-static void release_lockowner(struct nfs4_stateowner *sop)
+static void release_lockowner(struct nfs4_lockowner *lo)
{
- unhash_lockowner(sop);
- nfs4_put_stateowner(sop);
+ unhash_lockowner(lo);
+ nfs4_free_lockowner(lo);
}
static void
-release_stateid_lockowners(struct nfs4_stateid *open_stp)
+release_stateid_lockowners(struct nfs4_ol_stateid *open_stp)
{
- struct nfs4_stateowner *lock_sop;
+ struct nfs4_lockowner *lo;
while (!list_empty(&open_stp->st_lockowners)) {
- lock_sop = list_entry(open_stp->st_lockowners.next,
- struct nfs4_stateowner, so_perstateid);
- /* list_del(&open_stp->st_lockowners); */
- BUG_ON(lock_sop->so_is_open_owner);
- release_lockowner(lock_sop);
+ lo = list_entry(open_stp->st_lockowners.next,
+ struct nfs4_lockowner, lo_perstateid);
+ release_lockowner(lo);
}
}
-static void release_open_stateid(struct nfs4_stateid *stp)
+static void unhash_open_stateid(struct nfs4_ol_stateid *stp)
{
unhash_generic_stateid(stp);
release_stateid_lockowners(stp);
+ close_generic_stateid(stp);
+}
+
+static void release_open_stateid(struct nfs4_ol_stateid *stp)
+{
+ unhash_open_stateid(stp);
+ unhash_stid(&stp->st_stid);
free_generic_stateid(stp);
}
-static void unhash_openowner(struct nfs4_stateowner *sop)
+static void unhash_openowner(struct nfs4_openowner *oo)
{
- struct nfs4_stateid *stp;
+ struct nfs4_ol_stateid *stp;
- list_del(&sop->so_idhash);
- list_del(&sop->so_strhash);
- list_del(&sop->so_perclient);
- list_del(&sop->so_perstateid); /* XXX: necessary? */
- while (!list_empty(&sop->so_stateids)) {
- stp = list_first_entry(&sop->so_stateids,
- struct nfs4_stateid, st_perstateowner);
+ list_del(&oo->oo_owner.so_strhash);
+ list_del(&oo->oo_perclient);
+ while (!list_empty(&oo->oo_owner.so_stateids)) {
+ stp = list_first_entry(&oo->oo_owner.so_stateids,
+ struct nfs4_ol_stateid, st_perstateowner);
release_open_stateid(stp);
}
}
-static void release_openowner(struct nfs4_stateowner *sop)
+static void release_last_closed_stateid(struct nfs4_openowner *oo)
{
- unhash_openowner(sop);
- list_del(&sop->so_close_lru);
- nfs4_put_stateowner(sop);
+ struct nfs4_ol_stateid *s = oo->oo_last_closed_stid;
+
+ if (s) {
+ unhash_stid(&s->st_stid);
+ free_generic_stateid(s);
+ oo->oo_last_closed_stid = NULL;
+ }
+}
+
+static void release_openowner(struct nfs4_openowner *oo)
+{
+ unhash_openowner(oo);
+ list_del(&oo->oo_close_lru);
+ release_last_closed_stateid(oo);
+ nfs4_free_openowner(oo);
}
#define SESSION_HASH_SIZE 512
@@ -843,9 +947,6 @@ renew_client_locked(struct nfs4_client *clp)
return;
}
- /*
- * Move client to the end to the LRU list.
- */
dprintk("renewing client (clientid %08x/%08x)\n",
clp->cl_clientid.cl_boot,
clp->cl_clientid.cl_id);
@@ -943,7 +1044,7 @@ unhash_client_locked(struct nfs4_client *clp)
static void
expire_client(struct nfs4_client *clp)
{
- struct nfs4_stateowner *sop;
+ struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
struct list_head reaplist;
@@ -961,8 +1062,8 @@ expire_client(struct nfs4_client *clp)
unhash_delegation(dp);
}
while (!list_empty(&clp->cl_openowners)) {
- sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
- release_openowner(sop);
+ oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
+ release_openowner(oo);
}
nfsd4_shutdown_callback(clp);
if (clp->cl_cb_conn.cb_xprt)
@@ -1038,6 +1139,23 @@ static void gen_confirm(struct nfs4_client *clp)
*p++ = i++;
}
+static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
+{
+ return idr_find(&cl->cl_stateids, t->si_opaque.so_id);
+}
+
+static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
+{
+ struct nfs4_stid *s;
+
+ s = find_stateid(cl, t);
+ if (!s)
+ return NULL;
+ if (typemask & s->sc_type)
+ return s;
+ return NULL;
+}
+
static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
struct svc_rqst *rqstp, nfs4_verifier *verf)
{
@@ -1060,6 +1178,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
}
}
+ idr_init(&clp->cl_stateids);
memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
atomic_set(&clp->cl_refcount, 0);
clp->cl_cb_state = NFSD4_CB_UNKNOWN;
@@ -1083,17 +1202,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
return clp;
}
-static int check_name(struct xdr_netobj name)
-{
- if (name.len == 0)
- return 0;
- if (name.len > NFS4_OPAQUE_LIMIT) {
- dprintk("NFSD: check_name: name too long(%d)!\n", name.len);
- return 0;
- }
- return 1;
-}
-
static void
add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
{
@@ -1125,8 +1233,10 @@ find_confirmed_client(clientid_t *clid)
unsigned int idhashval = clientid_hashval(clid->cl_id);
list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
- if (same_clid(&clp->cl_clientid, clid))
+ if (same_clid(&clp->cl_clientid, clid)) {
+ renew_client(clp);
return clp;
+ }
}
return NULL;
}
@@ -1173,20 +1283,6 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
return NULL;
}
-static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
-{
- switch (family) {
- case AF_INET:
- ((struct sockaddr_in *)sa)->sin_family = AF_INET;
- ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr;
- return;
- case AF_INET6:
- ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
- ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6;
- return;
- }
-}
-
static void
gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
{
@@ -1218,7 +1314,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
conn->cb_prog = se->se_callback_prog;
conn->cb_ident = se->se_callback_ident;
- rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr);
+ memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen);
return;
out_err:
conn->cb_addr.ss_family = AF_UNSPEC;
@@ -1350,7 +1446,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
__func__, rqstp, exid, exid->clname.len, exid->clname.data,
addr_str, exid->flags, exid->spa_how);
- if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
+ if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
return nfserr_inval;
/* Currently only support SP4_NONE */
@@ -1849,8 +1945,16 @@ out:
nfsd4_get_session(cstate->session);
atomic_inc(&clp->cl_refcount);
- if (clp->cl_cb_state == NFSD4_CB_DOWN)
- seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
+ switch (clp->cl_cb_state) {
+ case NFSD4_CB_DOWN:
+ seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
+ break;
+ case NFSD4_CB_FAULT:
+ seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
+ break;
+ default:
+ seq->status_flags = 0;
+ }
}
kfree(conn);
spin_unlock(&client_lock);
@@ -1858,6 +1962,50 @@ out:
return status;
}
+static inline bool has_resources(struct nfs4_client *clp)
+{
+ return !list_empty(&clp->cl_openowners)
+ || !list_empty(&clp->cl_delegations)
+ || !list_empty(&clp->cl_sessions);
+}
+
+__be32
+nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
+{
+ struct nfs4_client *conf, *unconf, *clp;
+ int status = 0;
+
+ nfs4_lock_state();
+ unconf = find_unconfirmed_client(&dc->clientid);
+ conf = find_confirmed_client(&dc->clientid);
+
+ if (conf) {
+ clp = conf;
+
+ if (!is_client_expired(conf) && has_resources(conf)) {
+ status = nfserr_clientid_busy;
+ goto out;
+ }
+
+ /* rfc5661 18.50.3 */
+ if (cstate->session && conf == cstate->session->se_client) {
+ status = nfserr_clientid_busy;
+ goto out;
+ }
+ } else if (unconf)
+ clp = unconf;
+ else {
+ status = nfserr_stale_clientid;
+ goto out;
+ }
+
+ expire_client(clp);
+out:
+ nfs4_unlock_state();
+ dprintk("%s return %d\n", __func__, ntohl(status));
+ return status;
+}
+
__be32
nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
{
@@ -1900,19 +2048,13 @@ __be32
nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_setclientid *setclid)
{
- struct xdr_netobj clname = {
- .len = setclid->se_namelen,
- .data = setclid->se_name,
- };
+ struct xdr_netobj clname = setclid->se_name;
nfs4_verifier clverifier = setclid->se_verf;
unsigned int strhashval;
struct nfs4_client *conf, *unconf, *new;
__be32 status;
char dname[HEXDIR_LEN];
- if (!check_name(clname))
- return nfserr_inval;
-
status = nfs4_make_rec_clidname(dname, &clname);
if (status)
return status;
@@ -1946,7 +2088,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* of 5 bullet points, labeled as CASE0 - CASE4 below.
*/
unconf = find_unconfirmed_client_by_str(dname, strhashval);
- status = nfserr_resource;
+ status = nfserr_jukebox;
if (!conf) {
/*
* RFC 3530 14.2.33 CASE 4:
@@ -2116,31 +2258,28 @@ out:
return status;
}
+static struct nfs4_file *nfsd4_alloc_file(void)
+{
+ return kmem_cache_alloc(file_slab, GFP_KERNEL);
+}
+
/* OPEN Share state helper functions */
-static inline struct nfs4_file *
-alloc_init_file(struct inode *ino)
+static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
{
- struct nfs4_file *fp;
unsigned int hashval = file_hashval(ino);
- fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
- if (fp) {
- atomic_set(&fp->fi_ref, 1);
- INIT_LIST_HEAD(&fp->fi_hash);
- INIT_LIST_HEAD(&fp->fi_stateids);
- INIT_LIST_HEAD(&fp->fi_delegations);
- fp->fi_inode = igrab(ino);
- fp->fi_id = current_fileid++;
- fp->fi_had_conflict = false;
- fp->fi_lease = NULL;
- memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
- memset(fp->fi_access, 0, sizeof(fp->fi_access));
- spin_lock(&recall_lock);
- list_add(&fp->fi_hash, &file_hashtbl[hashval]);
- spin_unlock(&recall_lock);
- return fp;
- }
- return NULL;
+ atomic_set(&fp->fi_ref, 1);
+ INIT_LIST_HEAD(&fp->fi_hash);
+ INIT_LIST_HEAD(&fp->fi_stateids);
+ INIT_LIST_HEAD(&fp->fi_delegations);
+ fp->fi_inode = igrab(ino);
+ fp->fi_had_conflict = false;
+ fp->fi_lease = NULL;
+ memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+ memset(fp->fi_access, 0, sizeof(fp->fi_access));
+ spin_lock(&recall_lock);
+ list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+ spin_unlock(&recall_lock);
}
static void
@@ -2155,7 +2294,8 @@ nfsd4_free_slab(struct kmem_cache **slab)
void
nfsd4_free_slabs(void)
{
- nfsd4_free_slab(&stateowner_slab);
+ nfsd4_free_slab(&openowner_slab);
+ nfsd4_free_slab(&lockowner_slab);
nfsd4_free_slab(&file_slab);
nfsd4_free_slab(&stateid_slab);
nfsd4_free_slab(&deleg_slab);
@@ -2164,16 +2304,20 @@ nfsd4_free_slabs(void)
static int
nfsd4_init_slabs(void)
{
- stateowner_slab = kmem_cache_create("nfsd4_stateowners",
- sizeof(struct nfs4_stateowner), 0, 0, NULL);
- if (stateowner_slab == NULL)
+ openowner_slab = kmem_cache_create("nfsd4_openowners",
+ sizeof(struct nfs4_openowner), 0, 0, NULL);
+ if (openowner_slab == NULL)
+ goto out_nomem;
+ lockowner_slab = kmem_cache_create("nfsd4_lockowners",
+ sizeof(struct nfs4_openowner), 0, 0, NULL);
+ if (lockowner_slab == NULL)
goto out_nomem;
file_slab = kmem_cache_create("nfsd4_files",
sizeof(struct nfs4_file), 0, 0, NULL);
if (file_slab == NULL)
goto out_nomem;
stateid_slab = kmem_cache_create("nfsd4_stateids",
- sizeof(struct nfs4_stateid), 0, 0, NULL);
+ sizeof(struct nfs4_ol_stateid), 0, 0, NULL);
if (stateid_slab == NULL)
goto out_nomem;
deleg_slab = kmem_cache_create("nfsd4_delegations",
@@ -2187,97 +2331,94 @@ out_nomem:
return -ENOMEM;
}
-void
-nfs4_free_stateowner(struct kref *kref)
+void nfs4_free_openowner(struct nfs4_openowner *oo)
{
- struct nfs4_stateowner *sop =
- container_of(kref, struct nfs4_stateowner, so_ref);
- kfree(sop->so_owner.data);
- kmem_cache_free(stateowner_slab, sop);
+ kfree(oo->oo_owner.so_owner.data);
+ kmem_cache_free(openowner_slab, oo);
}
-static inline struct nfs4_stateowner *
-alloc_stateowner(struct xdr_netobj *owner)
+void nfs4_free_lockowner(struct nfs4_lockowner *lo)
{
- struct nfs4_stateowner *sop;
+ kfree(lo->lo_owner.so_owner.data);
+ kmem_cache_free(lockowner_slab, lo);
+}
- if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) {
- if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) {
- memcpy(sop->so_owner.data, owner->data, owner->len);
- sop->so_owner.len = owner->len;
- kref_init(&sop->so_ref);
- return sop;
- }
- kmem_cache_free(stateowner_slab, sop);
- }
- return NULL;
+static void init_nfs4_replay(struct nfs4_replay *rp)
+{
+ rp->rp_status = nfserr_serverfault;
+ rp->rp_buflen = 0;
+ rp->rp_buf = rp->rp_ibuf;
}
-static struct nfs4_stateowner *
-alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) {
+static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp)
+{
struct nfs4_stateowner *sop;
- struct nfs4_replay *rp;
- unsigned int idhashval;
- if (!(sop = alloc_stateowner(&open->op_owner)))
+ sop = kmem_cache_alloc(slab, GFP_KERNEL);
+ if (!sop)
+ return NULL;
+
+ sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL);
+ if (!sop->so_owner.data) {
+ kmem_cache_free(slab, sop);
return NULL;
- idhashval = ownerid_hashval(current_ownerid);
- INIT_LIST_HEAD(&sop->so_idhash);
- INIT_LIST_HEAD(&sop->so_strhash);
- INIT_LIST_HEAD(&sop->so_perclient);
+ }
+ sop->so_owner.len = owner->len;
+
INIT_LIST_HEAD(&sop->so_stateids);
- INIT_LIST_HEAD(&sop->so_perstateid); /* not used */
- INIT_LIST_HEAD(&sop->so_close_lru);
- sop->so_time = 0;
- list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]);
- list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]);
- list_add(&sop->so_perclient, &clp->cl_openowners);
- sop->so_is_open_owner = 1;
- sop->so_id = current_ownerid++;
sop->so_client = clp;
- sop->so_seqid = open->op_seqid;
- sop->so_confirmed = 0;
- rp = &sop->so_replay;
- rp->rp_status = nfserr_serverfault;
- rp->rp_buflen = 0;
- rp->rp_buf = rp->rp_ibuf;
+ init_nfs4_replay(&sop->so_replay);
return sop;
}
-static inline void
-init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
- struct nfs4_stateowner *sop = open->op_stateowner;
- unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id);
+static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
+{
+ list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]);
+ list_add(&oo->oo_perclient, &clp->cl_openowners);
+}
- INIT_LIST_HEAD(&stp->st_hash);
- INIT_LIST_HEAD(&stp->st_perstateowner);
+static struct nfs4_openowner *
+alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) {
+ struct nfs4_openowner *oo;
+
+ oo = alloc_stateowner(openowner_slab, &open->op_owner, clp);
+ if (!oo)
+ return NULL;
+ oo->oo_owner.so_is_open_owner = 1;
+ oo->oo_owner.so_seqid = open->op_seqid;
+ oo->oo_flags = NFS4_OO_NEW;
+ oo->oo_time = 0;
+ oo->oo_last_closed_stid = NULL;
+ INIT_LIST_HEAD(&oo->oo_close_lru);
+ hash_openowner(oo, clp, strhashval);
+ return oo;
+}
+
+static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
+ struct nfs4_openowner *oo = open->op_openowner;
+ struct nfs4_client *clp = oo->oo_owner.so_client;
+
+ init_stid(&stp->st_stid, clp, NFS4_OPEN_STID);
INIT_LIST_HEAD(&stp->st_lockowners);
- INIT_LIST_HEAD(&stp->st_perfile);
- list_add(&stp->st_hash, &stateid_hashtbl[hashval]);
- list_add(&stp->st_perstateowner, &sop->so_stateids);
+ list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
list_add(&stp->st_perfile, &fp->fi_stateids);
- stp->st_stateowner = sop;
+ stp->st_stateowner = &oo->oo_owner;
get_nfs4_file(fp);
stp->st_file = fp;
- stp->st_stateid.si_boot = boot_time;
- stp->st_stateid.si_stateownerid = sop->so_id;
- stp->st_stateid.si_fileid = fp->fi_id;
- stp->st_stateid.si_generation = 0;
stp->st_access_bmap = 0;
stp->st_deny_bmap = 0;
- __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
- &stp->st_access_bmap);
+ __set_bit(open->op_share_access, &stp->st_access_bmap);
__set_bit(open->op_share_deny, &stp->st_deny_bmap);
stp->st_openstp = NULL;
}
static void
-move_to_close_lru(struct nfs4_stateowner *sop)
+move_to_close_lru(struct nfs4_openowner *oo)
{
- dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
+ dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
- list_move_tail(&sop->so_close_lru, &close_lru);
- sop->so_time = get_seconds();
+ list_move_tail(&oo->oo_close_lru, &close_lru);
+ oo->oo_time = get_seconds();
}
static int
@@ -2289,14 +2430,18 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
(sop->so_client->cl_clientid.cl_id == clid->cl_id);
}
-static struct nfs4_stateowner *
+static struct nfs4_openowner *
find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
{
- struct nfs4_stateowner *so = NULL;
+ struct nfs4_stateowner *so;
+ struct nfs4_openowner *oo;
- list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
- if (same_owner_str(so, &open->op_owner, &open->op_clientid))
- return so;
+ list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) {
+ if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
+ oo = openowner(so);
+ renew_client(oo->oo_owner.so_client);
+ return oo;
+ }
}
return NULL;
}
@@ -2320,31 +2465,6 @@ find_file(struct inode *ino)
return NULL;
}
-static inline int access_valid(u32 x, u32 minorversion)
-{
- if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
- return 0;
- if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
- return 0;
- x &= ~NFS4_SHARE_ACCESS_MASK;
- if (minorversion && x) {
- if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
- return 0;
- if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
- return 0;
- x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
- }
- if (x)
- return 0;
- return 1;
-}
-
-static inline int deny_valid(u32 x)
-{
- /* Note: unlike access bits, deny bits may be zero. */
- return x <= NFS4_SHARE_DENY_BOTH;
-}
-
/*
* Called to check deny when READ with all zero stateid or
* WRITE with all zero or all one stateid
@@ -2354,7 +2474,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
{
struct inode *ino = current_fh->fh_dentry->d_inode;
struct nfs4_file *fp;
- struct nfs4_stateid *stp;
+ struct nfs4_ol_stateid *stp;
__be32 ret;
dprintk("NFSD: nfs4_share_conflict\n");
@@ -2429,6 +2549,16 @@ static const struct lock_manager_operations nfsd_lease_mng_ops = {
.lm_change = nfsd_change_deleg_cb,
};
+static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid)
+{
+ if (nfsd4_has_session(cstate))
+ return nfs_ok;
+ if (seqid == so->so_seqid - 1)
+ return nfserr_replay_me;
+ if (seqid == so->so_seqid)
+ return nfs_ok;
+ return nfserr_bad_seqid;
+}
__be32
nfsd4_process_open1(struct nfsd4_compound_state *cstate,
@@ -2437,57 +2567,49 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
clientid_t *clientid = &open->op_clientid;
struct nfs4_client *clp = NULL;
unsigned int strhashval;
- struct nfs4_stateowner *sop = NULL;
-
- if (!check_name(open->op_owner))
- return nfserr_inval;
+ struct nfs4_openowner *oo = NULL;
+ __be32 status;
if (STALE_CLIENTID(&open->op_clientid))
return nfserr_stale_clientid;
+ /*
+ * In case we need it later, after we've already created the
+ * file and don't want to risk a further failure:
+ */
+ open->op_file = nfsd4_alloc_file();
+ if (open->op_file == NULL)
+ return nfserr_jukebox;
- strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner);
- sop = find_openstateowner_str(strhashval, open);
- open->op_stateowner = sop;
- if (!sop) {
- /* Make sure the client's lease hasn't expired. */
+ strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner);
+ oo = find_openstateowner_str(strhashval, open);
+ open->op_openowner = oo;
+ if (!oo) {
clp = find_confirmed_client(clientid);
if (clp == NULL)
return nfserr_expired;
- goto renew;
+ goto new_owner;
}
- /* When sessions are used, skip open sequenceid processing */
- if (nfsd4_has_session(cstate))
- goto renew;
- if (!sop->so_confirmed) {
+ if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
/* Replace unconfirmed owners without checking for replay. */
- clp = sop->so_client;
- release_openowner(sop);
- open->op_stateowner = NULL;
- goto renew;
- }
- if (open->op_seqid == sop->so_seqid - 1) {
- if (sop->so_replay.rp_buflen)
- return nfserr_replay_me;
- /* The original OPEN failed so spectacularly
- * that we don't even have replay data saved!
- * Therefore, we have no choice but to continue
- * processing this OPEN; presumably, we'll
- * fail again for the same reason.
- */
- dprintk("nfsd4_process_open1: replay with no replay cache\n");
- goto renew;
- }
- if (open->op_seqid != sop->so_seqid)
- return nfserr_bad_seqid;
-renew:
- if (open->op_stateowner == NULL) {
- sop = alloc_init_open_stateowner(strhashval, clp, open);
- if (sop == NULL)
- return nfserr_resource;
- open->op_stateowner = sop;
+ clp = oo->oo_owner.so_client;
+ release_openowner(oo);
+ open->op_openowner = NULL;
+ goto new_owner;
}
- list_del_init(&sop->so_close_lru);
- renew_client(sop->so_client);
+ status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
+ if (status)
+ return status;
+ clp = oo->oo_owner.so_client;
+ goto alloc_stateid;
+new_owner:
+ oo = alloc_init_open_stateowner(strhashval, clp, open);
+ if (oo == NULL)
+ return nfserr_jukebox;
+ open->op_openowner = oo;
+alloc_stateid:
+ open->op_stp = nfs4_alloc_stateid(clp);
+ if (!open->op_stp)
+ return nfserr_jukebox;
return nfs_ok;
}
@@ -2500,36 +2622,37 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
return nfs_ok;
}
-static struct nfs4_delegation *
-find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
+static int share_access_to_flags(u32 share_access)
{
- struct nfs4_delegation *dp;
+ share_access &= ~NFS4_SHARE_WANT_MASK;
- spin_lock(&recall_lock);
- list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
- if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) {
- spin_unlock(&recall_lock);
- return dp;
- }
- spin_unlock(&recall_lock);
- return NULL;
+ return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
}
-static int share_access_to_flags(u32 share_access)
+static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s)
{
- share_access &= ~NFS4_SHARE_WANT_MASK;
+ struct nfs4_stid *ret;
- return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
+ ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID);
+ if (!ret)
+ return NULL;
+ return delegstateid(ret);
+}
+
+static bool nfsd4_is_deleg_cur(struct nfsd4_open *open)
+{
+ return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
+ open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH;
}
static __be32
-nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
+nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open,
struct nfs4_delegation **dp)
{
int flags;
__be32 status = nfserr_bad_stateid;
- *dp = find_delegation_file(fp, &open->op_delegate_stateid);
+ *dp = find_deleg_stateid(cl, &open->op_delegate_stateid);
if (*dp == NULL)
goto out;
flags = share_access_to_flags(open->op_share_access);
@@ -2537,41 +2660,37 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
if (status)
*dp = NULL;
out:
- if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR)
+ if (!nfsd4_is_deleg_cur(open))
return nfs_ok;
if (status)
return status;
- open->op_stateowner->so_confirmed = 1;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
return nfs_ok;
}
static __be32
-nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp)
+nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp)
{
- struct nfs4_stateid *local;
- __be32 status = nfserr_share_denied;
- struct nfs4_stateowner *sop = open->op_stateowner;
+ struct nfs4_ol_stateid *local;
+ struct nfs4_openowner *oo = open->op_openowner;
list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
/* ignore lock owners */
if (local->st_stateowner->so_is_open_owner == 0)
continue;
/* remember if we have seen this open owner */
- if (local->st_stateowner == sop)
+ if (local->st_stateowner == &oo->oo_owner)
*stpp = local;
/* check for conflicting share reservations */
if (!test_share(local, open))
- goto out;
+ return nfserr_share_denied;
}
- status = 0;
-out:
- return status;
+ return nfs_ok;
}
-static inline struct nfs4_stateid *
-nfs4_alloc_stateid(void)
+static void nfs4_free_stateid(struct nfs4_ol_stateid *s)
{
- return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
+ kmem_cache_free(stateid_slab, s);
}
static inline int nfs4_access_to_access(u32 nfs4_access)
@@ -2592,12 +2711,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
int oflag = nfs4_access_to_omode(open->op_share_access);
int access = nfs4_access_to_access(open->op_share_access);
- /* CLAIM_DELEGATE_CUR is used in response to a broken lease;
- * allowing it to break the lease and return EAGAIN leaves the
- * client unable to make progress in returning the delegation */
- if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
- access |= NFSD_MAY_NOT_BREAK_LEASE;
-
if (!fp->fi_fds[oflag]) {
status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
&fp->fi_fds[oflag]);
@@ -2609,27 +2722,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
return nfs_ok;
}
-static __be32
-nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
- struct nfs4_file *fp, struct svc_fh *cur_fh,
- struct nfsd4_open *open)
-{
- struct nfs4_stateid *stp;
- __be32 status;
-
- stp = nfs4_alloc_stateid();
- if (stp == NULL)
- return nfserr_resource;
-
- status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open);
- if (status) {
- kmem_cache_free(stateid_slab, stp);
- return status;
- }
- *stpp = stp;
- return 0;
-}
-
static inline __be32
nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
struct nfsd4_open *open)
@@ -2646,9 +2738,9 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
}
static __be32
-nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
{
- u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
+ u32 op_share_access = open->op_share_access;
bool new_access;
__be32 status;
@@ -2677,8 +2769,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
static void
nfs4_set_claim_prev(struct nfsd4_open *open)
{
- open->op_stateowner->so_confirmed = 1;
- open->op_stateowner->so_client->cl_firststate = 1;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ open->op_openowner->oo_owner.so_client->cl_firststate = 1;
}
/* Should we give out recallable state?: */
@@ -2721,7 +2813,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
if (!fl)
return -ENOMEM;
fl->fl_file = find_readable_file(fp);
- list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
+ list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
if (status) {
list_del_init(&dp->dl_perclnt);
@@ -2750,7 +2842,7 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
atomic_inc(&fp->fi_delegees);
list_add(&dp->dl_perfile, &fp->fi_delegations);
spin_unlock(&recall_lock);
- list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
+ list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
return 0;
}
@@ -2758,14 +2850,14 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
* Attempt to hand out a delegation.
*/
static void
-nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp)
+nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_stateid *stp)
{
struct nfs4_delegation *dp;
- struct nfs4_stateowner *sop = stp->st_stateowner;
+ struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
int cb_up;
int status, flag = 0;
- cb_up = nfsd4_cb_channel_good(sop->so_client);
+ cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
flag = NFS4_OPEN_DELEGATE_NONE;
open->op_recall = 0;
switch (open->op_claim_type) {
@@ -2781,7 +2873,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
* had the chance to reclaim theirs.... */
if (locks_in_grace())
goto out;
- if (!cb_up || !sop->so_confirmed)
+ if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
goto out;
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
flag = NFS4_OPEN_DELEGATE_WRITE;
@@ -2792,17 +2884,17 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
goto out;
}
- dp = alloc_init_deleg(sop->so_client, stp, fh, flag);
+ dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag);
if (dp == NULL)
goto out_no_deleg;
status = nfs4_set_delegation(dp, flag);
if (status)
goto out_free;
- memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
+ memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
- STATEID_VAL(&dp->dl_stateid));
+ STATEID_VAL(&dp->dl_stid.sc_stateid));
out:
if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS
&& flag == NFS4_OPEN_DELEGATE_NONE
@@ -2824,16 +2916,13 @@ __be32
nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
struct nfs4_file *fp = NULL;
struct inode *ino = current_fh->fh_dentry->d_inode;
- struct nfs4_stateid *stp = NULL;
+ struct nfs4_ol_stateid *stp = NULL;
struct nfs4_delegation *dp = NULL;
__be32 status;
- status = nfserr_inval;
- if (!access_valid(open->op_share_access, resp->cstate.minorversion)
- || !deny_valid(open->op_share_deny))
- goto out;
/*
* Lookup file; if found, lookup stateid and check open request,
* and check for delegations in the process of being recalled.
@@ -2843,17 +2932,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
if (fp) {
if ((status = nfs4_check_open(fp, open, &stp)))
goto out;
- status = nfs4_check_deleg(fp, open, &dp);
+ status = nfs4_check_deleg(cl, fp, open, &dp);
if (status)
goto out;
} else {
status = nfserr_bad_stateid;
- if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
- goto out;
- status = nfserr_resource;
- fp = alloc_init_file(ino);
- if (fp == NULL)
+ if (nfsd4_is_deleg_cur(open))
goto out;
+ status = nfserr_jukebox;
+ fp = open->op_file;
+ open->op_file = NULL;
+ nfsd4_init_file(fp, ino);
}
/*
@@ -2865,24 +2954,24 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
if (status)
goto out;
- update_stateid(&stp->st_stateid);
} else {
- status = nfs4_new_open(rqstp, &stp, fp, current_fh, open);
+ status = nfs4_get_vfs_file(rqstp, fp, current_fh, open);
if (status)
goto out;
- init_stateid(stp, fp, open);
+ stp = open->op_stp;
+ open->op_stp = NULL;
+ init_open_stateid(stp, fp, open);
status = nfsd4_truncate(rqstp, current_fh, open);
if (status) {
release_open_stateid(stp);
goto out;
}
- if (nfsd4_has_session(&resp->cstate))
- update_stateid(&stp->st_stateid);
}
- memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
+ update_stateid(&stp->st_stid.sc_stateid);
+ memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
if (nfsd4_has_session(&resp->cstate))
- open->op_stateowner->so_confirmed = 1;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
/*
* Attempt to hand out a delegation. No error return, because the
@@ -2893,7 +2982,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
status = nfs_ok;
dprintk("%s: stateid=" STATEID_FMT "\n", __func__,
- STATEID_VAL(&stp->st_stateid));
+ STATEID_VAL(&stp->st_stid.sc_stateid));
out:
if (fp)
put_nfs4_file(fp);
@@ -2903,13 +2992,34 @@ out:
* To finish the open response, we just need to set the rflags.
*/
open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
- if (!open->op_stateowner->so_confirmed &&
+ if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) &&
!nfsd4_has_session(&resp->cstate))
open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
return status;
}
+void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
+{
+ if (open->op_openowner) {
+ struct nfs4_openowner *oo = open->op_openowner;
+
+ if (!list_empty(&oo->oo_owner.so_stateids))
+ list_del_init(&oo->oo_close_lru);
+ if (oo->oo_flags & NFS4_OO_NEW) {
+ if (status) {
+ release_openowner(oo);
+ open->op_openowner = NULL;
+ } else
+ oo->oo_flags &= ~NFS4_OO_NEW;
+ }
+ }
+ if (open->op_file)
+ nfsd4_free_file(open->op_file);
+ if (open->op_stp)
+ nfs4_free_stateid(open->op_stp);
+}
+
__be32
nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
clientid_t *clid)
@@ -2930,7 +3040,6 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("nfsd4_renew: clientid not found!\n");
goto out;
}
- renew_client(clp);
status = nfserr_cb_path_down;
if (!list_empty(&clp->cl_delegations)
&& clp->cl_cb_state != NFSD4_CB_UP)
@@ -2962,7 +3071,7 @@ static time_t
nfs4_laundromat(void)
{
struct nfs4_client *clp;
- struct nfs4_stateowner *sop;
+ struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
struct list_head *pos, *next, reaplist;
time_t cutoff = get_seconds() - nfsd4_lease;
@@ -3019,16 +3128,14 @@ nfs4_laundromat(void)
}
test_val = nfsd4_lease;
list_for_each_safe(pos, next, &close_lru) {
- sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
- if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) {
- u = sop->so_time - cutoff;
+ oo = container_of(pos, struct nfs4_openowner, oo_close_lru);
+ if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {
+ u = oo->oo_time - cutoff;
if (test_val > u)
test_val = u;
break;
}
- dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
- sop->so_id);
- release_openowner(sop);
+ release_openowner(oo);
}
if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -3050,30 +3157,17 @@ laundromat_main(struct work_struct *not_used)
queue_delayed_work(laundry_wq, &laundromat_work, t*HZ);
}
-static struct nfs4_stateowner *
-search_close_lru(u32 st_id, int flags)
+static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
{
- struct nfs4_stateowner *local = NULL;
-
- if (flags & CLOSE_STATE) {
- list_for_each_entry(local, &close_lru, so_close_lru) {
- if (local->so_id == st_id)
- return local;
- }
- }
- return NULL;
-}
-
-static inline int
-nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
-{
- return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
+ if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode)
+ return nfserr_bad_stateid;
+ return nfs_ok;
}
static int
STALE_STATEID(stateid_t *stateid)
{
- if (stateid->si_boot == boot_time)
+ if (stateid->si_opaque.so_clid.cl_boot == boot_time)
return 0;
dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
STATEID_VAL(stateid));
@@ -3096,7 +3190,7 @@ access_permit_write(unsigned long access_bmap)
}
static
-__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
+__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags)
{
__be32 status = nfserr_openmode;
@@ -3139,68 +3233,80 @@ grace_disallows_io(struct inode *inode)
return locks_in_grace() && mandatory_lock(inode);
}
-static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
+/* Returns true iff a is later than b: */
+static bool stateid_generation_after(stateid_t *a, stateid_t *b)
+{
+ return (s32)a->si_generation - (s32)b->si_generation > 0;
+}
+
+static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
{
/*
* When sessions are used the stateid generation number is ignored
* when it is zero.
*/
- if ((flags & HAS_SESSION) && in->si_generation == 0)
- goto out;
+ if (has_session && in->si_generation == 0)
+ return nfs_ok;
+
+ if (in->si_generation == ref->si_generation)
+ return nfs_ok;
/* If the client sends us a stateid from the future, it's buggy: */
- if (in->si_generation > ref->si_generation)
+ if (stateid_generation_after(in, ref))
return nfserr_bad_stateid;
/*
- * The following, however, can happen. For example, if the
- * client sends an open and some IO at the same time, the open
- * may bump si_generation while the IO is still in flight.
- * Thanks to hard links and renames, the client never knows what
- * file an open will affect. So it could avoid that situation
- * only by serializing all opens and IO from the same open
- * owner. To recover from the old_stateid error, the client
- * will just have to retry the IO:
+ * However, we could see a stateid from the past, even from a
+ * non-buggy client. For example, if the client sends a lock
+ * while some IO is outstanding, the lock may bump si_generation
+ * while the IO is still in flight. The client could avoid that
+ * situation by waiting for responses on all the IO requests,
+ * but better performance may result in retrying IO that
+ * receives an old_stateid error if requests are rarely
+ * reordered in flight:
*/
- if (in->si_generation < ref->si_generation)
- return nfserr_old_stateid;
-out:
- return nfs_ok;
+ return nfserr_old_stateid;
}
-static int is_delegation_stateid(stateid_t *stateid)
+__be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
{
- return stateid->si_fileid == 0;
-}
+ struct nfs4_stid *s;
+ struct nfs4_ol_stateid *ols;
+ __be32 status;
-static int is_open_stateid(struct nfs4_stateid *stateid)
-{
- return stateid->st_openstp == NULL;
+ if (STALE_STATEID(stateid))
+ return nfserr_stale_stateid;
+
+ s = find_stateid(cl, stateid);
+ if (!s)
+ return nfserr_stale_stateid;
+ status = check_stateid_generation(stateid, &s->sc_stateid, 1);
+ if (status)
+ return status;
+ if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID)))
+ return nfs_ok;
+ ols = openlockstateid(s);
+ if (ols->st_stateowner->so_is_open_owner
+ && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+ return nfserr_bad_stateid;
+ return nfs_ok;
}
-__be32 nfs4_validate_stateid(stateid_t *stateid, int flags)
+static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s)
{
- struct nfs4_stateid *stp = NULL;
- __be32 status = nfserr_stale_stateid;
+ struct nfs4_client *cl;
+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+ return nfserr_bad_stateid;
if (STALE_STATEID(stateid))
- goto out;
-
- status = nfserr_expired;
- stp = search_for_stateid(stateid);
- if (!stp)
- goto out;
- status = nfserr_bad_stateid;
-
- if (!stp->st_stateowner->so_confirmed)
- goto out;
-
- status = check_stateid_generation(stateid, &stp->st_stateid, flags);
- if (status)
- goto out;
+ return nfserr_stale_stateid;
+ cl = find_confirmed_client(&stateid->si_opaque.so_clid);
+ if (!cl)
+ return nfserr_expired;
+ *s = find_stateid_by_type(cl, stateid, typemask);
+ if (!*s)
+ return nfserr_bad_stateid;
+ return nfs_ok;
- status = nfs_ok;
-out:
- return status;
}
/*
@@ -3210,7 +3316,8 @@ __be32
nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
stateid_t *stateid, int flags, struct file **filpp)
{
- struct nfs4_stateid *stp = NULL;
+ struct nfs4_stid *s;
+ struct nfs4_ol_stateid *stp = NULL;
struct nfs4_delegation *dp = NULL;
struct svc_fh *current_fh = &cstate->current_fh;
struct inode *ino = current_fh->fh_dentry->d_inode;
@@ -3222,60 +3329,47 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
if (grace_disallows_io(ino))
return nfserr_grace;
- if (nfsd4_has_session(cstate))
- flags |= HAS_SESSION;
-
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
return check_special_stateids(current_fh, stateid, flags);
- status = nfserr_stale_stateid;
- if (STALE_STATEID(stateid))
+ status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s);
+ if (status)
+ return status;
+ status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
+ if (status)
goto out;
-
- /*
- * We assume that any stateid that has the current boot time,
- * but that we can't find, is expired:
- */
- status = nfserr_expired;
- if (is_delegation_stateid(stateid)) {
- dp = find_delegation_stateid(ino, stateid);
- if (!dp)
- goto out;
- status = check_stateid_generation(stateid, &dp->dl_stateid,
- flags);
- if (status)
- goto out;
+ switch (s->sc_type) {
+ case NFS4_DELEG_STID:
+ dp = delegstateid(s);
status = nfs4_check_delegmode(dp, flags);
if (status)
goto out;
- renew_client(dp->dl_client);
if (filpp) {
*filpp = dp->dl_file->fi_deleg_file;
BUG_ON(!*filpp);
}
- } else { /* open or lock stateid */
- stp = find_stateid(stateid, flags);
- if (!stp)
- goto out;
- status = nfserr_bad_stateid;
- if (nfs4_check_fh(current_fh, stp))
- goto out;
- if (!stp->st_stateowner->so_confirmed)
- goto out;
- status = check_stateid_generation(stateid, &stp->st_stateid,
- flags);
+ break;
+ case NFS4_OPEN_STID:
+ case NFS4_LOCK_STID:
+ stp = openlockstateid(s);
+ status = nfs4_check_fh(current_fh, stp);
if (status)
goto out;
+ if (stp->st_stateowner->so_is_open_owner
+ && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+ goto out;
status = nfs4_check_openmode(stp, flags);
if (status)
goto out;
- renew_client(stp->st_stateowner->so_client);
if (filpp) {
if (flags & RD_STATE)
*filpp = find_readable_file(stp->st_file);
else
*filpp = find_writeable_file(stp->st_file);
}
+ break;
+ default:
+ return nfserr_bad_stateid;
}
status = nfs_ok;
out:
@@ -3283,18 +3377,9 @@ out:
}
static __be32
-nfsd4_free_delegation_stateid(stateid_t *stateid)
+nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
{
- struct nfs4_delegation *dp = search_for_delegation(stateid);
- if (dp)
- return nfserr_locks_held;
- return nfserr_bad_stateid;
-}
-
-static __be32
-nfsd4_free_lock_stateid(struct nfs4_stateid *stp)
-{
- if (check_for_locks(stp->st_file, stp->st_stateowner))
+ if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner)))
return nfserr_locks_held;
release_lock_stateid(stp);
return nfs_ok;
@@ -3307,51 +3392,40 @@ __be32
nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_test_stateid *test_stateid)
{
- test_stateid->ts_has_session = nfsd4_has_session(cstate);
+ /* real work is done during encoding */
return nfs_ok;
}
-/*
- * Free a state id
- */
__be32
nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_free_stateid *free_stateid)
{
stateid_t *stateid = &free_stateid->fr_stateid;
- struct nfs4_stateid *stp;
- __be32 ret;
+ struct nfs4_stid *s;
+ struct nfs4_client *cl = cstate->session->se_client;
+ __be32 ret = nfserr_bad_stateid;
nfs4_lock_state();
- if (is_delegation_stateid(stateid)) {
- ret = nfsd4_free_delegation_stateid(stateid);
- goto out;
- }
-
- stp = search_for_stateid(stateid);
- if (!stp) {
- ret = nfserr_bad_stateid;
+ s = find_stateid(cl, stateid);
+ if (!s)
goto out;
- }
- if (stateid->si_generation != 0) {
- if (stateid->si_generation < stp->st_stateid.si_generation) {
- ret = nfserr_old_stateid;
- goto out;
- }
- if (stateid->si_generation > stp->st_stateid.si_generation) {
- ret = nfserr_bad_stateid;
- goto out;
- }
- }
-
- if (is_open_stateid(stp)) {
+ switch (s->sc_type) {
+ case NFS4_DELEG_STID:
ret = nfserr_locks_held;
goto out;
- } else {
- ret = nfsd4_free_lock_stateid(stp);
- goto out;
+ case NFS4_OPEN_STID:
+ case NFS4_LOCK_STID:
+ ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+ if (ret)
+ goto out;
+ if (s->sc_type == NFS4_LOCK_STID)
+ ret = nfsd4_free_lock_stateid(openlockstateid(s));
+ else
+ ret = nfserr_locks_held;
+ break;
+ default:
+ ret = nfserr_bad_stateid;
}
-
out:
nfs4_unlock_state();
return ret;
@@ -3364,124 +3438,64 @@ setlkflg (int type)
RD_STATE : WR_STATE;
}
+static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp)
+{
+ struct svc_fh *current_fh = &cstate->current_fh;
+ struct nfs4_stateowner *sop = stp->st_stateowner;
+ __be32 status;
+
+ status = nfsd4_check_seqid(cstate, sop, seqid);
+ if (status)
+ return status;
+ if (stp->st_stid.sc_type == NFS4_CLOSED_STID)
+ /*
+ * "Closed" stateid's exist *only* to return
+ * nfserr_replay_me from the previous step.
+ */
+ return nfserr_bad_stateid;
+ status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
+ if (status)
+ return status;
+ return nfs4_check_fh(current_fh, stp);
+}
+
/*
* Checks for sequence id mutating operations.
*/
static __be32
nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
- stateid_t *stateid, int flags,
- struct nfs4_stateowner **sopp,
- struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
+ stateid_t *stateid, char typemask,
+ struct nfs4_ol_stateid **stpp)
{
- struct nfs4_stateid *stp;
- struct nfs4_stateowner *sop;
- struct svc_fh *current_fh = &cstate->current_fh;
__be32 status;
+ struct nfs4_stid *s;
dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
seqid, STATEID_VAL(stateid));
*stpp = NULL;
- *sopp = NULL;
-
- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n");
- return nfserr_bad_stateid;
- }
-
- if (STALE_STATEID(stateid))
- return nfserr_stale_stateid;
-
- if (nfsd4_has_session(cstate))
- flags |= HAS_SESSION;
-
- /*
- * We return BAD_STATEID if filehandle doesn't match stateid,
- * the confirmed flag is incorrecly set, or the generation
- * number is incorrect.
- */
- stp = find_stateid(stateid, flags);
- if (stp == NULL) {
- /*
- * Also, we should make sure this isn't just the result of
- * a replayed close:
- */
- sop = search_close_lru(stateid->si_stateownerid, flags);
- /* It's not stale; let's assume it's expired: */
- if (sop == NULL)
- return nfserr_expired;
- *sopp = sop;
- goto check_replay;
- }
-
- *stpp = stp;
- *sopp = sop = stp->st_stateowner;
-
- if (lock) {
- clientid_t *lockclid = &lock->v.new.clientid;
- struct nfs4_client *clp = sop->so_client;
- int lkflg = 0;
- __be32 status;
-
- lkflg = setlkflg(lock->lk_type);
-
- if (lock->lk_is_new) {
- if (!sop->so_is_open_owner)
- return nfserr_bad_stateid;
- if (!(flags & HAS_SESSION) &&
- !same_clid(&clp->cl_clientid, lockclid))
- return nfserr_bad_stateid;
- /* stp is the open stateid */
- status = nfs4_check_openmode(stp, lkflg);
- if (status)
- return status;
- } else {
- /* stp is the lock stateid */
- status = nfs4_check_openmode(stp->st_openstp, lkflg);
- if (status)
- return status;
- }
- }
+ status = nfsd4_lookup_stateid(stateid, typemask, &s);
+ if (status)
+ return status;
+ *stpp = openlockstateid(s);
+ cstate->replay_owner = (*stpp)->st_stateowner;
- if (nfs4_check_fh(current_fh, stp)) {
- dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n");
- return nfserr_bad_stateid;
- }
+ return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
+}
- /*
- * We now validate the seqid and stateid generation numbers.
- * For the moment, we ignore the possibility of
- * generation number wraparound.
- */
- if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
- goto check_replay;
+static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp)
+{
+ __be32 status;
+ struct nfs4_openowner *oo;
- if (sop->so_confirmed && flags & CONFIRM) {
- dprintk("NFSD: preprocess_seqid_op: expected"
- " unconfirmed stateowner!\n");
- return nfserr_bad_stateid;
- }
- if (!sop->so_confirmed && !(flags & CONFIRM)) {
- dprintk("NFSD: preprocess_seqid_op: stateowner not"
- " confirmed yet!\n");
- return nfserr_bad_stateid;
- }
- status = check_stateid_generation(stateid, &stp->st_stateid, flags);
+ status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
+ NFS4_OPEN_STID, stpp);
if (status)
return status;
- renew_client(sop->so_client);
+ oo = openowner((*stpp)->st_stateowner);
+ if (!(oo->oo_flags & NFS4_OO_CONFIRMED))
+ return nfserr_bad_stateid;
return nfs_ok;
-
-check_replay:
- if (seqid == sop->so_seqid - 1) {
- dprintk("NFSD: preprocess_seqid_op: retransmission?\n");
- /* indicate replay to calling function */
- return nfserr_replay_me;
- }
- dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n",
- sop->so_seqid, seqid);
- *sopp = NULL;
- return nfserr_bad_seqid;
}
__be32
@@ -3489,8 +3503,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_open_confirm *oc)
{
__be32 status;
- struct nfs4_stateowner *sop;
- struct nfs4_stateid *stp;
+ struct nfs4_openowner *oo;
+ struct nfs4_ol_stateid *stp;
dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",
(int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3502,38 +3516,52 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
- if ((status = nfs4_preprocess_seqid_op(cstate,
+ status = nfs4_preprocess_seqid_op(cstate,
oc->oc_seqid, &oc->oc_req_stateid,
- CONFIRM | OPEN_STATE,
- &oc->oc_stateowner, &stp, NULL)))
- goto out;
-
- sop = oc->oc_stateowner;
- sop->so_confirmed = 1;
- update_stateid(&stp->st_stateid);
- memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t));
+ NFS4_OPEN_STID, &stp);
+ if (status)
+ goto out;
+ oo = openowner(stp->st_stateowner);
+ status = nfserr_bad_stateid;
+ if (oo->oo_flags & NFS4_OO_CONFIRMED)
+ goto out;
+ oo->oo_flags |= NFS4_OO_CONFIRMED;
+ update_stateid(&stp->st_stid.sc_stateid);
+ memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
- __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid));
+ __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
- nfsd4_create_clid_dir(sop->so_client);
+ nfsd4_create_clid_dir(oo->oo_owner.so_client);
+ status = nfs_ok;
out:
- if (oc->oc_stateowner) {
- nfs4_get_stateowner(oc->oc_stateowner);
- cstate->replay_owner = oc->oc_stateowner;
- }
- nfs4_unlock_state();
+ if (!cstate->replay_owner)
+ nfs4_unlock_state();
return status;
}
-static inline void nfs4_file_downgrade(struct nfs4_stateid *stp, unsigned int to_access)
+static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access)
{
- int i;
+ if (!test_bit(access, &stp->st_access_bmap))
+ return;
+ nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access));
+ __clear_bit(access, &stp->st_access_bmap);
+}
- for (i = 1; i < 4; i++) {
- if (test_bit(i, &stp->st_access_bmap) && !(i & to_access)) {
- nfs4_file_put_access(stp->st_file, i);
- __clear_bit(i, &stp->st_access_bmap);
- }
+static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access)
+{
+ switch (to_access) {
+ case NFS4_SHARE_ACCESS_READ:
+ nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE);
+ nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
+ break;
+ case NFS4_SHARE_ACCESS_WRITE:
+ nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ);
+ nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
+ break;
+ case NFS4_SHARE_ACCESS_BOTH:
+ break;
+ default:
+ BUG();
}
}
@@ -3553,24 +3581,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
struct nfsd4_open_downgrade *od)
{
__be32 status;
- struct nfs4_stateid *stp;
+ struct nfs4_ol_stateid *stp;
dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n",
(int)cstate->current_fh.fh_dentry->d_name.len,
cstate->current_fh.fh_dentry->d_name.name);
- if (!access_valid(od->od_share_access, cstate->minorversion)
- || !deny_valid(od->od_share_deny))
- return nfserr_inval;
+ /* We don't yet support WANT bits: */
+ od->od_share_access &= NFS4_SHARE_ACCESS_MASK;
nfs4_lock_state();
- if ((status = nfs4_preprocess_seqid_op(cstate,
- od->od_seqid,
- &od->od_stateid,
- OPEN_STATE,
- &od->od_stateowner, &stp, NULL)))
+ status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
+ &od->od_stateid, &stp);
+ if (status)
goto out;
-
status = nfserr_inval;
if (!test_bit(od->od_share_access, &stp->st_access_bmap)) {
dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n",
@@ -3582,22 +3606,45 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
stp->st_deny_bmap, od->od_share_deny);
goto out;
}
- nfs4_file_downgrade(stp, od->od_share_access);
+ nfs4_stateid_downgrade(stp, od->od_share_access);
reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
- update_stateid(&stp->st_stateid);
- memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t));
+ update_stateid(&stp->st_stid.sc_stateid);
+ memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
status = nfs_ok;
out:
- if (od->od_stateowner) {
- nfs4_get_stateowner(od->od_stateowner);
- cstate->replay_owner = od->od_stateowner;
- }
- nfs4_unlock_state();
+ if (!cstate->replay_owner)
+ nfs4_unlock_state();
return status;
}
+void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so)
+{
+ struct nfs4_openowner *oo;
+ struct nfs4_ol_stateid *s;
+
+ if (!so->so_is_open_owner)
+ return;
+ oo = openowner(so);
+ s = oo->oo_last_closed_stid;
+ if (!s)
+ return;
+ if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) {
+ /* Release the last_closed_stid on the next seqid bump: */
+ oo->oo_flags |= NFS4_OO_PURGE_CLOSE;
+ return;
+ }
+ oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE;
+ release_last_closed_stateid(oo);
+}
+
+static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
+{
+ unhash_open_stateid(s);
+ s->st_stid.sc_type = NFS4_CLOSED_STID;
+}
+
/*
* nfs4_unlock_state() called after encode
*/
@@ -3606,39 +3653,37 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_close *close)
{
__be32 status;
- struct nfs4_stateid *stp;
+ struct nfs4_openowner *oo;
+ struct nfs4_ol_stateid *stp;
dprintk("NFSD: nfsd4_close on file %.*s\n",
(int)cstate->current_fh.fh_dentry->d_name.len,
cstate->current_fh.fh_dentry->d_name.name);
nfs4_lock_state();
- /* check close_lru for replay */
- if ((status = nfs4_preprocess_seqid_op(cstate,
- close->cl_seqid,
- &close->cl_stateid,
- OPEN_STATE | CLOSE_STATE,
- &close->cl_stateowner, &stp, NULL)))
+ status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
+ &close->cl_stateid,
+ NFS4_OPEN_STID|NFS4_CLOSED_STID,
+ &stp);
+ if (status)
goto out;
+ oo = openowner(stp->st_stateowner);
status = nfs_ok;
- update_stateid(&stp->st_stateid);
- memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
+ update_stateid(&stp->st_stid.sc_stateid);
+ memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
- /* release_stateid() calls nfsd_close() if needed */
- release_open_stateid(stp);
+ nfsd4_close_open_stateid(stp);
+ oo->oo_last_closed_stid = stp;
/* place unused nfs4_stateowners on so_close_lru list to be
* released by the laundromat service after the lease period
* to enable us to handle CLOSE replay
*/
- if (list_empty(&close->cl_stateowner->so_stateids))
- move_to_close_lru(close->cl_stateowner);
+ if (list_empty(&oo->oo_owner.so_stateids))
+ move_to_close_lru(oo);
out:
- if (close->cl_stateowner) {
- nfs4_get_stateowner(close->cl_stateowner);
- cstate->replay_owner = close->cl_stateowner;
- }
- nfs4_unlock_state();
+ if (!cstate->replay_owner)
+ nfs4_unlock_state();
return status;
}
@@ -3648,34 +3693,22 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
struct nfs4_delegation *dp;
stateid_t *stateid = &dr->dr_stateid;
+ struct nfs4_stid *s;
struct inode *inode;
__be32 status;
- int flags = 0;
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
return status;
inode = cstate->current_fh.fh_dentry->d_inode;
- if (nfsd4_has_session(cstate))
- flags |= HAS_SESSION;
nfs4_lock_state();
- status = nfserr_bad_stateid;
- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
- goto out;
- status = nfserr_stale_stateid;
- if (STALE_STATEID(stateid))
- goto out;
- status = nfserr_bad_stateid;
- if (!is_delegation_stateid(stateid))
- goto out;
- status = nfserr_expired;
- dp = find_delegation_stateid(inode, stateid);
- if (!dp)
+ status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s);
+ if (status)
goto out;
- status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
+ dp = delegstateid(s);
+ status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate));
if (status)
goto out;
- renew_client(dp->dl_client);
unhash_delegation(dp);
out:
@@ -3713,9 +3746,6 @@ last_byte_offset(u64 start, u64 len)
return end > start ? end - 1: NFS4_MAX_UINT64;
}
-#define lockownerid_hashval(id) \
- ((id) & LOCK_HASH_MASK)
-
static inline unsigned int
lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
struct xdr_netobj *ownername)
@@ -3725,101 +3755,7 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
& LOCK_HASH_MASK;
}
-static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE];
static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE];
-static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
-
-static int
-same_stateid(stateid_t *id_one, stateid_t *id_two)
-{
- if (id_one->si_stateownerid != id_two->si_stateownerid)
- return 0;
- return id_one->si_fileid == id_two->si_fileid;
-}
-
-static struct nfs4_stateid *
-find_stateid(stateid_t *stid, int flags)
-{
- struct nfs4_stateid *local;
- u32 st_id = stid->si_stateownerid;
- u32 f_id = stid->si_fileid;
- unsigned int hashval;
-
- dprintk("NFSD: find_stateid flags 0x%x\n",flags);
- if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) {
- hashval = stateid_hashval(st_id, f_id);
- list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
- if ((local->st_stateid.si_stateownerid == st_id) &&
- (local->st_stateid.si_fileid == f_id))
- return local;
- }
- }
-
- if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) {
- hashval = stateid_hashval(st_id, f_id);
- list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
- if ((local->st_stateid.si_stateownerid == st_id) &&
- (local->st_stateid.si_fileid == f_id))
- return local;
- }
- }
- return NULL;
-}
-
-static struct nfs4_stateid *
-search_for_stateid(stateid_t *stid)
-{
- struct nfs4_stateid *local;
- unsigned int hashval = stateid_hashval(stid->si_stateownerid, stid->si_fileid);
-
- list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
- if (same_stateid(&local->st_stateid, stid))
- return local;
- }
-
- list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
- if (same_stateid(&local->st_stateid, stid))
- return local;
- }
- return NULL;
-}
-
-static struct nfs4_delegation *
-search_for_delegation(stateid_t *stid)
-{
- struct nfs4_file *fp;
- struct nfs4_delegation *dp;
- struct list_head *pos;
- int i;
-
- for (i = 0; i < FILE_HASH_SIZE; i++) {
- list_for_each_entry(fp, &file_hashtbl[i], fi_hash) {
- list_for_each(pos, &fp->fi_delegations) {
- dp = list_entry(pos, struct nfs4_delegation, dl_perfile);
- if (same_stateid(&dp->dl_stateid, stid))
- return dp;
- }
- }
- }
- return NULL;
-}
-
-static struct nfs4_delegation *
-find_delegation_stateid(struct inode *ino, stateid_t *stid)
-{
- struct nfs4_file *fp;
- struct nfs4_delegation *dl;
-
- dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__,
- STATEID_VAL(stid));
-
- fp = find_file(ino);
- if (!fp)
- return NULL;
- dl = find_delegation_file(fp, stid);
- put_nfs4_file(fp);
- return dl;
-}
/*
* TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
@@ -3846,15 +3782,21 @@ static const struct lock_manager_operations nfsd_posix_mng_ops = {
static inline void
nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
{
- struct nfs4_stateowner *sop;
+ struct nfs4_lockowner *lo;
if (fl->fl_lmops == &nfsd_posix_mng_ops) {
- sop = (struct nfs4_stateowner *) fl->fl_owner;
- kref_get(&sop->so_ref);
- deny->ld_sop = sop;
- deny->ld_clientid = sop->so_client->cl_clientid;
+ lo = (struct nfs4_lockowner *) fl->fl_owner;
+ deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data,
+ lo->lo_owner.so_owner.len, GFP_KERNEL);
+ if (!deny->ld_owner.data)
+ /* We just don't care that much */
+ goto nevermind;
+ deny->ld_owner.len = lo->lo_owner.so_owner.len;
+ deny->ld_clientid = lo->lo_owner.so_client->cl_clientid;
} else {
- deny->ld_sop = NULL;
+nevermind:
+ deny->ld_owner.len = 0;
+ deny->ld_owner.data = NULL;
deny->ld_clientid.cl_boot = 0;
deny->ld_clientid.cl_id = 0;
}
@@ -3867,8 +3809,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
deny->ld_type = NFS4_WRITE_LT;
}
-static struct nfs4_stateowner *
-find_lockstateowner_str(struct inode *inode, clientid_t *clid,
+static struct nfs4_lockowner *
+find_lockowner_str(struct inode *inode, clientid_t *clid,
struct xdr_netobj *owner)
{
unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner);
@@ -3876,11 +3818,17 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid,
list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) {
if (same_owner_str(op, owner, clid))
- return op;
+ return lockowner(op);
}
return NULL;
}
+static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp)
+{
+ list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]);
+ list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
+}
+
/*
* Alloc a lock owner structure.
* Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
@@ -3889,67 +3837,40 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid,
* strhashval = lock_ownerstr_hashval
*/
-static struct nfs4_stateowner *
-alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) {
- struct nfs4_stateowner *sop;
- struct nfs4_replay *rp;
- unsigned int idhashval;
+static struct nfs4_lockowner *
+alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) {
+ struct nfs4_lockowner *lo;
- if (!(sop = alloc_stateowner(&lock->lk_new_owner)))
+ lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp);
+ if (!lo)
return NULL;
- idhashval = lockownerid_hashval(current_ownerid);
- INIT_LIST_HEAD(&sop->so_idhash);
- INIT_LIST_HEAD(&sop->so_strhash);
- INIT_LIST_HEAD(&sop->so_perclient);
- INIT_LIST_HEAD(&sop->so_stateids);
- INIT_LIST_HEAD(&sop->so_perstateid);
- INIT_LIST_HEAD(&sop->so_close_lru); /* not used */
- sop->so_time = 0;
- list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]);
- list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]);
- list_add(&sop->so_perstateid, &open_stp->st_lockowners);
- sop->so_is_open_owner = 0;
- sop->so_id = current_ownerid++;
- sop->so_client = clp;
+ INIT_LIST_HEAD(&lo->lo_owner.so_stateids);
+ lo->lo_owner.so_is_open_owner = 0;
/* It is the openowner seqid that will be incremented in encode in the
* case of new lockowners; so increment the lock seqid manually: */
- sop->so_seqid = lock->lk_new_lock_seqid + 1;
- sop->so_confirmed = 1;
- rp = &sop->so_replay;
- rp->rp_status = nfserr_serverfault;
- rp->rp_buflen = 0;
- rp->rp_buf = rp->rp_ibuf;
- return sop;
+ lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1;
+ hash_lockowner(lo, strhashval, clp, open_stp);
+ return lo;
}
-static struct nfs4_stateid *
-alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp)
+static struct nfs4_ol_stateid *
+alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp)
{
- struct nfs4_stateid *stp;
- unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id);
+ struct nfs4_ol_stateid *stp;
+ struct nfs4_client *clp = lo->lo_owner.so_client;
- stp = nfs4_alloc_stateid();
+ stp = nfs4_alloc_stateid(clp);
if (stp == NULL)
- goto out;
- INIT_LIST_HEAD(&stp->st_hash);
- INIT_LIST_HEAD(&stp->st_perfile);
- INIT_LIST_HEAD(&stp->st_perstateowner);
- INIT_LIST_HEAD(&stp->st_lockowners); /* not used */
- list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]);
+ return NULL;
+ init_stid(&stp->st_stid, clp, NFS4_LOCK_STID);
list_add(&stp->st_perfile, &fp->fi_stateids);
- list_add(&stp->st_perstateowner, &sop->so_stateids);
- stp->st_stateowner = sop;
+ list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
+ stp->st_stateowner = &lo->lo_owner;
get_nfs4_file(fp);
stp->st_file = fp;
- stp->st_stateid.si_boot = boot_time;
- stp->st_stateid.si_stateownerid = sop->so_id;
- stp->st_stateid.si_fileid = fp->fi_id;
- stp->st_stateid.si_generation = 0;
stp->st_access_bmap = 0;
stp->st_deny_bmap = open_stp->st_deny_bmap;
stp->st_openstp = open_stp;
-
-out:
return stp;
}
@@ -3960,7 +3881,7 @@ check_lock_length(u64 offset, u64 length)
LOFF_OVERFLOW(offset, length)));
}
-static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access)
+static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
{
struct nfs4_file *fp = lock_stp->st_file;
int oflag = nfs4_access_to_omode(access);
@@ -3978,15 +3899,16 @@ __be32
nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_lock *lock)
{
- struct nfs4_stateowner *open_sop = NULL;
- struct nfs4_stateowner *lock_sop = NULL;
- struct nfs4_stateid *lock_stp;
+ struct nfs4_openowner *open_sop = NULL;
+ struct nfs4_lockowner *lock_sop = NULL;
+ struct nfs4_ol_stateid *lock_stp;
struct nfs4_file *fp;
struct file *filp = NULL;
struct file_lock file_lock;
struct file_lock conflock;
__be32 status = 0;
unsigned int strhashval;
+ int lkflg;
int err;
dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
@@ -4010,7 +3932,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* Use open owner and open stateid to create lock owner and
* lock stateid.
*/
- struct nfs4_stateid *open_stp = NULL;
+ struct nfs4_ol_stateid *open_stp = NULL;
status = nfserr_stale_clientid;
if (!nfsd4_has_session(cstate) &&
@@ -4018,26 +3940,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
/* validate and update open stateid and open seqid */
- status = nfs4_preprocess_seqid_op(cstate,
+ status = nfs4_preprocess_confirmed_seqid_op(cstate,
lock->lk_new_open_seqid,
&lock->lk_new_open_stateid,
- OPEN_STATE,
- &lock->lk_replay_owner, &open_stp,
- lock);
+ &open_stp);
if (status)
goto out;
- open_sop = lock->lk_replay_owner;
+ open_sop = openowner(open_stp->st_stateowner);
+ status = nfserr_bad_stateid;
+ if (!nfsd4_has_session(cstate) &&
+ !same_clid(&open_sop->oo_owner.so_client->cl_clientid,
+ &lock->v.new.clientid))
+ goto out;
/* create lockowner and lock stateid */
fp = open_stp->st_file;
- strhashval = lock_ownerstr_hashval(fp->fi_inode,
- open_sop->so_client->cl_clientid.cl_id,
+ strhashval = lock_ownerstr_hashval(fp->fi_inode,
+ open_sop->oo_owner.so_client->cl_clientid.cl_id,
&lock->v.new.owner);
/* XXX: Do we need to check for duplicate stateowners on
* the same file, or should they just be allowed (and
* create new stateids)? */
- status = nfserr_resource;
+ status = nfserr_jukebox;
lock_sop = alloc_init_lock_stateowner(strhashval,
- open_sop->so_client, open_stp, lock);
+ open_sop->oo_owner.so_client, open_stp, lock);
if (lock_sop == NULL)
goto out;
lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp);
@@ -4046,16 +3971,20 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
} else {
/* lock (lock owner + lock stateid) already exists */
status = nfs4_preprocess_seqid_op(cstate,
- lock->lk_old_lock_seqid,
- &lock->lk_old_lock_stateid,
- LOCK_STATE,
- &lock->lk_replay_owner, &lock_stp, lock);
+ lock->lk_old_lock_seqid,
+ &lock->lk_old_lock_stateid,
+ NFS4_LOCK_STID, &lock_stp);
if (status)
goto out;
- lock_sop = lock->lk_replay_owner;
+ lock_sop = lockowner(lock_stp->st_stateowner);
fp = lock_stp->st_file;
}
- /* lock->lk_replay_owner and lock_stp have been created or found */
+ /* lock_sop and lock_stp have been created or found */
+
+ lkflg = setlkflg(lock->lk_type);
+ status = nfs4_check_openmode(lock_stp, lkflg);
+ if (status)
+ goto out;
status = nfserr_grace;
if (locks_in_grace() && !lock->lk_reclaim)
@@ -4106,8 +4035,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock);
switch (-err) {
case 0: /* success! */
- update_stateid(&lock_stp->st_stateid);
- memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid,
+ update_stateid(&lock_stp->st_stid.sc_stateid);
+ memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid,
sizeof(stateid_t));
status = 0;
break;
@@ -4119,19 +4048,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
case (EDEADLK):
status = nfserr_deadlock;
break;
- default:
+ default:
dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err);
- status = nfserr_resource;
+ status = nfserrno(err);
break;
}
out:
if (status && lock->lk_is_new && lock_sop)
release_lockowner(lock_sop);
- if (lock->lk_replay_owner) {
- nfs4_get_stateowner(lock->lk_replay_owner);
- cstate->replay_owner = lock->lk_replay_owner;
- }
- nfs4_unlock_state();
+ if (!cstate->replay_owner)
+ nfs4_unlock_state();
return status;
}
@@ -4163,6 +4089,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
struct inode *inode;
struct file_lock file_lock;
+ struct nfs4_lockowner *lo;
int error;
__be32 status;
@@ -4172,19 +4099,14 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (check_lock_length(lockt->lt_offset, lockt->lt_length))
return nfserr_inval;
- lockt->lt_stateowner = NULL;
nfs4_lock_state();
status = nfserr_stale_clientid;
if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
goto out;
- if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
- dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n");
- if (status == nfserr_symlink)
- status = nfserr_inval;
+ if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
goto out;
- }
inode = cstate->current_fh.fh_dentry->d_inode;
locks_init_lock(&file_lock);
@@ -4203,10 +4125,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- lockt->lt_stateowner = find_lockstateowner_str(inode,
- &lockt->lt_clientid, &lockt->lt_owner);
- if (lockt->lt_stateowner)
- file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner;
+ lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner);
+ if (lo)
+ file_lock.fl_owner = (fl_owner_t)lo;
file_lock.fl_pid = current->tgid;
file_lock.fl_flags = FL_POSIX;
@@ -4234,7 +4155,7 @@ __be32
nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_locku *locku)
{
- struct nfs4_stateid *stp;
+ struct nfs4_ol_stateid *stp;
struct file *filp = NULL;
struct file_lock file_lock;
__be32 status;
@@ -4249,13 +4170,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
- if ((status = nfs4_preprocess_seqid_op(cstate,
- locku->lu_seqid,
- &locku->lu_stateid,
- LOCK_STATE,
- &locku->lu_stateowner, &stp, NULL)))
+ status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
+ &locku->lu_stateid, NFS4_LOCK_STID, &stp);
+ if (status)
goto out;
-
filp = find_any_file(stp->st_file);
if (!filp) {
status = nfserr_lock_range;
@@ -4264,7 +4182,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
BUG_ON(!filp);
locks_init_lock(&file_lock);
file_lock.fl_type = F_UNLCK;
- file_lock.fl_owner = (fl_owner_t) locku->lu_stateowner;
+ file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
file_lock.fl_pid = current->tgid;
file_lock.fl_file = filp;
file_lock.fl_flags = FL_POSIX;
@@ -4285,15 +4203,12 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/*
* OK, unlock succeeded; the only thing left to do is update the stateid.
*/
- update_stateid(&stp->st_stateid);
- memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t));
+ update_stateid(&stp->st_stid.sc_stateid);
+ memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
out:
- if (locku->lu_stateowner) {
- nfs4_get_stateowner(locku->lu_stateowner);
- cstate->replay_owner = locku->lu_stateowner;
- }
- nfs4_unlock_state();
+ if (!cstate->replay_owner)
+ nfs4_unlock_state();
return status;
out_nfserr:
@@ -4307,7 +4222,7 @@ out_nfserr:
* 0: no locks held by lockowner
*/
static int
-check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
+check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
{
struct file_lock **flpp;
struct inode *inode = filp->fi_inode;
@@ -4332,7 +4247,8 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
{
clientid_t *clid = &rlockowner->rl_clientid;
struct nfs4_stateowner *sop;
- struct nfs4_stateid *stp;
+ struct nfs4_lockowner *lo;
+ struct nfs4_ol_stateid *stp;
struct xdr_netobj *owner = &rlockowner->rl_owner;
struct list_head matches;
int i;
@@ -4356,16 +4272,15 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
* data structures. */
INIT_LIST_HEAD(&matches);
for (i = 0; i < LOCK_HASH_SIZE; i++) {
- list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) {
+ list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) {
if (!same_owner_str(sop, owner, clid))
continue;
list_for_each_entry(stp, &sop->so_stateids,
st_perstateowner) {
- if (check_for_locks(stp->st_file, sop))
+ lo = lockowner(sop);
+ if (check_for_locks(stp->st_file, lo))
goto out;
- /* Note: so_perclient unused for lockowners,
- * so it's OK to fool with here. */
- list_add(&sop->so_perclient, &matches);
+ list_add(&lo->lo_list, &matches);
}
}
}
@@ -4374,12 +4289,12 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
* have been checked. */
status = nfs_ok;
while (!list_empty(&matches)) {
- sop = list_entry(matches.next, struct nfs4_stateowner,
- so_perclient);
+ lo = list_entry(matches.next, struct nfs4_lockowner,
+ lo_list);
/* unhash_stateowner deletes so_perclient only
* for openowners. */
- list_del(&sop->so_perclient);
- release_lockowner(sop);
+ list_del(&lo->lo_list);
+ release_lockowner(lo);
}
out:
nfs4_unlock_state();
@@ -4501,16 +4416,10 @@ nfs4_state_init(void)
for (i = 0; i < FILE_HASH_SIZE; i++) {
INIT_LIST_HEAD(&file_hashtbl[i]);
}
- for (i = 0; i < OWNER_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&ownerstr_hashtbl[i]);
- INIT_LIST_HEAD(&ownerid_hashtbl[i]);
- }
- for (i = 0; i < STATEID_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&stateid_hashtbl[i]);
- INIT_LIST_HEAD(&lockstateid_hashtbl[i]);
+ for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) {
+ INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]);
}
for (i = 0; i < LOCK_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]);
INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]);
}
memset(&onestateid, ~0, sizeof(stateid_t));
@@ -4527,7 +4436,7 @@ nfsd4_load_reboot_recovery_data(void)
int status;
nfs4_lock_state();
- nfsd4_init_recdir(user_recovery_dirname);
+ nfsd4_init_recdir();
status = nfsd4_recdir_load();
nfs4_unlock_state();
if (status)
@@ -4636,40 +4545,3 @@ nfs4_state_shutdown(void)
nfs4_unlock_state();
nfsd4_destroy_callback_queue();
}
-
-/*
- * user_recovery_dirname is protected by the nfsd_mutex since it's only
- * accessed when nfsd is starting.
- */
-static void
-nfs4_set_recdir(char *recdir)
-{
- strcpy(user_recovery_dirname, recdir);
-}
-
-/*
- * Change the NFSv4 recovery directory to recdir.
- */
-int
-nfs4_reset_recoverydir(char *recdir)
-{
- int status;
- struct path path;
-
- status = kern_path(recdir, LOOKUP_FOLLOW, &path);
- if (status)
- return status;
- status = -ENOTDIR;
- if (S_ISDIR(path.dentry->d_inode->i_mode)) {
- nfs4_set_recdir(recdir);
- status = 0;
- }
- path_put(&path);
- return status;
-}
-
-char *
-nfs4_recoverydir(void)
-{
- return user_recovery_dirname;
-}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c8bf405d19d..b6fa792d6b8 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -456,7 +456,6 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
{
DECODE_HEAD;
- close->cl_stateowner = NULL;
READ_BUF(4);
READ32(close->cl_seqid);
return nfsd4_decode_stateid(argp, &close->cl_stateid);
@@ -551,7 +550,6 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
{
DECODE_HEAD;
- lock->lk_replay_owner = NULL;
/*
* type, reclaim(boolean), offset, length, new_lock_owner(boolean)
*/
@@ -611,7 +609,6 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
{
DECODE_HEAD;
- locku->lu_stateowner = NULL;
READ_BUF(8);
READ32(locku->lu_type);
if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
@@ -642,6 +639,83 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
DECODE_TAIL;
}
+static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x)
+{
+ __be32 *p;
+ u32 w;
+
+ READ_BUF(4);
+ READ32(w);
+ *x = w;
+ switch (w & NFS4_SHARE_ACCESS_MASK) {
+ case NFS4_SHARE_ACCESS_READ:
+ case NFS4_SHARE_ACCESS_WRITE:
+ case NFS4_SHARE_ACCESS_BOTH:
+ break;
+ default:
+ return nfserr_bad_xdr;
+ }
+ w &= ~NFS4_SHARE_ACCESS_MASK;
+ if (!w)
+ return nfs_ok;
+ if (!argp->minorversion)
+ return nfserr_bad_xdr;
+ switch (w & NFS4_SHARE_WANT_MASK) {
+ case NFS4_SHARE_WANT_NO_PREFERENCE:
+ case NFS4_SHARE_WANT_READ_DELEG:
+ case NFS4_SHARE_WANT_WRITE_DELEG:
+ case NFS4_SHARE_WANT_ANY_DELEG:
+ case NFS4_SHARE_WANT_NO_DELEG:
+ case NFS4_SHARE_WANT_CANCEL:
+ break;
+ default:
+ return nfserr_bad_xdr;
+ }
+ w &= ~NFS4_SHARE_WANT_MASK;
+ if (!w)
+ return nfs_ok;
+ switch (w) {
+ case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL:
+ case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED:
+ case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL |
+ NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED):
+ return nfs_ok;
+ }
+xdr_error:
+ return nfserr_bad_xdr;
+}
+
+static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x)
+{
+ __be32 *p;
+
+ READ_BUF(4);
+ READ32(*x);
+ /* Note: unlinke access bits, deny bits may be zero. */
+ if (*x & ~NFS4_SHARE_DENY_BOTH)
+ return nfserr_bad_xdr;
+ return nfs_ok;
+xdr_error:
+ return nfserr_bad_xdr;
+}
+
+static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o)
+{
+ __be32 *p;
+
+ READ_BUF(4);
+ READ32(o->len);
+
+ if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT)
+ return nfserr_bad_xdr;
+
+ READ_BUF(o->len);
+ SAVEMEM(o->data, o->len);
+ return nfs_ok;
+xdr_error:
+ return nfserr_bad_xdr;
+}
+
static __be32
nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
{
@@ -649,19 +723,23 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
memset(open->op_bmval, 0, sizeof(open->op_bmval));
open->op_iattr.ia_valid = 0;
- open->op_stateowner = NULL;
+ open->op_openowner = NULL;
/* seqid, share_access, share_deny, clientid, ownerlen */
- READ_BUF(16 + sizeof(clientid_t));
+ READ_BUF(4);
READ32(open->op_seqid);
- READ32(open->op_share_access);
- READ32(open->op_share_deny);
+ status = nfsd4_decode_share_access(argp, &open->op_share_access);
+ if (status)
+ goto xdr_error;
+ status = nfsd4_decode_share_deny(argp, &open->op_share_deny);
+ if (status)
+ goto xdr_error;
+ READ_BUF(sizeof(clientid_t));
COPYMEM(&open->op_clientid, sizeof(clientid_t));
- READ32(open->op_owner.len);
-
- /* owner, open_flag */
- READ_BUF(open->op_owner.len + 4);
- SAVEMEM(open->op_owner.data, open->op_owner.len);
+ status = nfsd4_decode_opaque(argp, &open->op_owner);
+ if (status)
+ goto xdr_error;
+ READ_BUF(4);
READ32(open->op_create);
switch (open->op_create) {
case NFS4_OPEN_NOCREATE:
@@ -727,6 +805,19 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval)))
return status;
break;
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ if (argp->minorversion < 1)
+ goto xdr_error;
+ /* void */
+ break;
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ if (argp->minorversion < 1)
+ goto xdr_error;
+ status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+ if (status)
+ return status;
+ break;
default:
goto xdr_error;
}
@@ -739,7 +830,6 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
{
DECODE_HEAD;
- open_conf->oc_stateowner = NULL;
status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
if (status)
return status;
@@ -754,15 +844,17 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
{
DECODE_HEAD;
- open_down->od_stateowner = NULL;
status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
if (status)
return status;
- READ_BUF(12);
+ READ_BUF(4);
READ32(open_down->od_seqid);
- READ32(open_down->od_share_access);
- READ32(open_down->od_share_deny);
-
+ status = nfsd4_decode_share_access(argp, &open_down->od_share_access);
+ if (status)
+ return status;
+ status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny);
+ if (status)
+ return status;
DECODE_TAIL;
}
@@ -903,12 +995,13 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
{
DECODE_HEAD;
- READ_BUF(12);
+ READ_BUF(8);
COPYMEM(setclientid->se_verf.data, 8);
- READ32(setclientid->se_namelen);
- READ_BUF(setclientid->se_namelen + 8);
- SAVEMEM(setclientid->se_name, setclientid->se_namelen);
+ status = nfsd4_decode_opaque(argp, &setclientid->se_name);
+ if (status)
+ return nfserr_bad_xdr;
+ READ_BUF(8);
READ32(setclientid->se_callback_prog);
READ32(setclientid->se_callback_netid_len);
@@ -1051,11 +1144,9 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
READ_BUF(NFS4_VERIFIER_SIZE);
COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
- READ_BUF(4);
- READ32(exid->clname.len);
-
- READ_BUF(exid->clname.len);
- SAVEMEM(exid->clname.data, exid->clname.len);
+ status = nfsd4_decode_opaque(argp, &exid->clname);
+ if (status)
+ return nfserr_bad_xdr;
READ_BUF(4);
READ32(exid->flags);
@@ -1326,6 +1417,16 @@ xdr_error:
goto out;
}
+static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc)
+{
+ DECODE_HEAD;
+
+ READ_BUF(8);
+ COPYMEM(&dc->clientid, 8);
+
+ DECODE_TAIL;
+}
+
static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
{
DECODE_HEAD;
@@ -1447,7 +1548,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
[OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid,
[OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
[OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
};
@@ -1630,15 +1731,20 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
* we know whether the error to be returned is a sequence id mutating error.
*/
-#define ENCODE_SEQID_OP_TAIL(stateowner) do { \
- if (seqid_mutating_err(nfserr) && stateowner) { \
- stateowner->so_seqid++; \
- stateowner->so_replay.rp_status = nfserr; \
- stateowner->so_replay.rp_buflen = \
- (((char *)(resp)->p - (char *)save)); \
- memcpy(stateowner->so_replay.rp_buf, save, \
- stateowner->so_replay.rp_buflen); \
- } } while (0);
+static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr)
+{
+ struct nfs4_stateowner *stateowner = resp->cstate.replay_owner;
+
+ if (seqid_mutating_err(ntohl(nfserr)) && stateowner) {
+ stateowner->so_seqid++;
+ stateowner->so_replay.rp_status = nfserr;
+ stateowner->so_replay.rp_buflen =
+ (char *)resp->p - (char *)save;
+ memcpy(stateowner->so_replay.rp_buf, save,
+ stateowner->so_replay.rp_buflen);
+ nfsd4_purge_closed_stateid(stateowner);
+ }
+}
/* Encode as an array of strings the string given with components
* separated @sep.
@@ -1697,36 +1803,89 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
}
/*
- * Return the path to an export point in the pseudo filesystem namespace
- * Returned string is safe to use as long as the caller holds a reference
- * to @exp.
+ * Encode a path in RFC3530 'pathname4' format
*/
-static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat)
+static __be32 nfsd4_encode_path(const struct path *root,
+ const struct path *path, __be32 **pp, int *buflen)
{
- struct svc_fh tmp_fh;
- char *path = NULL, *rootpath;
- size_t rootlen;
+ struct path cur = {
+ .mnt = path->mnt,
+ .dentry = path->dentry,
+ };
+ __be32 *p = *pp;
+ struct dentry **components = NULL;
+ unsigned int ncomponents = 0;
+ __be32 err = nfserr_jukebox;
- fh_init(&tmp_fh, NFS4_FHSIZE);
- *stat = exp_pseudoroot(rqstp, &tmp_fh);
- if (*stat)
- return NULL;
- rootpath = tmp_fh.fh_export->ex_pathname;
+ dprintk("nfsd4_encode_components(");
- path = exp->ex_pathname;
+ path_get(&cur);
+ /* First walk the path up to the nfsd root, and store the
+ * dentries/path components in an array.
+ */
+ for (;;) {
+ if (cur.dentry == root->dentry && cur.mnt == root->mnt)
+ break;
+ if (cur.dentry == cur.mnt->mnt_root) {
+ if (follow_up(&cur))
+ continue;
+ goto out_free;
+ }
+ if ((ncomponents & 15) == 0) {
+ struct dentry **new;
+ new = krealloc(components,
+ sizeof(*new) * (ncomponents + 16),
+ GFP_KERNEL);
+ if (!new)
+ goto out_free;
+ components = new;
+ }
+ components[ncomponents++] = cur.dentry;
+ cur.dentry = dget_parent(cur.dentry);
+ }
- rootlen = strlen(rootpath);
- if (strncmp(path, rootpath, rootlen)) {
- dprintk("nfsd: fs_locations failed;"
- "%s is not contained in %s\n", path, rootpath);
- *stat = nfserr_notsupp;
- path = NULL;
- goto out;
+ *buflen -= 4;
+ if (*buflen < 0)
+ goto out_free;
+ WRITE32(ncomponents);
+
+ while (ncomponents) {
+ struct dentry *dentry = components[ncomponents - 1];
+ unsigned int len = dentry->d_name.len;
+
+ *buflen -= 4 + (XDR_QUADLEN(len) << 2);
+ if (*buflen < 0)
+ goto out_free;
+ WRITE32(len);
+ WRITEMEM(dentry->d_name.name, len);
+ dprintk("/%s", dentry->d_name.name);
+ dput(dentry);
+ ncomponents--;
}
- path += rootlen;
-out:
- fh_put(&tmp_fh);
- return path;
+
+ *pp = p;
+ err = 0;
+out_free:
+ dprintk(")\n");
+ while (ncomponents)
+ dput(components[--ncomponents]);
+ kfree(components);
+ path_put(&cur);
+ return err;
+}
+
+static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp,
+ const struct path *path, __be32 **pp, int *buflen)
+{
+ struct svc_export *exp_ps;
+ __be32 res;
+
+ exp_ps = rqst_find_fsidzero_export(rqstp);
+ if (IS_ERR(exp_ps))
+ return nfserrno(PTR_ERR(exp_ps));
+ res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen);
+ exp_put(exp_ps);
+ return res;
}
/*
@@ -1740,11 +1899,8 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
int i;
__be32 *p = *pp;
struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
- char *root = nfsd4_path(rqstp, exp, &status);
- if (status)
- return status;
- status = nfsd4_encode_components('/', root, &p, buflen);
+ status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen);
if (status)
return status;
if ((*buflen -= 4) < 0)
@@ -1760,12 +1916,19 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
return 0;
}
-static u32 nfs4_ftypes[16] = {
- NF4BAD, NF4FIFO, NF4CHR, NF4BAD,
- NF4DIR, NF4BAD, NF4BLK, NF4BAD,
- NF4REG, NF4BAD, NF4LNK, NF4BAD,
- NF4SOCK, NF4BAD, NF4LNK, NF4BAD,
-};
+static u32 nfs4_file_type(umode_t mode)
+{
+ switch (mode & S_IFMT) {
+ case S_IFIFO: return NF4FIFO;
+ case S_IFCHR: return NF4CHR;
+ case S_IFDIR: return NF4DIR;
+ case S_IFBLK: return NF4BLK;
+ case S_IFLNK: return NF4LNK;
+ case S_IFREG: return NF4REG;
+ case S_IFSOCK: return NF4SOCK;
+ default: return NF4BAD;
+ };
+}
static __be32
nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
@@ -1954,7 +2117,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
if (bmval0 & FATTR4_WORD0_TYPE) {
if ((buflen -= 4) < 0)
goto out_resource;
- dummy = nfs4_ftypes[(stat.mode & S_IFMT) >> 12];
+ dummy = nfs4_file_type(stat.mode);
if (dummy == NF4BAD)
goto out_serverfault;
WRITE32(dummy);
@@ -2488,7 +2651,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
if (!nfserr)
nfsd4_encode_stateid(resp, &close->cl_stateid);
- ENCODE_SEQID_OP_TAIL(close->cl_stateowner);
+ encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2564,17 +2727,18 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
static void
nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld)
{
+ struct xdr_netobj *conf = &ld->ld_owner;
__be32 *p;
- RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0));
+ RESERVE_SPACE(32 + XDR_LEN(conf->len));
WRITE64(ld->ld_start);
WRITE64(ld->ld_length);
WRITE32(ld->ld_type);
- if (ld->ld_sop) {
+ if (conf->len) {
WRITEMEM(&ld->ld_clientid, 8);
- WRITE32(ld->ld_sop->so_owner.len);
- WRITEMEM(ld->ld_sop->so_owner.data, ld->ld_sop->so_owner.len);
- kref_put(&ld->ld_sop->so_ref, nfs4_free_stateowner);
+ WRITE32(conf->len);
+ WRITEMEM(conf->data, conf->len);
+ kfree(conf->data);
} else { /* non - nfsv4 lock in conflict, no clientid nor owner */
WRITE64((u64)0); /* clientid */
WRITE32(0); /* length of owner name */
@@ -2592,7 +2756,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
else if (nfserr == nfserr_denied)
nfsd4_encode_lock_denied(resp, &lock->lk_denied);
- ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
+ encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2612,7 +2776,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
if (!nfserr)
nfsd4_encode_stateid(resp, &locku->lu_stateid);
- ENCODE_SEQID_OP_TAIL(locku->lu_stateowner);
+ encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2693,7 +2857,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
}
/* XXX save filehandle here */
out:
- ENCODE_SEQID_OP_TAIL(open->op_stateowner);
+ encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2705,7 +2869,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct
if (!nfserr)
nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
- ENCODE_SEQID_OP_TAIL(oc->oc_stateowner);
+ encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2717,7 +2881,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc
if (!nfserr)
nfsd4_encode_stateid(resp, &od->od_stateid);
- ENCODE_SEQID_OP_TAIL(od->od_stateowner);
+ encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2759,8 +2923,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
&maxcount);
- if (nfserr == nfserr_symlink)
- nfserr = nfserr_inval;
if (nfserr)
return nfserr;
eof = (read->rd_offset + maxcount >=
@@ -2886,8 +3048,6 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
readdir->common.err == nfserr_toosmall &&
readdir->buffer == page)
nfserr = nfserr_toosmall;
- if (nfserr == nfserr_symlink)
- nfserr = nfserr_notdir;
if (nfserr)
goto err_no_verf;
@@ -3218,9 +3378,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
WRITE32(seq->seqid);
WRITE32(seq->slotid);
- WRITE32(seq->maxslots);
- /* For now: target_maxslots = maxslots */
- WRITE32(seq->maxslots);
+ /* Note slotid's are numbered from zero: */
+ WRITE32(seq->maxslots - 1); /* sr_highest_slotid */
+ WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */
WRITE32(seq->status_flags);
ADJUST_ARGS();
@@ -3233,6 +3393,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
struct nfsd4_test_stateid *test_stateid)
{
struct nfsd4_compoundargs *argp;
+ struct nfs4_client *cl = resp->cstate.session->se_client;
stateid_t si;
__be32 *p;
int i;
@@ -3248,7 +3409,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
nfs4_lock_state();
for (i = 0; i < test_stateid->ts_num_ids; i++) {
nfsd4_decode_stateid(argp, &si);
- valid = nfs4_validate_stateid(&si, test_stateid->ts_has_session);
+ valid = nfs4_validate_stateid(cl, &si);
RESERVE_SPACE(4);
*p++ = htonl(valid);
resp->p = p;
@@ -3334,34 +3495,29 @@ static nfsd4_enc nfsd4_enc_ops[] = {
/*
* Calculate the total amount of memory that the compound response has taken
- * after encoding the current operation.
+ * after encoding the current operation with pad.
*
- * pad: add on 8 bytes for the next operation's op_code and status so that
- * there is room to cache a failure on the next operation.
+ * pad: if operation is non-idempotent, pad was calculate by op_rsize_bop()
+ * which was specified at nfsd4_operation, else pad is zero.
*
- * Compare this length to the session se_fmaxresp_cached.
+ * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached.
*
* Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
* will be at least a page and will therefore hold the xdr_buf head.
*/
-static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
+int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
{
- int status = 0;
struct xdr_buf *xb = &resp->rqstp->rq_res;
- struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
struct nfsd4_session *session = NULL;
struct nfsd4_slot *slot = resp->cstate.slot;
- u32 length, tlen = 0, pad = 8;
+ u32 length, tlen = 0;
if (!nfsd4_has_session(&resp->cstate))
- return status;
+ return 0;
session = resp->cstate.session;
- if (session == NULL || slot->sl_cachethis == 0)
- return status;
-
- if (resp->opcnt >= args->opcnt)
- pad = 0; /* this is the last operation */
+ if (session == NULL)
+ return 0;
if (xb->page_len == 0) {
length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
@@ -3374,10 +3530,14 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
length, xb->page_len, tlen, pad);
- if (length <= session->se_fchannel.maxresp_cached)
- return status;
- else
+ if (length > session->se_fchannel.maxresp_sz)
+ return nfserr_rep_too_big;
+
+ if (slot->sl_cachethis == 1 &&
+ length > session->se_fchannel.maxresp_cached)
return nfserr_rep_too_big_to_cache;
+
+ return 0;
}
void
@@ -3397,8 +3557,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
!nfsd4_enc_ops[op->opnum]);
op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
/* nfsd4_check_drc_limit guarantees enough room for error status */
- if (!op->status && nfsd4_check_drc_limit(resp))
- op->status = nfserr_rep_too_big_to_cache;
+ if (!op->status)
+ op->status = nfsd4_check_resp_size(resp, 0);
status:
/*
* Note: We write the status directly, instead of using WRITE32(),
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c7716143cbd..c45a2ea4a09 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -9,11 +9,11 @@
#include <linux/ctype.h>
#include <linux/sunrpc/svcsock.h>
-#include <linux/nfsd/syscall.h>
#include <linux/lockd/lockd.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/gss_api.h>
#include <linux/sunrpc/gss_krb5_enctypes.h>
+#include <linux/module.h>
#include "idmap.h"
#include "nfsd.h"
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 7ecfa242030..58134a23fdf 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -11,13 +11,39 @@
#include <linux/types.h>
#include <linux/mount.h>
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/sunrpc/msg_prot.h>
+
#include <linux/nfsd/debug.h>
#include <linux/nfsd/export.h>
#include <linux/nfsd/stats.h>
+
/*
* nfsd version
*/
#define NFSD_SUPPORTED_MINOR_VERSION 1
+/*
+ * Maximum blocksizes supported by daemon under various circumstances.
+ */
+#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD
+/* NFSv2 is limited by the protocol specification, see RFC 1094 */
+#define NFSSVC_MAXBLKSIZE_V2 (8*1024)
+
+
+/*
+ * Largest number of bytes we need to allocate for an NFS
+ * call or reply. Used to control buffer sizes. We use
+ * the length of v3 WRITE, READDIR and READDIR replies
+ * which are an RPC header, up to 26 XDR units of reply
+ * data, and some page data.
+ *
+ * Note that accuracy here doesn't matter too much as the
+ * size is rounded up to a page size when allocating space.
+ */
+#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE)
struct readdir_cd {
__be32 err; /* 0, nfserr, or nfserr_eof */
@@ -335,6 +361,13 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \
NFSD_WRITEABLE_ATTRS_WORD2
+extern int nfsd4_is_junction(struct dentry *dentry);
+#else
+static inline int nfsd4_is_junction(struct dentry *dentry)
+{
+ return 0;
+}
+
#endif /* CONFIG_NFSD_V4 */
#endif /* LINUX_NFSD_NFSD_H */
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 90c6aa6d5e0..c763de5c115 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -59,28 +59,25 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
* the write call).
*/
static inline __be32
-nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
+nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int requested)
{
- /* Type can be negative when creating hardlinks - not to a dir */
- if (type > 0 && (mode & S_IFMT) != type) {
- if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK)
- return nfserr_symlink;
- else if (type == S_IFDIR)
- return nfserr_notdir;
- else if ((mode & S_IFMT) == S_IFDIR)
- return nfserr_isdir;
- else
- return nfserr_inval;
- }
- if (type < 0 && (mode & S_IFMT) == -type) {
- if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK)
- return nfserr_symlink;
- else if (type == -S_IFDIR)
- return nfserr_isdir;
- else
- return nfserr_notdir;
- }
- return 0;
+ mode &= S_IFMT;
+
+ if (requested == 0) /* the caller doesn't care */
+ return nfs_ok;
+ if (mode == requested)
+ return nfs_ok;
+ /*
+ * v4 has an error more specific than err_notdir which we should
+ * return in preference to err_notdir:
+ */
+ if (rqstp->rq_vers == 4 && mode == S_IFLNK)
+ return nfserr_symlink;
+ if (requested == S_IFDIR)
+ return nfserr_notdir;
+ if (mode == S_IFDIR)
+ return nfserr_isdir;
+ return nfserr_inval;
}
static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index dc5a1bf476b..eda7d7e55e0 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/freezer.h>
+#include <linux/module.h>
#include <linux/fs_struct.h>
#include <linux/swap.h>
@@ -256,6 +257,8 @@ static void nfsd_last_thread(struct svc_serv *serv)
nfsd_serv = NULL;
nfsd_shutdown();
+ svc_rpcb_cleanup(serv);
+
printk(KERN_WARNING "nfsd: last server has exited, flushing export "
"cache\n");
nfsd_export_flush();
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4eefaf1b42e..a3cf38476a1 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
#ifndef _NFSD4_STATE_H
#define _NFSD4_STATE_H
+#include <linux/idr.h>
#include <linux/sunrpc/svc_xprt.h>
#include <linux/nfsd/nfsfh.h>
#include "nfsfh.h"
@@ -45,24 +46,20 @@ typedef struct {
} clientid_t;
typedef struct {
- u32 so_boot;
- u32 so_stateownerid;
- u32 so_fileid;
+ clientid_t so_clid;
+ u32 so_id;
} stateid_opaque_t;
typedef struct {
u32 si_generation;
stateid_opaque_t si_opaque;
} stateid_t;
-#define si_boot si_opaque.so_boot
-#define si_stateownerid si_opaque.so_stateownerid
-#define si_fileid si_opaque.so_fileid
#define STATEID_FMT "(%08x/%08x/%08x/%08x)"
#define STATEID_VAL(s) \
- (s)->si_boot, \
- (s)->si_stateownerid, \
- (s)->si_fileid, \
+ (s)->si_opaque.so_clid.cl_boot, \
+ (s)->si_opaque.so_clid.cl_id, \
+ (s)->si_opaque.so_id, \
(s)->si_generation
struct nfsd4_callback {
@@ -76,17 +73,27 @@ struct nfsd4_callback {
bool cb_done;
};
+struct nfs4_stid {
+#define NFS4_OPEN_STID 1
+#define NFS4_LOCK_STID 2
+#define NFS4_DELEG_STID 4
+/* For an open stateid kept around *only* to process close replays: */
+#define NFS4_CLOSED_STID 8
+ unsigned char sc_type;
+ stateid_t sc_stateid;
+ struct nfs4_client *sc_client;
+};
+
struct nfs4_delegation {
+ struct nfs4_stid dl_stid; /* must be first field */
struct list_head dl_perfile;
struct list_head dl_perclnt;
struct list_head dl_recall_lru; /* delegation recalled */
atomic_t dl_count; /* ref count */
- struct nfs4_client *dl_client;
struct nfs4_file *dl_file;
u32 dl_type;
time_t dl_time;
/* For recall: */
- stateid_t dl_stateid;
struct knfsd_fh dl_fh;
int dl_retries;
struct nfsd4_callback dl_recall;
@@ -104,6 +111,11 @@ struct nfs4_cb_conn {
struct svc_xprt *cb_xprt; /* minorversion 1 only */
};
+static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
+{
+ return container_of(s, struct nfs4_delegation, dl_stid);
+}
+
/* Maximum number of slots per session. 160 is useful for long haul TCP */
#define NFSD_MAX_SLOTS_PER_SESSION 160
/* Maximum number of operations per session compound */
@@ -220,6 +232,7 @@ struct nfs4_client {
struct list_head cl_idhash; /* hash by cl_clientid.id */
struct list_head cl_strhash; /* hash by cl_name */
struct list_head cl_openowners;
+ struct idr cl_stateids; /* stateid lookup */
struct list_head cl_delegations;
struct list_head cl_lru; /* tail queue */
struct xdr_netobj cl_name; /* id generated by client */
@@ -245,6 +258,7 @@ struct nfs4_client {
#define NFSD4_CB_UP 0
#define NFSD4_CB_UNKNOWN 1
#define NFSD4_CB_DOWN 2
+#define NFSD4_CB_FAULT 3
int cl_cb_state;
struct nfsd4_callback cl_cb_null;
struct nfsd4_session *cl_cb_session;
@@ -293,6 +307,9 @@ static inline void
update_stateid(stateid_t *stateid)
{
stateid->si_generation++;
+ /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
+ if (stateid->si_generation == 0)
+ stateid->si_generation = 1;
}
/* A reasonable value for REPLAY_ISIZE was estimated as follows:
@@ -312,49 +329,57 @@ struct nfs4_replay {
__be32 rp_status;
unsigned int rp_buflen;
char *rp_buf;
- unsigned intrp_allocated;
struct knfsd_fh rp_openfh;
char rp_ibuf[NFSD4_REPLAY_ISIZE];
};
-/*
-* nfs4_stateowner can either be an open_owner, or a lock_owner
-*
-* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[]
-* for lock_owner
-* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[]
-* for lock_owner
-* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client
-* struct is reaped.
-* so_perfilestate: heads the list of nfs4_stateid (either open or lock)
-* and is used to ensure no dangling nfs4_stateid references when we
-* release a stateowner.
-* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when
-* close is called to reap associated byte-range locks
-* so_close_lru: (open) stateowner is placed on this list instead of being
-* reaped (when so_perfilestate is empty) to hold the last close replay.
-* reaped by laundramat thread after lease period.
-*/
struct nfs4_stateowner {
- struct kref so_ref;
- struct list_head so_idhash; /* hash by so_id */
struct list_head so_strhash; /* hash by op_name */
- struct list_head so_perclient;
struct list_head so_stateids;
- struct list_head so_perstateid; /* for lockowners only */
- struct list_head so_close_lru; /* tail queue */
- time_t so_time; /* time of placement on so_close_lru */
- int so_is_open_owner; /* 1=openowner,0=lockowner */
- u32 so_id;
struct nfs4_client * so_client;
/* after increment in ENCODE_SEQID_OP_TAIL, represents the next
* sequence id expected from the client: */
u32 so_seqid;
struct xdr_netobj so_owner; /* open owner name */
- int so_confirmed; /* successful OPEN_CONFIRM? */
struct nfs4_replay so_replay;
+ bool so_is_open_owner;
};
+struct nfs4_openowner {
+ struct nfs4_stateowner oo_owner; /* must be first field */
+ struct list_head oo_perclient;
+ /*
+ * We keep around openowners a little while after last close,
+ * which saves clients from having to confirm, and allows us to
+ * handle close replays if they come soon enough. The close_lru
+ * is a list of such openowners, to be reaped by the laundromat
+ * thread eventually if they remain unused:
+ */
+ struct list_head oo_close_lru;
+ struct nfs4_ol_stateid *oo_last_closed_stid;
+ time_t oo_time; /* time of placement on so_close_lru */
+#define NFS4_OO_CONFIRMED 1
+#define NFS4_OO_PURGE_CLOSE 2
+#define NFS4_OO_NEW 4
+ unsigned char oo_flags;
+};
+
+struct nfs4_lockowner {
+ struct nfs4_stateowner lo_owner; /* must be first element */
+ struct list_head lo_perstateid; /* for lockowners only */
+ struct list_head lo_list; /* for temporary uses */
+};
+
+static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so)
+{
+ return container_of(so, struct nfs4_openowner, oo_owner);
+}
+
+static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
+{
+ return container_of(so, struct nfs4_lockowner, lo_owner);
+}
+
/*
* nfs4_file: a file opened by some number of (open) nfs4_stateowners.
* o fi_perfile list is used to search for conflicting
@@ -368,17 +393,17 @@ struct nfs4_file {
/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
struct file * fi_fds[3];
/*
- * Each open or lock stateid contributes 1 to either
- * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending
- * on open or lock mode:
+ * Each open or lock stateid contributes 0-4 to the counts
+ * below depending on which bits are set in st_access_bitmap:
+ * 1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCES_READ is set
+ * + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set
+ * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set.
*/
atomic_t fi_access[2];
struct file *fi_deleg_file;
struct file_lock *fi_lease;
atomic_t fi_delegees;
struct inode *fi_inode;
- u32 fi_id; /* used with stateowner->so_id
- * for stateid_hashtbl hash */
bool fi_had_conflict;
};
@@ -408,50 +433,27 @@ static inline struct file *find_any_file(struct nfs4_file *f)
return f->fi_fds[O_RDONLY];
}
-/*
-* nfs4_stateid can either be an open stateid or (eventually) a lock stateid
-*
-* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file
-*
-* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry
-* st_perfile: file_hashtbl[] entry.
-* st_perfile_state: nfs4_stateowner->so_perfilestate
-* st_perlockowner: (open stateid) list of lock nfs4_stateowners
-* st_access_bmap: used only for open stateid
-* st_deny_bmap: used only for open stateid
-* st_openstp: open stateid lock stateid was derived from
-*
-* XXX: open stateids and lock stateids have diverged sufficiently that
-* we should consider defining separate structs for the two cases.
-*/
-
-struct nfs4_stateid {
- struct list_head st_hash;
+/* "ol" stands for "Open or Lock". Better suggestions welcome. */
+struct nfs4_ol_stateid {
+ struct nfs4_stid st_stid; /* must be first field */
struct list_head st_perfile;
struct list_head st_perstateowner;
struct list_head st_lockowners;
struct nfs4_stateowner * st_stateowner;
struct nfs4_file * st_file;
- stateid_t st_stateid;
unsigned long st_access_bmap;
unsigned long st_deny_bmap;
- struct nfs4_stateid * st_openstp;
+ struct nfs4_ol_stateid * st_openstp;
};
+static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
+{
+ return container_of(s, struct nfs4_ol_stateid, st_stid);
+}
+
/* flags for preprocess_seqid_op() */
-#define HAS_SESSION 0x00000001
-#define CONFIRM 0x00000002
-#define OPEN_STATE 0x00000004
-#define LOCK_STATE 0x00000008
#define RD_STATE 0x00000010
#define WR_STATE 0x00000020
-#define CLOSE_STATE 0x00000040
-
-#define seqid_mutating_err(err) \
- (((err) != nfserr_stale_clientid) && \
- ((err) != nfserr_bad_seqid) && \
- ((err) != nfserr_stale_stateid) && \
- ((err) != nfserr_bad_stateid))
struct nfsd4_compound_state;
@@ -461,7 +463,8 @@ extern void nfs4_lock_state(void);
extern void nfs4_unlock_state(void);
extern int nfs4_in_grace(void);
extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
-extern void nfs4_free_stateowner(struct kref *kref);
+extern void nfs4_free_openowner(struct nfs4_openowner *);
+extern void nfs4_free_lockowner(struct nfs4_lockowner *);
extern int set_callback_cred(void);
extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
@@ -473,7 +476,7 @@ extern void nfsd4_destroy_callback_queue(void);
extern void nfsd4_shutdown_callback(struct nfs4_client *);
extern void nfs4_put_delegation(struct nfs4_delegation *dp);
extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
-extern void nfsd4_init_recdir(char *recdir_name);
+extern void nfsd4_init_recdir(void);
extern int nfsd4_recdir_load(void);
extern void nfsd4_shutdown_recdir(void);
extern int nfs4_client_to_reclaim(const char *name);
@@ -482,18 +485,7 @@ extern void nfsd4_recdir_purge_old(void);
extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
extern void release_session_client(struct nfsd4_session *);
-extern __be32 nfs4_validate_stateid(stateid_t *, int);
-
-static inline void
-nfs4_put_stateowner(struct nfs4_stateowner *so)
-{
- kref_put(&so->so_ref, nfs4_free_stateowner);
-}
-
-static inline void
-nfs4_get_stateowner(struct nfs4_stateowner *so)
-{
- kref_get(&so->so_ref);
-}
+extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *);
+extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fd0acca5370..7a2e442623c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -168,6 +168,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
{
if (d_mountpoint(dentry))
return 1;
+ if (nfsd4_is_junction(dentry))
+ return 1;
if (!(exp->ex_flags & NFSEXP_V4ROOT))
return 0;
return dentry->d_inode != NULL;
@@ -502,7 +504,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int flags = 0;
/* Get inode */
- error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
+ error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
if (error)
return error;
@@ -592,6 +594,22 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
return error;
}
+#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction."
+#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type"
+int nfsd4_is_junction(struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+
+ if (inode == NULL)
+ return 0;
+ if (inode->i_mode & S_IXUGO)
+ return 0;
+ if (!(inode->i_mode & S_ISVTX))
+ return 0;
+ if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0)
+ return 0;
+ return 1;
+}
#endif /* defined(CONFIG_NFSD_V4) */
#ifdef CONFIG_NFSD_V3
@@ -1352,7 +1370,7 @@ __be32
do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *fname, int flen, struct iattr *iap,
struct svc_fh *resfhp, int createmode, u32 *verifier,
- int *truncp, int *created)
+ bool *truncp, bool *created)
{
struct dentry *dentry, *dchild = NULL;
struct inode *dirp;
@@ -1632,10 +1650,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE);
if (err)
goto out;
- err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP);
+ err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP);
if (err)
goto out;
-
+ err = nfserr_isdir;
+ if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode))
+ goto out;
err = nfserr_perm;
if (!len)
goto out;
@@ -2114,7 +2134,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
/* Allow read access to binaries even when mode 111 */
if (err == -EACCES && S_ISREG(inode->i_mode) &&
- acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
+ (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) ||
+ acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC)))
err = inode_permission(inode, MAY_EXEC);
return err? nfserrno(err) : 0;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index e0bbac04d1d..3f54ad03bb2 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -10,21 +10,22 @@
/*
* Flags for nfsd_permission
*/
-#define NFSD_MAY_NOP 0
-#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */
-#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */
-#define NFSD_MAY_READ 4 /* == MAY_READ */
-#define NFSD_MAY_SATTR 8
-#define NFSD_MAY_TRUNC 16
-#define NFSD_MAY_LOCK 32
-#define NFSD_MAY_MASK 63
+#define NFSD_MAY_NOP 0
+#define NFSD_MAY_EXEC 0x001 /* == MAY_EXEC */
+#define NFSD_MAY_WRITE 0x002 /* == MAY_WRITE */
+#define NFSD_MAY_READ 0x004 /* == MAY_READ */
+#define NFSD_MAY_SATTR 0x008
+#define NFSD_MAY_TRUNC 0x010
+#define NFSD_MAY_LOCK 0x020
+#define NFSD_MAY_MASK 0x03f
/* extra hints to permission and open routines: */
-#define NFSD_MAY_OWNER_OVERRIDE 64
-#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
-#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
-#define NFSD_MAY_NOT_BREAK_LEASE 512
-#define NFSD_MAY_BYPASS_GSS 1024
+#define NFSD_MAY_OWNER_OVERRIDE 0x040
+#define NFSD_MAY_LOCAL_ACCESS 0x080 /* for device special files */
+#define NFSD_MAY_BYPASS_GSS_ON_ROOT 0x100
+#define NFSD_MAY_NOT_BREAK_LEASE 0x200
+#define NFSD_MAY_BYPASS_GSS 0x400
+#define NFSD_MAY_READ_IF_EXEC 0x800
#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
@@ -61,7 +62,7 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
struct svc_fh *res, int createmode,
- u32 *verifier, int *truncp, int *created);
+ u32 *verifier, bool *truncp, bool *created);
__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
loff_t, unsigned long);
#endif /* CONFIG_NFSD_V3 */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d2a8d04428c..2364747ee97 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -81,7 +81,6 @@ struct nfsd4_access {
struct nfsd4_close {
u32 cl_seqid; /* request */
stateid_t cl_stateid; /* request+response */
- struct nfs4_stateowner * cl_stateowner; /* response */
};
struct nfsd4_commit {
@@ -131,7 +130,7 @@ struct nfsd4_link {
struct nfsd4_lock_denied {
clientid_t ld_clientid;
- struct nfs4_stateowner *ld_sop;
+ struct xdr_netobj ld_owner;
u64 ld_start;
u64 ld_length;
u32 ld_type;
@@ -165,9 +164,6 @@ struct nfsd4_lock {
} ok;
struct nfsd4_lock_denied denied;
} u;
- /* The lk_replay_owner is the open owner in the open_to_lock_owner
- * case and the lock owner otherwise: */
- struct nfs4_stateowner *lk_replay_owner;
};
#define lk_new_open_seqid v.new.open_seqid
#define lk_new_open_stateid v.new.open_stateid
@@ -188,7 +184,6 @@ struct nfsd4_lockt {
struct xdr_netobj lt_owner;
u64 lt_offset;
u64 lt_length;
- struct nfs4_stateowner * lt_stateowner;
struct nfsd4_lock_denied lt_denied;
};
@@ -199,7 +194,6 @@ struct nfsd4_locku {
stateid_t lu_stateid;
u64 lu_offset;
u64 lu_length;
- struct nfs4_stateowner *lu_stateowner;
};
@@ -232,8 +226,11 @@ struct nfsd4_open {
u32 op_recall; /* recall */
struct nfsd4_change_info op_cinfo; /* response */
u32 op_rflags; /* response */
- int op_truncate; /* used during processing */
- struct nfs4_stateowner *op_stateowner; /* used during processing */
+ bool op_truncate; /* used during processing */
+ bool op_created; /* used during processing */
+ struct nfs4_openowner *op_openowner; /* used during processing */
+ struct nfs4_file *op_file; /* used during processing */
+ struct nfs4_ol_stateid *op_stp; /* used during processing */
struct nfs4_acl *op_acl;
};
#define op_iattr iattr
@@ -243,7 +240,6 @@ struct nfsd4_open_confirm {
stateid_t oc_req_stateid /* request */;
u32 oc_seqid /* request */;
stateid_t oc_resp_stateid /* response */;
- struct nfs4_stateowner * oc_stateowner; /* response */
};
struct nfsd4_open_downgrade {
@@ -251,7 +247,6 @@ struct nfsd4_open_downgrade {
u32 od_seqid;
u32 od_share_access;
u32 od_share_deny;
- struct nfs4_stateowner *od_stateowner;
};
@@ -325,8 +320,7 @@ struct nfsd4_setattr {
struct nfsd4_setclientid {
nfs4_verifier se_verf; /* request */
- u32 se_namelen; /* request */
- char * se_name; /* request */
+ struct xdr_netobj se_name;
u32 se_callback_prog; /* request */
u32 se_callback_netid_len; /* request */
char * se_callback_netid_val; /* request */
@@ -351,7 +345,6 @@ struct nfsd4_saved_compoundargs {
struct nfsd4_test_stateid {
__be32 ts_num_ids;
- __be32 ts_has_session;
struct nfsd4_compoundargs *ts_saved_args;
struct nfsd4_saved_compoundargs ts_savedp;
};
@@ -405,6 +398,10 @@ struct nfsd4_destroy_session {
struct nfs4_sessionid sessionid;
};
+struct nfsd4_destroy_clientid {
+ clientid_t clientid;
+};
+
struct nfsd4_reclaim_complete {
u32 rca_one_fs;
};
@@ -532,6 +529,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
struct nfsd4_compoundargs *);
int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
struct nfsd4_compoundres *);
+int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
@@ -558,11 +556,13 @@ extern __be32 nfsd4_sequence(struct svc_rqst *,
extern __be32 nfsd4_destroy_session(struct svc_rqst *,
struct nfsd4_compound_state *,
struct nfsd4_destroy_session *);
+extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *);
__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
struct nfsd4_open *open);
extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
struct svc_fh *current_fh, struct nfsd4_open *open);
+extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status);
extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
extern __be32 nfsd4_close(struct svc_rqst *rqstp,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 666628b395f..b50ffb72e5b 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -354,7 +354,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
failed_acl:
failed_bmap:
- inode->i_nlink = 0;
+ clear_nlink(inode);
iput(inode); /* raw_inode will be deleted through
generic_delete_inode() */
goto failed;
@@ -396,7 +396,7 @@ int nilfs_read_inode_common(struct inode *inode,
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid);
inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid);
- inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le64_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 41d6743d303..ac258beeda3 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -625,6 +625,9 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment)
goto out_free;
+ if (argv[n].v_nmembs >= UINT_MAX / argv[n].v_size)
+ goto out_free;
+
len = argv[n].v_size * argv[n].v_nmembs;
base = (void __user *)(unsigned long)argv[n].v_base;
if (len == 0) {
@@ -842,6 +845,19 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case FS_IOC32_GETVERSION:
cmd = FS_IOC_GETVERSION;
break;
+ case NILFS_IOCTL_CHANGE_CPMODE:
+ case NILFS_IOCTL_DELETE_CHECKPOINT:
+ case NILFS_IOCTL_GET_CPINFO:
+ case NILFS_IOCTL_GET_CPSTAT:
+ case NILFS_IOCTL_GET_SUINFO:
+ case NILFS_IOCTL_GET_SUSTAT:
+ case NILFS_IOCTL_GET_VINFO:
+ case NILFS_IOCTL_GET_BDESCS:
+ case NILFS_IOCTL_CLEAN_SEGMENTS:
+ case NILFS_IOCTL_SYNC:
+ case NILFS_IOCTL_RESIZE:
+ case NILFS_IOCTL_SET_ALLOC_RANGE:
+ break;
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index a3141990061..768982de10e 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -289,7 +289,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
nilfs_warning(inode->i_sb, __func__,
"deleting nonexistent file (%lu), %d\n",
inode->i_ino, inode->i_nlink);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
err = nilfs_delete_entry(de, page);
if (err)
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 255d5e1c03b..3777d138f89 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -276,10 +276,10 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
/* super.c */
extern struct inode *nilfs_alloc_inode(struct super_block *);
extern void nilfs_destroy_inode(struct inode *);
-extern void nilfs_error(struct super_block *, const char *, const char *, ...)
- __attribute__ ((format (printf, 3, 4)));
-extern void nilfs_warning(struct super_block *, const char *, const char *, ...)
- __attribute__ ((format (printf, 3, 4)));
+extern __printf(3, 4)
+void nilfs_error(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void nilfs_warning(struct super_block *, const char *, const char *, ...);
extern struct nilfs_super_block *
nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
extern int nilfs_store_magic_and_option(struct super_block *,
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 2142b1c68b6..53c27eaf230 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -30,8 +30,9 @@
extern int debug_msgs;
-extern void __ntfs_debug(const char *file, int line, const char *function,
- const char *format, ...) __attribute__ ((format (printf, 4, 5)));
+extern __printf(4, 5)
+void __ntfs_debug(const char *file, int line, const char *function,
+ const char *format, ...);
/**
* ntfs_debug - write a debug level message to syslog
* @f: a printf format string containing the message
@@ -52,12 +53,14 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl);
#endif /* !DEBUG */
-extern void __ntfs_warning(const char *function, const struct super_block *sb,
- const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
+extern __printf(3, 4)
+void __ntfs_warning(const char *function, const struct super_block *sb,
+ const char *fmt, ...);
#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a)
-extern void __ntfs_error(const char *function, const struct super_block *sb,
- const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
+extern __printf(3, 4)
+void __ntfs_error(const char *function, const struct super_block *sb,
+ const char *fmt, ...);
#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a)
#endif /* _LINUX_NTFS_DEBUG_H */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 1371487da95..97e2dacbc86 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -612,7 +612,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
* might be tricky due to vfs interactions. Need to think about this
* some more when implementing the unlink command.
*/
- vi->i_nlink = le16_to_cpu(m->link_count);
+ set_nlink(vi, le16_to_cpu(m->link_count));
/*
* FIXME: Reparse points can have the directory bit set even though
* they would be S_IFLNK. Need to deal with this further below when we
@@ -634,7 +634,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
vi->i_mode &= ~vol->dmask;
/* Things break without this kludge! */
if (vi->i_nlink > 1)
- vi->i_nlink = 1;
+ set_nlink(vi, 1);
} else {
vi->i_mode |= S_IFREG;
/* Apply the file permissions mask set in the mount options. */
@@ -1242,7 +1242,7 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
vi->i_version = base_vi->i_version;
vi->i_uid = base_vi->i_uid;
vi->i_gid = base_vi->i_gid;
- vi->i_nlink = base_vi->i_nlink;
+ set_nlink(vi, base_vi->i_nlink);
vi->i_mtime = base_vi->i_mtime;
vi->i_ctime = base_vi->i_ctime;
vi->i_atime = base_vi->i_atime;
@@ -1508,7 +1508,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
vi->i_version = base_vi->i_version;
vi->i_uid = base_vi->i_uid;
vi->i_gid = base_vi->i_gid;
- vi->i_nlink = base_vi->i_nlink;
+ set_nlink(vi, base_vi->i_nlink);
vi->i_mtime = base_vi->i_mtime;
vi->i_ctime = base_vi->i_ctime;
vi->i_atime = base_vi->i_atime;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ed553c60de8..3165aebb43c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5699,7 +5699,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_commit;
}
dquot_free_space_nodirty(inode,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c1efe939c77..78b68af3b0e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -290,7 +290,15 @@ static int ocfs2_readpage(struct file *file, struct page *page)
}
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+ /*
+ * Unlock the page and cycle ip_alloc_sem so that we don't
+ * busyloop waiting for ip_alloc_sem to unlock
+ */
ret = AOP_TRUNCATED_PAGE;
+ unlock_page(page);
+ unlock = 0;
+ down_read(&oi->ip_alloc_sem);
+ up_read(&oi->ip_alloc_sem);
goto out_inode_unlock;
}
@@ -563,6 +571,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
int level;
+ wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -570,6 +579,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
if (ocfs2_iocb_is_sem_locked(iocb))
ocfs2_iocb_clear_sem_locked(iocb);
+ if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
+
+ if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
+ waitqueue_active(wq)) {
+ wake_up_all(wq);
+ }
+ }
+
ocfs2_iocb_clear_rw_locked(iocb);
level = ocfs2_iocb_rw_locked_level(iocb);
@@ -863,6 +881,12 @@ struct ocfs2_write_ctxt {
struct page *w_target_page;
/*
+ * w_target_locked is used for page_mkwrite path indicating no unlocking
+ * against w_target_page in ocfs2_write_end_nolock.
+ */
+ unsigned int w_target_locked:1;
+
+ /*
* ocfs2_write_end() uses this to know what the real range to
* write in the target should be.
*/
@@ -895,6 +919,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
{
+ int i;
+
+ /*
+ * w_target_locked is only set to true in the page_mkwrite() case.
+ * The intent is to allow us to lock the target page from write_begin()
+ * to write_end(). The caller must hold a ref on w_target_page.
+ */
+ if (wc->w_target_locked) {
+ BUG_ON(!wc->w_target_page);
+ for (i = 0; i < wc->w_num_pages; i++) {
+ if (wc->w_target_page == wc->w_pages[i]) {
+ wc->w_pages[i] = NULL;
+ break;
+ }
+ }
+ mark_page_accessed(wc->w_target_page);
+ page_cache_release(wc->w_target_page);
+ }
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
brelse(wc->w_di_bh);
@@ -1132,20 +1174,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
*/
lock_page(mmap_page);
+ /* Exit and let the caller retry */
if (mmap_page->mapping != mapping) {
+ WARN_ON(mmap_page->mapping);
unlock_page(mmap_page);
- /*
- * Sanity check - the locking in
- * ocfs2_pagemkwrite() should ensure
- * that this code doesn't trigger.
- */
- ret = -EINVAL;
- mlog_errno(ret);
+ ret = -EAGAIN;
goto out;
}
page_cache_get(mmap_page);
wc->w_pages[i] = mmap_page;
+ wc->w_target_locked = true;
} else {
wc->w_pages[i] = find_or_create_page(mapping, index,
GFP_NOFS);
@@ -1160,6 +1199,8 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
wc->w_target_page = wc->w_pages[i];
}
out:
+ if (ret)
+ wc->w_target_locked = false;
return ret;
}
@@ -1817,11 +1858,23 @@ try_again:
*/
ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
- if (ret) {
+ if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out_quota;
}
+ /*
+ * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
+ * the target page. In this case, we exit with no error and no target
+ * page. This will trigger the caller, page_mkwrite(), to re-try
+ * the operation.
+ */
+ if (ret == -EAGAIN) {
+ BUG_ON(wc->w_target_page);
+ ret = 0;
+ goto out_quota;
+ }
+
ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
len);
if (ret) {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 75cf3ad987a..ffb2da370a9 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits {
OCFS2_IOCB_RW_LOCK = 0,
OCFS2_IOCB_RW_LOCK_LEVEL,
OCFS2_IOCB_SEM,
+ OCFS2_IOCB_UNALIGNED_IO,
OCFS2_IOCB_NUM_LOCKS
};
@@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits {
clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
#define ocfs2_iocb_is_sem_locked(iocb) \
test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+
+#define ocfs2_iocb_set_unaligned_aio(iocb) \
+ set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_unaligned_aio(iocb) \
+ clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_unaligned_aio(iocb) \
+ test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+
+#define OCFS2_IOEND_WQ_HASH_SZ 37
+#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\
+ OCFS2_IOEND_WQ_HASH_SZ])
+extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9a3e6bbff27..a4e855e3690 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -216,6 +216,7 @@ struct o2hb_region {
struct list_head hr_all_item;
unsigned hr_unclean_stop:1,
+ hr_aborted_start:1,
hr_item_pinned:1,
hr_item_dropped:1;
@@ -254,6 +255,10 @@ struct o2hb_region {
* a more complete api that doesn't lead to this sort of fragility. */
atomic_t hr_steady_iterations;
+ /* terminate o2hb thread if it does not reach steady state
+ * (hr_steady_iterations == 0) within hr_unsteady_iterations */
+ atomic_t hr_unsteady_iterations;
+
char hr_dev_name[BDEVNAME_SIZE];
unsigned int hr_timeout_ms;
@@ -324,6 +329,10 @@ static void o2hb_write_timeout(struct work_struct *work)
static void o2hb_arm_write_timeout(struct o2hb_region *reg)
{
+ /* Arm writeout only after thread reaches steady state */
+ if (atomic_read(&reg->hr_steady_iterations) != 0)
+ return;
+
mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
O2HB_MAX_WRITE_TIMEOUT_MS);
@@ -537,9 +546,14 @@ static int o2hb_verify_crc(struct o2hb_region *reg,
return read == computed;
}
-/* We want to make sure that nobody is heartbeating on top of us --
- * this will help detect an invalid configuration. */
-static void o2hb_check_last_timestamp(struct o2hb_region *reg)
+/*
+ * Compare the slot data with what we wrote in the last iteration.
+ * If the match fails, print an appropriate error message. This is to
+ * detect errors like... another node hearting on the same slot,
+ * flaky device that is losing writes, etc.
+ * Returns 1 if check succeeds, 0 otherwise.
+ */
+static int o2hb_check_own_slot(struct o2hb_region *reg)
{
struct o2hb_disk_slot *slot;
struct o2hb_disk_heartbeat_block *hb_block;
@@ -548,13 +562,13 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
slot = &reg->hr_slots[o2nm_this_node()];
/* Don't check on our 1st timestamp */
if (!slot->ds_last_time)
- return;
+ return 0;
hb_block = slot->ds_raw_block;
if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
hb_block->hb_node == slot->ds_node_num)
- return;
+ return 1;
#define ERRSTR1 "Another node is heartbeating on device"
#define ERRSTR2 "Heartbeat generation mismatch on device"
@@ -574,6 +588,8 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
(unsigned long long)slot->ds_last_time, hb_block->hb_node,
(unsigned long long)le64_to_cpu(hb_block->hb_generation),
(unsigned long long)le64_to_cpu(hb_block->hb_seq));
+
+ return 0;
}
static inline void o2hb_prepare_block(struct o2hb_region *reg,
@@ -719,17 +735,24 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
o2nm_node_put(node);
}
-static void o2hb_set_quorum_device(struct o2hb_region *reg,
- struct o2hb_disk_slot *slot)
+static void o2hb_set_quorum_device(struct o2hb_region *reg)
{
- assert_spin_locked(&o2hb_live_lock);
-
if (!o2hb_global_heartbeat_active())
return;
- if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ /* Prevent race with o2hb_heartbeat_group_drop_item() */
+ if (kthread_should_stop())
+ return;
+
+ /* Tag region as quorum only after thread reaches steady state */
+ if (atomic_read(&reg->hr_steady_iterations) != 0)
return;
+ spin_lock(&o2hb_live_lock);
+
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ goto unlock;
+
/*
* A region can be added to the quorum only when it sees all
* live nodes heartbeat on it. In other words, the region has been
@@ -737,13 +760,10 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
*/
if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
sizeof(o2hb_live_node_bitmap)))
- return;
-
- if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
- return;
+ goto unlock;
- printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
- config_item_name(&reg->hr_item));
+ printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
@@ -754,6 +774,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
o2hb_region_unpin(NULL);
+unlock:
+ spin_unlock(&o2hb_live_lock);
}
static int o2hb_check_slot(struct o2hb_region *reg,
@@ -925,8 +947,6 @@ fire_callbacks:
slot->ds_equal_samples = 0;
}
out:
- o2hb_set_quorum_device(reg, slot);
-
spin_unlock(&o2hb_live_lock);
o2hb_run_event_list(&event);
@@ -957,7 +977,8 @@ static int o2hb_highest_node(unsigned long *nodes,
static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
{
- int i, ret, highest_node, change = 0;
+ int i, ret, highest_node;
+ int membership_change = 0, own_slot_ok = 0;
unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct o2hb_bio_wait_ctxt write_wc;
@@ -966,7 +987,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
sizeof(configured_nodes));
if (ret) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
/*
@@ -982,8 +1003,9 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
if (highest_node >= O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
- return -EINVAL;
+ mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
+ ret = -EINVAL;
+ goto bail;
}
/* No sense in reading the slots of nodes that don't exist
@@ -993,29 +1015,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
ret = o2hb_read_slots(reg, highest_node + 1);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
/* With an up to date view of the slots, we can check that no
* other node has been improperly configured to heartbeat in
* our slot. */
- o2hb_check_last_timestamp(reg);
+ own_slot_ok = o2hb_check_own_slot(reg);
/* fill in the proper info for our next heartbeat */
o2hb_prepare_block(reg, reg->hr_generation);
- /* And fire off the write. Note that we don't wait on this I/O
- * until later. */
ret = o2hb_issue_node_write(reg, &write_wc);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
i = -1;
while((i = find_next_bit(configured_nodes,
O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
- change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
+ membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
}
/*
@@ -1030,18 +1050,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* disk */
mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
write_wc.wc_error, reg->hr_dev_name);
- return write_wc.wc_error;
+ ret = write_wc.wc_error;
+ goto bail;
}
- o2hb_arm_write_timeout(reg);
+ /* Skip disarming the timeout if own slot has stale/bad data */
+ if (own_slot_ok) {
+ o2hb_set_quorum_device(reg);
+ o2hb_arm_write_timeout(reg);
+ }
+bail:
/* let the person who launched us know when things are steady */
- if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
- if (atomic_dec_and_test(&reg->hr_steady_iterations))
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ if (!ret && own_slot_ok && !membership_change) {
+ if (atomic_dec_and_test(&reg->hr_steady_iterations))
+ wake_up(&o2hb_steady_queue);
+ }
+ }
+
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
+ printk(KERN_NOTICE "o2hb: Unable to stabilize "
+ "heartbeart on region %s (%s)\n",
+ config_item_name(&reg->hr_item),
+ reg->hr_dev_name);
+ atomic_set(&reg->hr_steady_iterations, 0);
+ reg->hr_aborted_start = 1;
wake_up(&o2hb_steady_queue);
+ ret = -EIO;
+ }
}
- return 0;
+ return ret;
}
/* Subtract b from a, storing the result in a. a *must* have a larger
@@ -1095,7 +1136,8 @@ static int o2hb_thread(void *data)
/* Pin node */
o2nm_depend_this_node();
- while (!kthread_should_stop() && !reg->hr_unclean_stop) {
+ while (!kthread_should_stop() &&
+ !reg->hr_unclean_stop && !reg->hr_aborted_start) {
/* We track the time spent inside
* o2hb_do_disk_heartbeat so that we avoid more than
* hr_timeout_ms between disk writes. On busy systems
@@ -1103,10 +1145,7 @@ static int o2hb_thread(void *data)
* likely to time itself out. */
do_gettimeofday(&before_hb);
- i = 0;
- do {
- ret = o2hb_do_disk_heartbeat(reg);
- } while (ret && ++i < 2);
+ ret = o2hb_do_disk_heartbeat(reg);
do_gettimeofday(&after_hb);
elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
@@ -1117,7 +1156,8 @@ static int o2hb_thread(void *data)
after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
elapsed_msec);
- if (elapsed_msec < reg->hr_timeout_ms) {
+ if (!kthread_should_stop() &&
+ elapsed_msec < reg->hr_timeout_ms) {
/* the kthread api has blocked signals for us so no
* need to record the return value. */
msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
@@ -1134,20 +1174,20 @@ static int o2hb_thread(void *data)
* to timeout on this region when we could just as easily
* write a clear generation - thus indicating to them that
* this node has left this region.
- *
- * XXX: Should we skip this on unclean_stop? */
- o2hb_prepare_block(reg, 0);
- ret = o2hb_issue_node_write(reg, &write_wc);
- if (ret == 0) {
- o2hb_wait_on_io(reg, &write_wc);
- } else {
- mlog_errno(ret);
+ */
+ if (!reg->hr_unclean_stop && !reg->hr_aborted_start) {
+ o2hb_prepare_block(reg, 0);
+ ret = o2hb_issue_node_write(reg, &write_wc);
+ if (ret == 0)
+ o2hb_wait_on_io(reg, &write_wc);
+ else
+ mlog_errno(ret);
}
/* Unpin node */
o2nm_undepend_this_node();
- mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
+ mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n");
return 0;
}
@@ -1158,6 +1198,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
struct o2hb_debug_buf *db = inode->i_private;
struct o2hb_region *reg;
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long lts;
char *buf = NULL;
int i = -1;
int out = 0;
@@ -1194,9 +1235,11 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
reg = (struct o2hb_region *)db->db_data;
- out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
- jiffies_to_msecs(jiffies -
- reg->hr_last_timeout_start));
+ lts = reg->hr_last_timeout_start;
+ /* If 0, it has never been set before */
+ if (lts)
+ lts = jiffies_to_msecs(jiffies - lts);
+ out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
goto done;
case O2HB_DB_TYPE_REGION_PINNED:
@@ -1426,6 +1469,8 @@ static void o2hb_region_release(struct config_item *item)
struct page *page;
struct o2hb_region *reg = to_o2hb_region(item);
+ mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
+
if (reg->hr_tmp_block)
kfree(reg->hr_tmp_block);
@@ -1792,7 +1837,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
live_threshold <<= 1;
spin_unlock(&o2hb_live_lock);
}
- atomic_set(&reg->hr_steady_iterations, live_threshold + 1);
+ ++live_threshold;
+ atomic_set(&reg->hr_steady_iterations, live_threshold);
+ /* unsteady_iterations is double the steady_iterations */
+ atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
reg->hr_item.ci_name);
@@ -1809,14 +1857,12 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
ret = wait_event_interruptible(o2hb_steady_queue,
atomic_read(&reg->hr_steady_iterations) == 0);
if (ret) {
- /* We got interrupted (hello ptrace!). Clean up */
- spin_lock(&o2hb_live_lock);
- hb_task = reg->hr_task;
- reg->hr_task = NULL;
- spin_unlock(&o2hb_live_lock);
+ atomic_set(&reg->hr_steady_iterations, 0);
+ reg->hr_aborted_start = 1;
+ }
- if (hb_task)
- kthread_stop(hb_task);
+ if (reg->hr_aborted_start) {
+ ret = -EIO;
goto out;
}
@@ -1833,8 +1879,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
ret = -EIO;
if (hb_task && o2hb_global_heartbeat_active())
- printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
- config_item_name(&reg->hr_item));
+ printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
out:
if (filp)
@@ -2092,13 +2138,6 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
/* stop the thread when the user removes the region dir */
spin_lock(&o2hb_live_lock);
- if (o2hb_global_heartbeat_active()) {
- clear_bit(reg->hr_region_num, o2hb_region_bitmap);
- clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
- if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
- quorum_region = 1;
- clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
- }
hb_task = reg->hr_task;
reg->hr_task = NULL;
reg->hr_item_dropped = 1;
@@ -2107,19 +2146,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
if (hb_task)
kthread_stop(hb_task);
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+ clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ quorum_region = 1;
+ clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
+ ((atomic_read(&reg->hr_steady_iterations) == 0) ?
+ "stopped" : "start aborted"), config_item_name(item),
+ reg->hr_dev_name);
+ }
+
/*
* If we're racing a dev_write(), we need to wake them. They will
* check reg->hr_task
*/
if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ reg->hr_aborted_start = 1;
atomic_set(&reg->hr_steady_iterations, 0);
wake_up(&o2hb_steady_queue);
}
- if (o2hb_global_heartbeat_active())
- printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
- config_item_name(&reg->hr_item));
-
config_item_put(item);
if (!o2hb_global_heartbeat_active() || !quorum_region)
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 3a5835904b3..dc45deb19e6 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -47,6 +47,7 @@
#define SC_DEBUG_NAME "sock_containers"
#define NST_DEBUG_NAME "send_tracking"
#define STATS_DEBUG_NAME "stats"
+#define NODES_DEBUG_NAME "connected_nodes"
#define SHOW_SOCK_CONTAINERS 0
#define SHOW_SOCK_STATS 1
@@ -55,6 +56,7 @@ static struct dentry *o2net_dentry;
static struct dentry *sc_dentry;
static struct dentry *nst_dentry;
static struct dentry *stats_dentry;
+static struct dentry *nodes_dentry;
static DEFINE_SPINLOCK(o2net_debug_lock);
@@ -491,53 +493,87 @@ static const struct file_operations sc_seq_fops = {
.release = sc_fop_release,
};
-int o2net_debugfs_init(void)
+static int o2net_fill_bitmap(char *buf, int len)
{
- o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
- if (!o2net_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ int i = -1, out = 0;
- nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR,
- o2net_dentry, NULL,
- &nst_seq_fops);
- if (!nst_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ o2net_fill_node_map(map, sizeof(map));
- sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR,
- o2net_dentry, NULL,
- &sc_seq_fops);
- if (!sc_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+ out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+ out += snprintf(buf + out, PAGE_SIZE - out, "\n");
- stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR,
- o2net_dentry, NULL,
- &stats_seq_fops);
- if (!stats_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ return out;
+}
+
+static int nodes_fop_open(struct inode *inode, struct file *file)
+{
+ char *buf;
+
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
+
+ file->private_data = buf;
return 0;
-bail:
- debugfs_remove(stats_dentry);
- debugfs_remove(sc_dentry);
- debugfs_remove(nst_dentry);
- debugfs_remove(o2net_dentry);
- return -ENOMEM;
}
+static int o2net_debug_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static ssize_t o2net_debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+ i_size_read(file->f_mapping->host));
+}
+
+static const struct file_operations nodes_fops = {
+ .open = nodes_fop_open,
+ .release = o2net_debug_release,
+ .read = o2net_debug_read,
+ .llseek = generic_file_llseek,
+};
+
void o2net_debugfs_exit(void)
{
+ debugfs_remove(nodes_dentry);
debugfs_remove(stats_dentry);
debugfs_remove(sc_dentry);
debugfs_remove(nst_dentry);
debugfs_remove(o2net_dentry);
}
+int o2net_debugfs_init(void)
+{
+ mode_t mode = S_IFREG|S_IRUSR;
+
+ o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
+ if (o2net_dentry)
+ nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &nst_seq_fops);
+ if (nst_dentry)
+ sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &sc_seq_fops);
+ if (sc_dentry)
+ stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &stats_seq_fops);
+ if (stats_dentry)
+ nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &nodes_fops);
+ if (nodes_dentry)
+ return 0;
+
+ o2net_debugfs_exit();
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+}
+
#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index db5ee4b4f47..044e7b58d31 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -59,6 +59,7 @@
#include <linux/idr.h>
#include <linux/kref.h>
#include <linux/net.h>
+#include <linux/export.h>
#include <net/tcp.h>
#include <asm/uaccess.h>
@@ -545,7 +546,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
}
if (was_valid && !valid) {
- printk(KERN_NOTICE "o2net: no longer connected to "
+ printk(KERN_NOTICE "o2net: No longer connected to "
SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
o2net_complete_nodes_nsw(nn);
}
@@ -555,7 +556,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
cancel_delayed_work(&nn->nn_connect_expired);
printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
o2nm_this_node() > sc->sc_node->nd_num ?
- "connected to" : "accepted connection from",
+ "Connected to" : "Accepted connection from",
SC_NODEF_ARGS(sc));
}
@@ -643,7 +644,7 @@ static void o2net_state_change(struct sock *sk)
o2net_sc_queue_work(sc, &sc->sc_connect_work);
break;
default:
- printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
+ printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
" shutdown, state %d\n",
SC_NODEF_ARGS(sc), sk->sk_state);
o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
@@ -1034,6 +1035,25 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
return ret;
}
+/* Get a map of all nodes to which this node is currently connected to */
+void o2net_fill_node_map(unsigned long *map, unsigned bytes)
+{
+ struct o2net_sock_container *sc;
+ int node, ret;
+
+ BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
+
+ memset(map, 0, bytes);
+ for (node = 0; node < O2NM_MAX_NODES; ++node) {
+ o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
+ if (!ret) {
+ set_bit(node, map);
+ sc_put(sc);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(o2net_fill_node_map);
+
int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
size_t caller_veclen, u8 target_node, int *status)
{
@@ -1284,11 +1304,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
- mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
- "version %llu but %llu is required, disconnecting\n",
- SC_NODEF_ARGS(sc),
- (unsigned long long)be64_to_cpu(hand->protocol_version),
- O2NET_PROTOCOL_VERSION);
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net "
+ "protocol version %llu but %llu is required. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ (unsigned long long)be64_to_cpu(hand->protocol_version),
+ O2NET_PROTOCOL_VERSION);
/* don't bother reconnecting if its the wrong version. */
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
@@ -1302,33 +1322,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
*/
if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
o2net_idle_timeout()) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2net_idle_timeout_ms),
- o2net_idle_timeout());
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network "
+ "idle timeout of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2net_idle_timeout_ms),
+ o2net_idle_timeout());
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
o2net_keepalive_delay()) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2net_keepalive_delay_ms),
- o2net_keepalive_delay());
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive "
+ "delay of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2net_keepalive_delay_ms),
+ o2net_keepalive_delay());
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
O2HB_MAX_WRITE_TIMEOUT_MS) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
- O2HB_MAX_WRITE_TIMEOUT_MS);
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat "
+ "timeout of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
+ O2HB_MAX_WRITE_TIMEOUT_MS);
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
@@ -1539,28 +1559,16 @@ static void o2net_idle_timer(unsigned long data)
{
struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
-
#ifdef CONFIG_DEBUG_FS
- ktime_t now = ktime_get();
+ unsigned long msecs = ktime_to_ms(ktime_get()) -
+ ktime_to_ms(sc->sc_tv_timer);
+#else
+ unsigned long msecs = o2net_idle_timeout();
#endif
- printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
- "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
- o2net_idle_timeout() / 1000,
- o2net_idle_timeout() % 1000);
-
-#ifdef CONFIG_DEBUG_FS
- mlog(ML_NOTICE, "Here are some times that might help debug the "
- "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
- "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
- (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
- (long long)ktime_to_us(sc->sc_tv_data_ready),
- (long long)ktime_to_us(sc->sc_tv_advance_start),
- (long long)ktime_to_us(sc->sc_tv_advance_stop),
- sc->sc_msg_key, sc->sc_msg_type,
- (long long)ktime_to_us(sc->sc_tv_func_start),
- (long long)ktime_to_us(sc->sc_tv_func_stop));
-#endif
+ printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
+ "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
+ msecs / 1000, msecs % 1000);
/*
* Initialize the nn_timeout so that the next connection attempt
@@ -1693,8 +1701,8 @@ static void o2net_start_connect(struct work_struct *work)
out:
if (ret) {
- mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed "
- "with errno %d\n", SC_NODEF_ARGS(sc), ret);
+ printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
+ " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
/* 0 err so that another will be queued and attempted
* from set_nn_state */
if (sc)
@@ -1717,8 +1725,8 @@ static void o2net_connect_expired(struct work_struct *work)
spin_lock(&nn->nn_lock);
if (!nn->nn_sc_valid) {
- mlog(ML_ERROR, "no connection established with node %u after "
- "%u.%u seconds, giving up and returning errors.\n",
+ printk(KERN_NOTICE "o2net: No connection established with "
+ "node %u after %u.%u seconds, giving up.\n",
o2net_num_from_nn(nn),
o2net_idle_timeout() / 1000,
o2net_idle_timeout() % 1000);
@@ -1861,21 +1869,21 @@ static int o2net_accept_one(struct socket *sock)
node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
if (node == NULL) {
- mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n",
- &sin.sin_addr.s_addr, ntohs(sin.sin_port));
+ printk(KERN_NOTICE "o2net: Attempt to connect from unknown "
+ "node at %pI4:%d\n", &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
if (o2nm_this_node() >= node->nd_num) {
local_node = o2nm_get_node_by_num(o2nm_this_node());
- mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' ("
- "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n",
- local_node->nd_name, local_node->nd_num,
- &(local_node->nd_ipv4_address),
- ntohs(local_node->nd_ipv4_port),
- node->nd_name, node->nd_num, &sin.sin_addr.s_addr,
- ntohs(sin.sin_port));
+ printk(KERN_NOTICE "o2net: Unexpected connect attempt seen "
+ "at node '%s' (%u, %pI4:%d) from node '%s' (%u, "
+ "%pI4:%d)\n", local_node->nd_name, local_node->nd_num,
+ &(local_node->nd_ipv4_address),
+ ntohs(local_node->nd_ipv4_port), node->nd_name,
+ node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
@@ -1900,10 +1908,10 @@ static int o2net_accept_one(struct socket *sock)
ret = 0;
spin_unlock(&nn->nn_lock);
if (ret) {
- mlog(ML_NOTICE, "attempt to connect from node '%s' at "
- "%pI4:%d but it already has an open connection\n",
- node->nd_name, &sin.sin_addr.s_addr,
- ntohs(sin.sin_port));
+ printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' "
+ "at %pI4:%d but it already has an open connection\n",
+ node->nd_name, &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
goto out;
}
@@ -1983,7 +1991,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0) {
- mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret);
+ printk(KERN_ERR "o2net: Error %d while creating socket\n", ret);
goto out;
}
@@ -2000,16 +2008,15 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
sock->sk->sk_reuse = 1;
ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
if (ret < 0) {
- mlog(ML_ERROR, "unable to bind socket at %pI4:%u, "
- "ret=%d\n", &addr, ntohs(port), ret);
+ printk(KERN_ERR "o2net: Error %d while binding socket at "
+ "%pI4:%u\n", ret, &addr, ntohs(port));
goto out;
}
ret = sock->ops->listen(sock, 64);
- if (ret < 0) {
- mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n",
- &addr, ntohs(port), ret);
- }
+ if (ret < 0)
+ printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n",
+ ret, &addr, ntohs(port));
out:
if (ret) {
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index fd6179eb26d..5bada2a69b5 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
struct list_head *unreg_list);
void o2net_unregister_handler_list(struct list_head *list);
+void o2net_fill_node_map(unsigned long *map, unsigned bytes);
+
struct o2nm_node;
int o2net_register_hb_callbacks(void);
void o2net_unregister_hb_callbacks(void);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8582e3f4f12..8fe4e2892ab 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1184,8 +1184,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
if (pde)
le16_add_cpu(&pde->rec_len,
le16_to_cpu(de->rec_len));
- else
- de->inode = 0;
+ de->inode = 0;
dir->i_version++;
ocfs2_journal_dirty(handle, bh);
goto bail;
@@ -2292,7 +2291,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, di_bh);
i_size_write(inode, size);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_blocks = ocfs2_inode_sector_count(inode);
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
@@ -2354,7 +2353,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, new_bh);
i_size_write(inode, inode->i_sb->s_blocksize);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_blocks = ocfs2_inode_sector_count(inode);
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d602abb51b6..a5952ceecba 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -859,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
void dlm_put(struct dlm_ctxt *dlm);
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -877,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res)
kref_get(&res->refs);
}
void dlm_lockres_put(struct dlm_lock_resource *res);
-void __dlm_unhash_lockres(struct dlm_lock_resource *res);
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res);
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
const char *name,
unsigned int len,
@@ -902,46 +901,15 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
const char *name,
unsigned int namelen);
-#define dlm_lockres_set_refmap_bit(bit,res) \
- __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__)
-#define dlm_lockres_clear_refmap_bit(bit,res) \
- __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__)
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit);
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit);
-static inline void __dlm_lockres_set_refmap_bit(int bit,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
-{
- //printk("%s:%d:%.*s: setting bit %d\n", file, line,
- // res->lockname.len, res->lockname.name, bit);
- set_bit(bit, res->refmap);
-}
-
-static inline void __dlm_lockres_clear_refmap_bit(int bit,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
-{
- //printk("%s:%d:%.*s: clearing bit %d\n", file, line,
- // res->lockname.len, res->lockname.name, bit);
- clear_bit(bit, res->refmap);
-}
-
-void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- const char *file,
- int line);
-void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- int new_lockres,
- const char *file,
- int line);
-#define dlm_lockres_drop_inflight_ref(d,r) \
- __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__)
-#define dlm_lockres_grab_inflight_ref(d,r) \
- __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__)
-#define dlm_lockres_grab_inflight_ref_new(d,r) \
- __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__)
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 56f82cb912e..0e28e242226 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -30,6 +30,7 @@
#include <linux/sysctl.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
+#include <linux/export.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6ed6b95dcf9..92f2ead0fab 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -157,16 +157,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing,
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
-void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- if (!hlist_unhashed(&lockres->hash_node)) {
- hlist_del_init(&lockres->hash_node);
- dlm_lockres_put(lockres);
- }
+ if (hlist_unhashed(&res->hash_node))
+ return;
+
+ mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
+ hlist_del_init(&res->hash_node);
+ dlm_lockres_put(res);
}
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res)
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
struct hlist_head *bucket;
struct qstr *q;
@@ -180,6 +182,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
dlm_lockres_get(res);
hlist_add_head(&res->hash_node, bucket);
+
+ mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
}
struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
@@ -539,17 +544,17 @@ again:
static void __dlm_print_nodes(struct dlm_ctxt *dlm)
{
- int node = -1;
+ int node = -1, num = 0;
assert_spin_locked(&dlm->spinlock);
- printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);
-
+ printk("( ");
while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
node + 1)) < O2NM_MAX_NODES) {
printk("%d ", node);
+ ++num;
}
- printk("\n");
+ printk(") %u nodes\n", num);
}
static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -566,11 +571,10 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
node = exit_msg->node_idx;
- printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
-
spin_lock(&dlm->spinlock);
clear_bit(node, dlm->domain_map);
clear_bit(node, dlm->exit_domain_map);
+ printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name);
__dlm_print_nodes(dlm);
/* notify anything attached to the heartbeat events */
@@ -755,6 +759,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
dlm_mark_domain_leaving(dlm);
dlm_leave_domain(dlm);
+ printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name);
dlm_force_free_mles(dlm);
dlm_complete_dlm_shutdown(dlm);
}
@@ -970,7 +975,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
clear_bit(assert->node_idx, dlm->exit_domain_map);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
+ printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ",
assert->node_idx, dlm->name);
__dlm_print_nodes(dlm);
@@ -1701,8 +1706,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
bail:
spin_lock(&dlm->spinlock);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- if (!status)
+ if (!status) {
+ printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name);
__dlm_print_nodes(dlm);
+ }
spin_unlock(&dlm->spinlock);
if (ctxt) {
@@ -2131,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
goto leave;
}
- if (!o2hb_check_local_node_heartbeating()) {
- mlog(ML_ERROR, "the local node has not been configured, or is "
- "not heartbeating\n");
- ret = -EPROTO;
- goto leave;
- }
-
mlog(0, "register called for domain \"%s\"\n", domain);
retry:
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 8d39e0fd66f..975810b9849 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -183,10 +183,6 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
kick_thread = 1;
}
}
- /* reduce the inflight count, this may result in the lockres
- * being purged below during calc_usage */
- if (lock->ml.node == dlm->node_num)
- dlm_lockres_drop_inflight_ref(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
@@ -231,10 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
lock->ml.type, res->lockname.len,
res->lockname.name, flags);
+ /*
+ * Wait if resource is getting recovered, remastered, etc.
+ * If the resource was remastered and new owner is self, then exit.
+ */
spin_lock(&res->spinlock);
-
- /* will exit this call with spinlock held */
__dlm_wait_on_lockres(res);
+ if (res->owner == dlm->node_num) {
+ spin_unlock(&res->spinlock);
+ return DLM_RECOVERING;
+ }
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* add lock to local (secondary) queue */
@@ -319,27 +321,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
sizeof(create), res->owner, &status);
if (tmpret >= 0) {
- // successfully sent and received
- ret = status; // this is already a dlm_status
+ ret = status;
if (ret == DLM_REJECTED) {
- mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres "
- "no longer owned by %u. that node is coming back "
- "up currently.\n", dlm->name, create.namelen,
+ mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer "
+ "owned by node %u. That node is coming back up "
+ "currently.\n", dlm->name, create.namelen,
create.name, res->owner);
dlm_print_one_lock_resource(res);
BUG();
}
} else {
- mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
- "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
- res->owner);
- if (dlm_is_host_down(tmpret)) {
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to "
+ "node %u\n", dlm->name, create.namelen, create.name,
+ tmpret, res->owner);
+ if (dlm_is_host_down(tmpret))
ret = DLM_RECOVERING;
- mlog(0, "node %u died so returning DLM_RECOVERING "
- "from lock message!\n", res->owner);
- } else {
+ else
ret = dlm_err_to_dlm_status(tmpret);
- }
}
return ret;
@@ -440,7 +438,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
/* zero memory only if kernel-allocated */
lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
if (!lksb) {
- kfree(lock);
+ kmem_cache_free(dlm_lock_cache, lock);
return NULL;
}
kernel_allocated = 1;
@@ -718,18 +716,10 @@ retry_lock:
if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
status == DLM_FORWARD) {
- mlog(0, "retrying lock with migration/"
- "recovery/in progress\n");
msleep(100);
- /* no waiting for dlm_reco_thread */
if (recovery) {
if (status != DLM_RECOVERING)
goto retry_lock;
-
- mlog(0, "%s: got RECOVERING "
- "for $RECOVERY lock, master "
- "was %u\n", dlm->name,
- res->owner);
/* wait to see the node go down, then
* drop down and allow the lockres to
* get cleaned up. need to remaster. */
@@ -741,6 +731,14 @@ retry_lock:
}
}
+ /* Inflight taken in dlm_get_lock_resource() is dropped here */
+ spin_lock(&res->spinlock);
+ dlm_lockres_drop_inflight_ref(dlm, res);
+ spin_unlock(&res->spinlock);
+
+ dlm_lockres_calc_usage(dlm, res);
+ dlm_kick_thread(dlm, res);
+
if (status != DLM_NORMAL) {
lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
if (status != DLM_NOTQUEUED)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 11eefb8c12e..005261c333b 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -631,39 +631,54 @@ error:
return NULL;
}
-void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- int new_lockres,
- const char *file,
- int line)
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit)
{
- if (!new_lockres)
- assert_spin_locked(&res->spinlock);
+ assert_spin_locked(&res->spinlock);
+
+ mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
+ res->lockname.name, bit, __builtin_return_address(0));
+
+ set_bit(bit, res->refmap);
+}
+
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit)
+{
+ assert_spin_locked(&res->spinlock);
+
+ mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
+ res->lockname.name, bit, __builtin_return_address(0));
+
+ clear_bit(bit, res->refmap);
+}
+
+
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
- if (!test_bit(dlm->node_num, res->refmap)) {
- BUG_ON(res->inflight_locks != 0);
- dlm_lockres_set_refmap_bit(dlm->node_num, res);
- }
res->inflight_locks++;
- mlog(0, "%s:%.*s: inflight++: now %u\n",
- dlm->name, res->lockname.len, res->lockname.name,
- res->inflight_locks);
+
+ mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
+ res->lockname.len, res->lockname.name, res->inflight_locks,
+ __builtin_return_address(0));
}
-void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
{
assert_spin_locked(&res->spinlock);
BUG_ON(res->inflight_locks == 0);
+
res->inflight_locks--;
- mlog(0, "%s:%.*s: inflight--: now %u\n",
- dlm->name, res->lockname.len, res->lockname.name,
- res->inflight_locks);
- if (res->inflight_locks == 0)
- dlm_lockres_clear_refmap_bit(dlm->node_num, res);
+
+ mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
+ res->lockname.len, res->lockname.name, res->inflight_locks,
+ __builtin_return_address(0));
+
wake_up(&res->wq);
}
@@ -697,7 +712,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
unsigned int hash;
int tries = 0;
int bit, wait_on_recovery = 0;
- int drop_inflight_if_nonlocal = 0;
BUG_ON(!lockid);
@@ -709,36 +723,33 @@ lookup:
spin_lock(&dlm->spinlock);
tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
if (tmpres) {
- int dropping_ref = 0;
-
spin_unlock(&dlm->spinlock);
-
spin_lock(&tmpres->spinlock);
- /* We wait for the other thread that is mastering the resource */
+ /* Wait on the thread that is mastering the resource */
if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
__dlm_wait_on_lockres(tmpres);
BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+ spin_unlock(&tmpres->spinlock);
+ dlm_lockres_put(tmpres);
+ tmpres = NULL;
+ goto lookup;
}
- if (tmpres->owner == dlm->node_num) {
- BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
- dlm_lockres_grab_inflight_ref(dlm, tmpres);
- } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
- dropping_ref = 1;
- spin_unlock(&tmpres->spinlock);
-
- /* wait until done messaging the master, drop our ref to allow
- * the lockres to be purged, start over. */
- if (dropping_ref) {
- spin_lock(&tmpres->spinlock);
- __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
+ /* Wait on the resource purge to complete before continuing */
+ if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
+ BUG_ON(tmpres->owner == dlm->node_num);
+ __dlm_wait_on_lockres_flags(tmpres,
+ DLM_LOCK_RES_DROPPING_REF);
spin_unlock(&tmpres->spinlock);
dlm_lockres_put(tmpres);
tmpres = NULL;
goto lookup;
}
- mlog(0, "found in hash!\n");
+ /* Grab inflight ref to pin the resource */
+ dlm_lockres_grab_inflight_ref(dlm, tmpres);
+
+ spin_unlock(&tmpres->spinlock);
if (res)
dlm_lockres_put(res);
res = tmpres;
@@ -829,8 +840,8 @@ lookup:
* but they might own this lockres. wait on them. */
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
- "recover before lock mastery can begin\n",
+ mlog(0, "%s: res %.*s, At least one node (%d) "
+ "to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
}
@@ -843,12 +854,11 @@ lookup:
/* finally add the lockres to its hash bucket */
__dlm_insert_lockres(dlm, res);
- /* since this lockres is new it doesn't not require the spinlock */
- dlm_lockres_grab_inflight_ref_new(dlm, res);
- /* if this node does not become the master make sure to drop
- * this inflight reference below */
- drop_inflight_if_nonlocal = 1;
+ /* Grab inflight ref to pin the resource */
+ spin_lock(&res->spinlock);
+ dlm_lockres_grab_inflight_ref(dlm, res);
+ spin_unlock(&res->spinlock);
/* get an extra ref on the mle in case this is a BLOCK
* if so, the creator of the BLOCK may try to put the last
@@ -864,8 +874,8 @@ redo_request:
* dlm spinlock would be detectable be a change on the mle,
* so we only need to clear out the recovery map once. */
if (dlm_is_recovery_lock(lockid, namelen)) {
- mlog(ML_NOTICE, "%s: recovery map is not empty, but "
- "must master $RECOVERY lock now\n", dlm->name);
+ mlog(0, "%s: Recovery map is not empty, but must "
+ "master $RECOVERY lock now\n", dlm->name);
if (!dlm_pre_master_reco_lockres(dlm, res))
wait_on_recovery = 0;
else {
@@ -883,8 +893,8 @@ redo_request:
spin_lock(&dlm->spinlock);
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
- "recover before lock mastery can begin\n",
+ mlog(0, "%s: res %.*s, At least one node (%d) "
+ "to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
} else
@@ -913,8 +923,8 @@ redo_request:
* yet, keep going until it does. this is how the
* master will know that asserts are needed back to
* the lower nodes. */
- mlog(0, "%s:%.*s: requests only up to %u but master "
- "is %u, keep going\n", dlm->name, namelen,
+ mlog(0, "%s: res %.*s, Requests only up to %u but "
+ "master is %u, keep going\n", dlm->name, namelen,
lockid, nodenum, mle->master);
}
}
@@ -924,13 +934,12 @@ wait:
ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
if (ret < 0) {
wait_on_recovery = 1;
- mlog(0, "%s:%.*s: node map changed, redo the "
- "master request now, blocked=%d\n",
- dlm->name, res->lockname.len,
+ mlog(0, "%s: res %.*s, Node map changed, redo the master "
+ "request now, blocked=%d\n", dlm->name, res->lockname.len,
res->lockname.name, blocked);
if (++tries > 20) {
- mlog(ML_ERROR, "%s:%.*s: spinning on "
- "dlm_wait_for_lock_mastery, blocked=%d\n",
+ mlog(ML_ERROR, "%s: res %.*s, Spinning on "
+ "dlm_wait_for_lock_mastery, blocked = %d\n",
dlm->name, res->lockname.len,
res->lockname.name, blocked);
dlm_print_one_lock_resource(res);
@@ -940,7 +949,8 @@ wait:
goto redo_request;
}
- mlog(0, "lockres mastered by %u\n", res->owner);
+ mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
+ res->lockname.name, res->owner);
/* make sure we never continue without this */
BUG_ON(res->owner == O2NM_MAX_NODES);
@@ -952,8 +962,6 @@ wait:
wake_waiters:
spin_lock(&res->spinlock);
- if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
- dlm_lockres_drop_inflight_ref(dlm, res);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
@@ -1426,9 +1434,7 @@ way_up_top:
}
if (res->owner == dlm->node_num) {
- mlog(0, "%s:%.*s: setting bit %u in refmap\n",
- dlm->name, namelen, name, request->node_idx);
- dlm_lockres_set_refmap_bit(request->node_idx, res);
+ dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
spin_unlock(&res->spinlock);
response = DLM_MASTER_RESP_YES;
if (mle)
@@ -1493,10 +1499,8 @@ way_up_top:
* go back and clean the mles on any
* other nodes */
dispatch_assert = 1;
- dlm_lockres_set_refmap_bit(request->node_idx, res);
- mlog(0, "%s:%.*s: setting bit %u in refmap\n",
- dlm->name, namelen, name,
- request->node_idx);
+ dlm_lockres_set_refmap_bit(dlm, res,
+ request->node_idx);
} else
response = DLM_MASTER_RESP_NO;
} else {
@@ -1702,7 +1706,7 @@ again:
"lockres, set the bit in the refmap\n",
namelen, lockname, to);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(to, res);
+ dlm_lockres_set_refmap_bit(dlm, res, to);
spin_unlock(&res->spinlock);
}
}
@@ -2187,8 +2191,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
namelen = res->lockname.len;
BUG_ON(namelen > O2NM_MAX_NAME_LEN);
- mlog(0, "%s:%.*s: sending deref to %d\n",
- dlm->name, namelen, lockname, res->owner);
memset(&deref, 0, sizeof(deref));
deref.node_idx = dlm->node_num;
deref.namelen = namelen;
@@ -2197,14 +2199,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
&deref, sizeof(deref), res->owner, &r);
if (ret < 0)
- mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
- "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
- res->owner);
+ mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
+ dlm->name, namelen, lockname, ret, res->owner);
else if (r < 0) {
/* BAD. other node says I did not have a ref. */
- mlog(ML_ERROR,"while dropping ref on %s:%.*s "
- "(master=%u) got %d.\n", dlm->name, namelen,
- lockname, res->owner, r);
+ mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+ dlm->name, namelen, lockname, res->owner, r);
dlm_print_one_lock_resource(res);
BUG();
}
@@ -2260,7 +2260,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
else {
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
if (test_bit(node, res->refmap)) {
- dlm_lockres_clear_refmap_bit(node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
}
@@ -2320,7 +2320,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
if (test_bit(node, res->refmap)) {
__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
- dlm_lockres_clear_refmap_bit(node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
spin_unlock(&res->spinlock);
@@ -2802,7 +2802,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
BUG_ON(!list_empty(&lock->bast_list));
BUG_ON(lock->ast_pending);
BUG_ON(lock->bast_pending);
- dlm_lockres_clear_refmap_bit(lock->ml.node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res,
+ lock->ml.node);
list_del_init(&lock->list);
dlm_lock_put(lock);
/* In a normal unlock, we would have added a
@@ -2823,7 +2824,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
mlog(0, "%s:%.*s: node %u had a ref to this "
"migrating lockres, clearing\n", dlm->name,
res->lockname.len, res->lockname.name, bit);
- dlm_lockres_clear_refmap_bit(bit, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, bit);
}
bit++;
}
@@ -2916,9 +2917,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
&migrate, sizeof(migrate), nodenum,
&status);
if (ret < 0) {
- mlog(ML_ERROR, "Error %d when sending message %u (key "
- "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
- dlm->key, nodenum);
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send "
+ "MIGRATE_REQUEST to node %u\n", dlm->name,
+ migrate.namelen, migrate.name, ret, nodenum);
if (!dlm_is_host_down(ret)) {
mlog(ML_ERROR, "unhandled error=%d!\n", ret);
BUG();
@@ -2937,7 +2938,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
dlm->name, res->lockname.len, res->lockname.name,
nodenum);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(nodenum, res);
+ dlm_lockres_set_refmap_bit(dlm, res, nodenum);
spin_unlock(&res->spinlock);
}
}
@@ -3271,7 +3272,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
* mastery reference here since old_master will briefly have
* a reference after the migration completes */
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(old_master, res);
+ dlm_lockres_set_refmap_bit(dlm, res, old_master);
spin_unlock(&res->spinlock);
mlog(0, "now time to do a migrate request to other nodes\n");
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 7efab6d28a2..01ebfd0bdad 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -362,40 +362,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
}
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
{
- if (timeout) {
- mlog(ML_NOTICE, "%s: waiting %dms for notification of "
- "death of node %u\n", dlm->name, timeout, node);
+ if (dlm_is_node_dead(dlm, node))
+ return;
+
+ printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
+ "domain %s\n", node, dlm->name);
+
+ if (timeout)
wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_dead(dlm, node),
- msecs_to_jiffies(timeout));
- } else {
- mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
- "of death of node %u\n", dlm->name, node);
+ dlm_is_node_dead(dlm, node),
+ msecs_to_jiffies(timeout));
+ else
wait_event(dlm->dlm_reco_thread_wq,
dlm_is_node_dead(dlm, node));
- }
- /* for now, return 0 */
- return 0;
}
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
{
- if (timeout) {
- mlog(0, "%s: waiting %dms for notification of "
- "recovery of node %u\n", dlm->name, timeout, node);
+ if (dlm_is_node_recovered(dlm, node))
+ return;
+
+ printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
+ "domain %s\n", node, dlm->name);
+
+ if (timeout)
wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_recovered(dlm, node),
- msecs_to_jiffies(timeout));
- } else {
- mlog(0, "%s: waiting indefinitely for notification "
- "of recovery of node %u\n", dlm->name, node);
+ dlm_is_node_recovered(dlm, node),
+ msecs_to_jiffies(timeout));
+ else
wait_event(dlm->dlm_reco_thread_wq,
dlm_is_node_recovered(dlm, node));
- }
- /* for now, return 0 */
- return 0;
}
/* callers of the top-level api calls (dlmlock/dlmunlock) should
@@ -430,6 +428,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm)
{
spin_lock(&dlm->spinlock);
BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+ printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
+ dlm->name, dlm->reco.dead_node);
dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
spin_unlock(&dlm->spinlock);
}
@@ -440,9 +440,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
spin_unlock(&dlm->spinlock);
+ printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
wake_up(&dlm->reco.event);
}
+static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
+{
+ printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
+ "dead node %u in domain %s\n", dlm->reco.new_master,
+ (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
+ dlm->reco.dead_node, dlm->name);
+}
+
static int dlm_do_recovery(struct dlm_ctxt *dlm)
{
int status = 0;
@@ -505,9 +514,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
}
mlog(0, "another node will master this recovery session.\n");
}
- mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
- dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master,
- dlm->node_num, dlm->reco.dead_node);
+
+ dlm_print_recovery_master(dlm);
/* it is safe to start everything back up here
* because all of the dead node's lock resources
@@ -518,15 +526,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
return 0;
master_here:
- mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node "
- "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task),
- dlm->node_num, dlm->reco.dead_node, dlm->name);
+ dlm_print_recovery_master(dlm);
status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
if (status < 0) {
/* we should never hit this anymore */
- mlog(ML_ERROR, "error %d remastering locks for node %u, "
- "retrying.\n", status, dlm->reco.dead_node);
+ mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
+ "retrying.\n", dlm->name, status, dlm->reco.dead_node);
/* yield a bit to allow any final network messages
* to get handled on remaining nodes */
msleep(100);
@@ -567,7 +573,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
- mlog(0, "requesting lock info from node %u\n",
+ mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
ndata->node_num);
if (ndata->node_num == dlm->node_num) {
@@ -640,7 +646,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
spin_unlock(&dlm_reco_state_lock);
}
- mlog(0, "done requesting all lock info\n");
+ mlog(0, "%s: Done requesting all lock info\n", dlm->name);
/* nodes should be sending reco data now
* just need to wait */
@@ -802,10 +808,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
/* negative status is handled by caller */
if (ret < 0)
- mlog(ML_ERROR, "Error %d when sending message %u (key "
- "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
- dlm->key, request_from);
-
+ mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
+ "to recover dead node %u\n", dlm->name, ret,
+ request_from, dead_node);
// return from here, then
// sleep until all received or error
return ret;
@@ -956,9 +961,9 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
sizeof(done_msg), send_to, &tmpret);
if (ret < 0) {
- mlog(ML_ERROR, "Error %d when sending message %u (key "
- "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
- dlm->key, send_to);
+ mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
+ "to recover dead node %u\n", dlm->name, ret, send_to,
+ dead_node);
if (!dlm_is_host_down(ret)) {
BUG();
}
@@ -1127,9 +1132,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
if (ret < 0) {
/* XXX: negative status is not handled.
* this will end up killing this node. */
- mlog(ML_ERROR, "Error %d when sending message %u (key "
- "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
- dlm->key, send_to);
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
+ "node %u (%s)\n", dlm->name, mres->lockname_len,
+ mres->lockname, ret, send_to,
+ (orig_flags & DLM_MRES_MIGRATION ?
+ "migration" : "recovery"));
} else {
/* might get an -ENOMEM back here */
ret = status;
@@ -1767,7 +1774,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
dlm->name, mres->lockname_len, mres->lockname,
from);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(from, res);
+ dlm_lockres_set_refmap_bit(dlm, res, from);
spin_unlock(&res->spinlock);
added++;
break;
@@ -1965,7 +1972,7 @@ skip_lvb:
mlog(0, "%s:%.*s: added lock for node %u, "
"setting refmap bit\n", dlm->name,
res->lockname.len, res->lockname.name, ml->node);
- dlm_lockres_set_refmap_bit(ml->node, res);
+ dlm_lockres_set_refmap_bit(dlm, res, ml->node);
added++;
}
spin_unlock(&res->spinlock);
@@ -2084,6 +2091,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
if (res->owner == dead_node) {
+ mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->owner, new_master);
list_del_init(&res->recovering);
spin_lock(&res->spinlock);
/* new_master has our reference from
@@ -2105,40 +2115,30 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
- if (res->state & DLM_LOCK_RES_RECOVERING) {
- if (res->owner == dead_node) {
- mlog(0, "(this=%u) res %.*s owner=%u "
- "was not on recovering list, but "
- "clearing state anyway\n",
- dlm->node_num, res->lockname.len,
- res->lockname.name, new_master);
- } else if (res->owner == dlm->node_num) {
- mlog(0, "(this=%u) res %.*s owner=%u "
- "was not on recovering list, "
- "owner is THIS node, clearing\n",
- dlm->node_num, res->lockname.len,
- res->lockname.name, new_master);
- } else
- continue;
+ if (!(res->state & DLM_LOCK_RES_RECOVERING))
+ continue;
- if (!list_empty(&res->recovering)) {
- mlog(0, "%s:%.*s: lockres was "
- "marked RECOVERING, owner=%u\n",
- dlm->name, res->lockname.len,
- res->lockname.name, res->owner);
- list_del_init(&res->recovering);
- dlm_lockres_put(res);
- }
- spin_lock(&res->spinlock);
- /* new_master has our reference from
- * the lock state sent during recovery */
- dlm_change_lockres_owner(dlm, res, new_master);
- res->state &= ~DLM_LOCK_RES_RECOVERING;
- if (__dlm_lockres_has_locks(res))
- __dlm_dirty_lockres(dlm, res);
- spin_unlock(&res->spinlock);
- wake_up(&res->wq);
+ if (res->owner != dead_node &&
+ res->owner != dlm->node_num)
+ continue;
+
+ if (!list_empty(&res->recovering)) {
+ list_del_init(&res->recovering);
+ dlm_lockres_put(res);
}
+
+ /* new_master has our reference from
+ * the lock state sent during recovery */
+ mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->owner, new_master);
+ spin_lock(&res->spinlock);
+ dlm_change_lockres_owner(dlm, res, new_master);
+ res->state &= ~DLM_LOCK_RES_RECOVERING;
+ if (__dlm_lockres_has_locks(res))
+ __dlm_dirty_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
}
}
}
@@ -2252,12 +2252,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
res->lockname.len, res->lockname.name, freed, dead_node);
__dlm_print_one_lock_resource(res);
}
- dlm_lockres_clear_refmap_bit(dead_node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
} else if (test_bit(dead_node, res->refmap)) {
mlog(0, "%s:%.*s: dead node %u had a ref, but had "
"no locks and had not purged before dying\n", dlm->name,
res->lockname.len, res->lockname.name, dead_node);
- dlm_lockres_clear_refmap_bit(dead_node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
}
/* do not kick thread yet */
@@ -2324,9 +2324,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
if (res->state & DLM_LOCK_RES_DROPPING_REF) {
- mlog(ML_NOTICE, "Ignore %.*s for "
+ mlog(ML_NOTICE, "%s: res %.*s, Skip "
"recovery as it is being freed\n",
- res->lockname.len,
+ dlm->name, res->lockname.len,
res->lockname.name);
} else
dlm_move_lockres_to_recovery_list(dlm,
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 1d6d1d22c47..e73c833fc2a 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -94,24 +94,26 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
{
int bit;
+ assert_spin_locked(&res->spinlock);
+
if (__dlm_lockres_has_locks(res))
return 0;
+ /* Locks are in the process of being created */
+ if (res->inflight_locks)
+ return 0;
+
if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
return 0;
if (res->state & DLM_LOCK_RES_RECOVERING)
return 0;
+ /* Another node has this resource with this node as the master */
bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES)
return 0;
- /*
- * since the bit for dlm->node_num is not set, inflight_locks better
- * be zero
- */
- BUG_ON(res->inflight_locks != 0);
return 1;
}
@@ -185,8 +187,6 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
/* clear our bit from the master's refmap, ignore errors */
ret = dlm_drop_lockres_ref(dlm, res);
if (ret < 0) {
- mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
- res->lockname.len, res->lockname.name, ret);
if (!dlm_is_host_down(ret))
BUG();
}
@@ -209,7 +209,7 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
BUG();
}
- __dlm_unhash_lockres(res);
+ __dlm_unhash_lockres(dlm, res);
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7642d7ca73e..81a4cd22f80 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1692,7 +1692,7 @@ int ocfs2_open_lock(struct inode *inode)
mlog(0, "inode %llu take PRMODE open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- if (ocfs2_mount_local(osb))
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
goto out;
lockres = &OCFS2_I(inode)->ip_open_lockres;
@@ -1718,6 +1718,12 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (write)
+ status = -EROFS;
+ goto out;
+ }
+
if (ocfs2_mount_local(osb))
goto out;
@@ -2092,7 +2098,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
inode->i_gid = be32_to_cpu(lvb->lvb_igid);
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
- inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
+ set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
ocfs2_unpack_timespec(&inode->i_atime,
be64_to_cpu(lvb->lvb_iatime_packed));
ocfs2_unpack_timespec(&inode->i_mtime,
@@ -2298,7 +2304,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
if (ocfs2_is_hard_readonly(osb)) {
if (ex)
status = -EROFS;
- goto bail;
+ goto getbh;
}
if (ocfs2_mount_local(osb))
@@ -2356,7 +2362,7 @@ local:
mlog_errno(status);
goto bail;
}
-
+getbh:
if (ret_bh) {
status = ocfs2_assign_bh(inode, ret_bh, local_bh);
if (status < 0) {
@@ -2628,8 +2634,11 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
BUG_ON(!dl);
- if (ocfs2_is_hard_readonly(osb))
- return -EROFS;
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (ex)
+ return -EROFS;
+ return 0;
+ }
if (ocfs2_mount_local(osb))
return 0;
@@ -2647,7 +2656,7 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- if (!ocfs2_mount_local(osb))
+ if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
}
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 23457b491e8..2f5b92ef0e5 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -832,6 +832,102 @@ out:
return ret;
}
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret;
+ unsigned int is_last = 0, is_data = 0;
+ u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 cpos, cend, clen, hole_size;
+ u64 extoff, extlen;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_extent_rec rec;
+
+ BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE);
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ if (*offset >= inode->i_size) {
+ ret = -ENXIO;
+ goto out_unlock;
+ }
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ if (origin == SEEK_HOLE)
+ *offset = inode->i_size;
+ goto out_unlock;
+ }
+
+ clen = 0;
+ cpos = *offset >> cs_bits;
+ cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size);
+
+ while (cpos < cend && !is_last) {
+ ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
+ &rec, &is_last);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ extoff = cpos;
+ extoff <<= cs_bits;
+
+ if (rec.e_blkno == 0ULL) {
+ clen = hole_size;
+ is_data = 0;
+ } else {
+ clen = le16_to_cpu(rec.e_leaf_clusters) -
+ (cpos - le32_to_cpu(rec.e_cpos));
+ is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
+ }
+
+ if ((!is_data && origin == SEEK_HOLE) ||
+ (is_data && origin == SEEK_DATA)) {
+ if (extoff > *offset)
+ *offset = extoff;
+ goto out_unlock;
+ }
+
+ if (!is_last)
+ cpos += clen;
+ }
+
+ if (origin == SEEK_HOLE) {
+ extoff = cpos;
+ extoff <<= cs_bits;
+ extlen = clen;
+ extlen <<= cs_bits;
+
+ if ((extoff + extlen) > inode->i_size)
+ extlen = inode->i_size - extoff;
+ extoff += extlen;
+ if (extoff > *offset)
+ *offset = extoff;
+ goto out_unlock;
+ }
+
+ ret = -ENXIO;
+
+out_unlock:
+
+ brelse(di_bh);
+
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ocfs2_inode_unlock(inode, 0);
+out:
+ if (ret && ret != -ENXIO)
+ ret = -ENXIO;
+ return ret;
+}
+
int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index e79d41c2c90..67ea57d2fd5 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,6 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 map_start, u64 map_len);
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
+
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
struct ocfs2_extent_list *el,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index de4ea1af041..6e396683c3d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1950,6 +1950,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (ret < 0)
mlog_errno(ret);
+ if (file->f_flags & O_SYNC)
+ handle->h_sync = 1;
+
ocfs2_commit_trans(osb, handle);
out_inode_unlock:
@@ -2052,6 +2055,23 @@ out:
return ret;
}
+static void ocfs2_aiodio_wait(struct inode *inode)
+{
+ wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
+
+ wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
+}
+
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+ int blockmask = inode->i_sb->s_blocksize - 1;
+ loff_t final_size = pos + count;
+
+ if ((pos & blockmask) || (final_size & blockmask))
+ return 1;
+ return 0;
+}
+
static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
struct file *file,
loff_t pos, size_t count,
@@ -2230,6 +2250,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
+ int unaligned_dio = 0;
trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2297,6 +2318,10 @@ relock:
goto out;
}
+ if (direct_io && !is_sync_kiocb(iocb))
+ unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
+ *ppos);
+
/*
* We can't complete the direct I/O as requested, fall back to
* buffered I/O.
@@ -2311,6 +2336,18 @@ relock:
goto relock;
}
+ if (unaligned_dio) {
+ /*
+ * Wait on previous unaligned aio to complete before
+ * proceeding.
+ */
+ ocfs2_aiodio_wait(inode);
+
+ /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
+ atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+ ocfs2_iocb_set_unaligned_aio(iocb);
+ }
+
/*
* To later detect whether a journal commit for sync writes is
* necessary, we sample i_size, and cluster count here.
@@ -2382,8 +2419,12 @@ out_dio:
if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
have_alloc_sem = 0;
+ unaligned_dio = 0;
}
+ if (unaligned_dio)
+ atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
+
out:
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
@@ -2591,6 +2632,57 @@ bail:
return ret;
}
+/* Refer generic_file_llseek_unlocked() */
+static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ switch (origin) {
+ case SEEK_SET:
+ break;
+ case SEEK_END:
+ offset += inode->i_size;
+ break;
+ case SEEK_CUR:
+ if (offset == 0) {
+ offset = file->f_pos;
+ goto out;
+ }
+ offset += file->f_pos;
+ break;
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ ret = ocfs2_seek_data_hole_offset(file, &offset, origin);
+ if (ret)
+ goto out;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+ ret = -EINVAL;
+ if (!ret && offset > inode->i_sb->s_maxbytes)
+ ret = -EINVAL;
+ if (ret)
+ goto out;
+
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+
+out:
+ mutex_unlock(&inode->i_mutex);
+ if (ret)
+ return ret;
+ return offset;
+}
+
const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
@@ -2615,7 +2707,7 @@ const struct inode_operations ocfs2_special_file_iops = {
* ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
*/
const struct file_operations ocfs2_fops = {
- .llseek = generic_file_llseek,
+ .llseek = ocfs2_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.mmap = ocfs2_mmap,
@@ -2663,7 +2755,7 @@ const struct file_operations ocfs2_dops = {
* the cluster.
*/
const struct file_operations ocfs2_fops_no_plocks = {
- .llseek = generic_file_llseek,
+ .llseek = ocfs2_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.mmap = ocfs2_mmap,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index b4c8bb6b8d2..17454a904d7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -291,7 +291,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)le64_to_cpu(fe->i_blkno));
- inode->i_nlink = ocfs2_read_links_count(fe);
+ set_nlink(inode, ocfs2_read_links_count(fe));
trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno,
le32_to_cpu(fe->i_flags));
@@ -951,7 +951,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
trace_ocfs2_cleanup_delete_inode(
(unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
if (sync_data)
- write_inode_now(inode, 1);
+ filemap_write_and_wait(inode->i_mapping);
truncate_inode_pages(&inode->i_data, 0);
}
@@ -1290,7 +1290,7 @@ void ocfs2_refresh_inode(struct inode *inode,
OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
ocfs2_set_inode_flags(inode);
i_size_write(inode, le64_to_cpu(fe->i_size));
- inode->i_nlink = ocfs2_read_links_count(fe);
+ set_nlink(inode, ocfs2_read_links_count(fe));
inode->i_uid = le32_to_cpu(fe->i_uid);
inode->i_gid = le32_to_cpu(fe->i_gid);
inode->i_mode = le16_to_cpu(fe->i_mode);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1c508b149b3..88924a3133f 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,6 +43,9 @@ struct ocfs2_inode_info
/* protects extended attribute changes on this inode */
struct rw_semaphore ip_xattr_sem;
+ /* Number of outstanding AIO's which are not page aligned */
+ atomic_t ip_unaligned_aio;
+
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index bc91072b721..726ff265b29 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -122,7 +122,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
(OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
if (!capable(CAP_LINUX_IMMUTABLE))
- goto bail_unlock;
+ goto bail_commit;
}
ocfs2_inode->ip_attr = flags;
@@ -132,6 +132,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
if (status < 0)
mlog_errno(status);
+bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
ocfs2_inode_unlock(inode, 1);
@@ -381,7 +382,7 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
if (!oifi) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_err;
}
if (o2info_from_user(*oifi, req))
@@ -431,7 +432,7 @@ bail:
o2info_set_request_error(&oifi->ifi_req, req);
kfree(oifi);
-
+out_err:
return status;
}
@@ -666,7 +667,7 @@ int ocfs2_info_handle_freefrag(struct inode *inode,
if (!oiff) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_err;
}
if (o2info_from_user(*oiff, req))
@@ -716,7 +717,7 @@ bail:
o2info_set_request_error(&oiff->iff_req, req);
kfree(oiff);
-
+out_err:
return status;
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 295d56454e8..0a42ae96dca 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1544,9 +1544,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
/* we need to run complete recovery for offline orphan slots */
ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
- mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
- node_num, slot_num,
- MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+ printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\
+ "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -1601,6 +1601,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
jbd2_journal_destroy(journal);
+ printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\
+ "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
done:
/* drop the lock on this nodes journal */
if (got_lock)
@@ -1808,6 +1811,20 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
* every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
* is done to catch any orphans that are left over in orphan directories.
*
+ * It scans all slots, even ones that are in use. It does so to handle the
+ * case described below:
+ *
+ * Node 1 has an inode it was using. The dentry went away due to memory
+ * pressure. Node 1 closes the inode, but it's on the free list. The node
+ * has the open lock.
+ * Node 2 unlinks the inode. It grabs the dentry lock to notify others,
+ * but node 1 has no dentry and doesn't get the message. It trylocks the
+ * open lock, sees that another node has a PR, and does nothing.
+ * Later node 2 runs its orphan dir. It igets the inode, trylocks the
+ * open lock, sees the PR still, and does nothing.
+ * Basically, we have to trigger an orphan iput on node 1. The only way
+ * for this to happen is if node 1 runs node 2's orphan dir.
+ *
* ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
* seconds. It gets an EX lock on os_lockres and checks sequence number
* stored in LVB. If the sequence number has changed, it means some other
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 68cf2f6d3c6..a3385b63ff5 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -441,10 +441,11 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
- * update on dir + index leaf + dx root update for free list */
+ * update on dir + index leaf + dx root update for free list +
+ * previous dirblock update in the free list */
static inline int ocfs2_link_credits(struct super_block *sb)
{
- return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
+ return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
ocfs2_quota_trans_credits(sb);
}
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 3e9393ca39e..9cd41083e99 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -61,7 +61,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
struct page *page)
{
- int ret;
+ int ret = VM_FAULT_NOPAGE;
struct inode *inode = file->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
loff_t pos = page_offset(page);
@@ -71,32 +71,25 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
void *fsdata;
loff_t size = i_size_read(inode);
- /*
- * Another node might have truncated while we were waiting on
- * cluster locks.
- * We don't check size == 0 before the shift. This is borrowed
- * from do_generic_file_read.
- */
last_index = (size - 1) >> PAGE_CACHE_SHIFT;
- if (unlikely(!size || page->index > last_index)) {
- ret = -EINVAL;
- goto out;
- }
/*
- * The i_size check above doesn't catch the case where nodes
- * truncated and then re-extended the file. We'll re-check the
- * page mapping after taking the page lock inside of
- * ocfs2_write_begin_nolock().
+ * There are cases that lead to the page no longer bebongs to the
+ * mapping.
+ * 1) pagecache truncates locally due to memory pressure.
+ * 2) pagecache truncates when another is taking EX lock against
+ * inode lock. see ocfs2_data_convert_worker.
+ *
+ * The i_size check doesn't catch the case where nodes truncated and
+ * then re-extended the file. We'll re-check the page mapping after
+ * taking the page lock inside of ocfs2_write_begin_nolock().
+ *
+ * Let VM retry with these cases.
*/
- if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
- /*
- * the page has been umapped in ocfs2_data_downconvert_worker.
- * So return 0 here and let VFS retry.
- */
- ret = 0;
+ if ((page->mapping != inode->i_mapping) ||
+ (!PageUptodate(page)) ||
+ (page_offset(page) >= size))
goto out;
- }
/*
* Call ocfs2_write_begin() and ocfs2_write_end() to take
@@ -116,17 +109,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
- ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
- fsdata);
- if (ret < 0) {
- mlog_errno(ret);
+ if (!locked_page) {
+ ret = VM_FAULT_NOPAGE;
goto out;
}
+ ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
+ fsdata);
BUG_ON(ret != len);
- ret = 0;
+ ret = VM_FAULT_LOCKED;
out:
return ret;
}
@@ -168,8 +165,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
out:
ocfs2_unblock_signals(&oldset);
- if (ret)
- ret = VM_FAULT_SIGBUS;
return ret;
}
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index d53cb706f14..184c76b8c29 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -745,7 +745,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
*/
ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
new_phys_cpos);
- if (!new_phys_cpos) {
+ if (!*new_phys_cpos) {
ret = -ENOSPC;
goto out_commit;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 53aa41ed7bf..a8b2bfea574 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -199,9 +199,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
* these are used by the support functions here and in
* callers. */
if (S_ISDIR(mode))
- inode->i_nlink = 2;
- else
- inode->i_nlink = 1;
+ set_nlink(inode, 2);
inode_init_owner(inode, dir, mode);
dquot_initialize(inode);
return inode;
@@ -1379,7 +1377,7 @@ static int ocfs2_rename(struct inode *old_dir,
}
if (new_inode) {
- new_inode->i_nlink--;
+ drop_nlink(new_inode);
new_inode->i_ctime = CURRENT_TIME;
}
old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
@@ -1387,9 +1385,9 @@ static int ocfs2_rename(struct inode *old_dir,
if (update_dot_dot) {
status = ocfs2_update_entry(old_inode, handle,
&old_inode_dot_dot_res, new_dir);
- old_dir->i_nlink--;
+ drop_nlink(old_dir);
if (new_inode) {
- new_inode->i_nlink--;
+ drop_nlink(new_inode);
} else {
inc_nlink(new_dir);
mark_inode_dirty(new_dir);
@@ -2018,7 +2016,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
if (S_ISDIR(inode->i_mode))
ocfs2_add_links_count(orphan_fe, 1);
- orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
ocfs2_journal_dirty(handle, orphan_dir_bh);
status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
@@ -2116,7 +2114,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
if (S_ISDIR(inode->i_mode))
ocfs2_add_links_count(orphan_fe, -1);
- orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
ocfs2_journal_dirty(handle, orphan_dir_bh);
leave:
@@ -2282,7 +2280,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
goto leave;
}
- inode->i_nlink = 0;
+ clear_nlink(inode);
/* do the real work now. */
status = __ocfs2_mknod_locked(dir, inode,
0, &new_di_bh, parent_di_bh, handle,
@@ -2437,7 +2435,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
di = (struct ocfs2_dinode *)di_bh->b_data;
le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
di->i_orphaned_slot = 0;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
ocfs2_set_links_count(di, inode->i_nlink);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 409285854f6..d355e6e36b3 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -836,18 +836,65 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
{
- __test_and_set_bit_le(bit, bitmap);
+ __set_bit_le(bit, bitmap);
}
#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
{
- __test_and_clear_bit_le(bit, bitmap);
+ __clear_bit_le(bit, bitmap);
}
#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
#define ocfs2_test_bit test_bit_le
#define ocfs2_find_next_zero_bit find_next_zero_bit_le
#define ocfs2_find_next_bit find_next_bit_le
+
+static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr)
+{
+#if BITS_PER_LONG == 64
+ *bit += ((unsigned long) addr & 7UL) << 3;
+ addr = (void *) ((unsigned long) addr & ~7UL);
+#elif BITS_PER_LONG == 32
+ *bit += ((unsigned long) addr & 3UL) << 3;
+ addr = (void *) ((unsigned long) addr & ~3UL);
+#else
+#error "how many bits you are?!"
+#endif
+ return addr;
+}
+
+static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ ocfs2_set_bit(bit, bitmap);
+}
+
+static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ ocfs2_clear_bit(bit, bitmap);
+}
+
+static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ return ocfs2_test_bit(bit, bitmap);
+}
+
+static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max,
+ int start)
+{
+ int fix = 0, ret, tmpmax;
+ bitmap = correct_addr_and_bit_unaligned(&fix, bitmap);
+ tmpmax = max + fix;
+ start += fix;
+
+ ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
+}
+
#endif /* OCFS2_H */
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dc8007fc924..f100bf70a90 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -404,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
int status = 0;
struct ocfs2_quota_recovery *rec;
- mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for "
+ "slot %u\n", osb->dev_str, slot_num);
+
rec = ocfs2_alloc_quota_recovery();
if (!rec)
return ERR_PTR(-ENOMEM);
@@ -549,8 +551,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
goto out_commit;
}
lock_buffer(qbh);
- WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
- ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+ WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap));
+ ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, 1);
unlock_buffer(qbh);
ocfs2_journal_dirty(handle, qbh);
@@ -596,7 +598,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
struct inode *lqinode;
unsigned int flags;
- mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
+ "slot %u\n", osb->dev_str, slot_num);
+
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
for (type = 0; type < MAXQUOTAS; type++) {
if (list_empty(&(rec->r_list[type])))
@@ -612,8 +616,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
/* Someone else is holding the lock? Then he must be
* doing the recovery. Just skip the file... */
if (status == -EAGAIN) {
- mlog(ML_NOTICE, "skipping quota recovery for slot %d "
- "because quota file is locked.\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Skipping quota recovery on "
+ "device (%s) for slot %d because quota file is "
+ "locked.\n", osb->dev_str, slot_num);
status = 0;
goto out_put;
} else if (status < 0) {
@@ -944,7 +949,7 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
* ol_quota_entries_per_block(sb);
}
- found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
+ found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0);
/* We failed? */
if (found == len) {
mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
@@ -1208,7 +1213,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
struct ocfs2_local_disk_chunk *dchunk;
dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
- ocfs2_set_bit(*offset, dchunk->dqc_bitmap);
+ ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, -1);
}
@@ -1289,7 +1294,7 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
(od->dq_chunk->qc_headerbh->b_data);
/* Mark structure as freed */
lock_buffer(od->dq_chunk->qc_headerbh);
- ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
+ ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, 1);
unlock_buffer(od->dq_chunk->qc_headerbh);
ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 26fc0014d50..1424c151ccc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -493,8 +493,8 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
goto bail;
}
} else
- mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
- slot);
+ printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
+ "allocated to this node!\n", slot, osb->dev_str);
ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 19965b00c43..94368017edb 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -28,6 +28,7 @@
#include "cluster/masklog.h"
#include "cluster/nodemanager.h"
#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
#include "stackglue.h"
@@ -256,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
}
/*
+ * Check if this node is heartbeating and is connected to all other
+ * heartbeating nodes.
+ */
+static int o2cb_cluster_check(void)
+{
+ u8 node_num;
+ int i;
+ unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+ node_num = o2nm_this_node();
+ if (node_num == O2NM_MAX_NODES) {
+ printk(KERN_ERR "o2cb: This node has not been configured.\n");
+ return -EINVAL;
+ }
+
+ /*
+ * o2dlm expects o2net sockets to be created. If not, then
+ * dlm_join_domain() fails with a stack of errors which are both cryptic
+ * and incomplete. The idea here is to detect upfront whether we have
+ * managed to connect to all nodes or not. If not, then list the nodes
+ * to allow the user to check the configuration (incorrect IP, firewall,
+ * etc.) Yes, this is racy. But its not the end of the world.
+ */
+#define O2CB_MAP_STABILIZE_COUNT 60
+ for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
+ o2hb_fill_node_map(hbmap, sizeof(hbmap));
+ if (!test_bit(node_num, hbmap)) {
+ printk(KERN_ERR "o2cb: %s heartbeat has not been "
+ "started.\n", (o2hb_global_heartbeat_active() ?
+ "Global" : "Local"));
+ return -EINVAL;
+ }
+ o2net_fill_node_map(netmap, sizeof(netmap));
+ /* Force set the current node to allow easy compare */
+ set_bit(node_num, netmap);
+ if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+ return 0;
+ if (i < O2CB_MAP_STABILIZE_COUNT)
+ msleep(1000);
+ }
+
+ printk(KERN_ERR "o2cb: This node could not connect to nodes:");
+ i = -1;
+ while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
+ i + 1)) < O2NM_MAX_NODES) {
+ if (!test_bit(i, netmap))
+ printk(" %u", i);
+ }
+ printk(".\n");
+
+ return -ENOTCONN;
+}
+
+/*
* Called from the dlm when it's about to evict a node. This is how the
* classic stack signals node death.
*/
@@ -263,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data)
{
struct ocfs2_cluster_connection *conn = data;
- mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
- node_num, conn->cc_namelen, conn->cc_name);
+ printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n",
+ node_num, conn->cc_namelen, conn->cc_name);
conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
}
@@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
BUG_ON(conn == NULL);
BUG_ON(conn->cc_proto == NULL);
- /* for now we only have one cluster/node, make sure we see it
- * in the heartbeat universe */
- if (!o2hb_check_local_node_heartbeating()) {
- if (o2hb_global_heartbeat_active())
- mlog(ML_ERROR, "Global heartbeat not started\n");
- rc = -EINVAL;
+ /* Ensure cluster stack is up and all nodes are connected */
+ rc = o2cb_cluster_check();
+ if (rc) {
+ printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
+ "before retrying.\n");
goto out;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 56f61027236..4994f8b0e60 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -54,6 +54,7 @@
#include "ocfs1_fs_compat.h"
#include "alloc.h"
+#include "aops.h"
#include "blockcheck.h"
#include "dlmglue.h"
#include "export.h"
@@ -1107,9 +1108,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
ocfs2_set_ro_flag(osb, 1);
- printk(KERN_NOTICE "Readonly device detected. No cluster "
- "services will be utilized for this mount. Recovery "
- "will be skipped.\n");
+ printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
+ "Cluster services will not be used for this mount. "
+ "Recovery will be skipped.\n", osb->dev_str);
}
if (!ocfs2_is_hard_readonly(osb)) {
@@ -1616,12 +1617,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
return 0;
}
+wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+
static int __init ocfs2_init(void)
{
- int status;
+ int status, i;
ocfs2_print_version();
+ for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
+ init_waitqueue_head(&ocfs2__ioend_wq[i]);
+
status = init_ocfs2_uptodate_cache();
if (status < 0) {
mlog_errno(status);
@@ -1760,7 +1766,7 @@ static void ocfs2_inode_init_once(void *data)
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
oi->ip_dir_start_lookup = 0;
-
+ atomic_set(&oi->ip_unaligned_aio, 0);
init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
@@ -1974,7 +1980,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
* If we failed before we got a uuid_str yet, we can't stop
* heartbeat. Otherwise, do it.
*/
- if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+ if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
+ !ocfs2_is_hard_readonly(osb))
hangup_needed = 1;
if (osb->cconn)
@@ -2353,7 +2360,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog_errno(status);
goto bail;
}
- cleancache_init_shared_fs((char *)&uuid_net_key, sb);
+ cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
bail:
return status;
@@ -2462,8 +2469,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
goto finally;
}
} else {
- mlog(ML_NOTICE, "File system was not unmounted cleanly, "
- "recovering volume.\n");
+ printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
+ "unmounted cleanly, recovering it.\n", osb->dev_str);
}
local = ocfs2_mount_local(osb);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 40c7de084c1..74ff74cf78f 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -31,17 +31,15 @@ extern struct workqueue_struct *ocfs2_wq;
int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
-void __ocfs2_error(struct super_block *sb,
- const char *function,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
+__printf(3, 4)
+void __ocfs2_error(struct super_block *sb, const char *function,
+ const char *fmt, ...);
#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
-void __ocfs2_abort(struct super_block *sb,
- const char *function,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
+__printf(3, 4)
+void __ocfs2_abort(struct super_block *sb, const char *function,
+ const char *fmt, ...);
#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 81ecf9c0bf0..aa9e8777b09 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2376,16 +2376,18 @@ static int ocfs2_remove_value_outside(struct inode*inode,
}
ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
- if (ret < 0) {
- mlog_errno(ret);
- break;
- }
ocfs2_commit_trans(osb, ctxt.handle);
if (ctxt.meta_ac) {
ocfs2_free_alloc_context(ctxt.meta_ac);
ctxt.meta_ac = NULL;
}
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
}
if (ctxt.meta_ac)
@@ -7185,20 +7187,9 @@ int ocfs2_init_security_and_acl(struct inode *dir,
{
int ret = 0;
struct buffer_head *dir_bh = NULL;
- struct ocfs2_security_xattr_info si = {
- .enable = 1,
- };
- ret = ocfs2_init_security_get(inode, dir, qstr, &si);
+ ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
if (!ret) {
- ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
- si.name, si.value, si.value_len,
- XATTR_CREATE);
- if (ret) {
- mlog_errno(ret);
- goto leave;
- }
- } else if (ret != -EOPNOTSUPP) {
mlog_errno(ret);
goto leave;
}
@@ -7255,6 +7246,22 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
+int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
+{
+ const struct xattr *xattr;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, XATTR_CREATE);
+ if (err)
+ break;
+ }
+ return err;
+}
+
int ocfs2_init_security_get(struct inode *inode,
struct inode *dir,
const struct qstr *qstr,
@@ -7263,8 +7270,13 @@ int ocfs2_init_security_get(struct inode *inode,
/* check whether ocfs2 support feature xattr */
if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
return -EOPNOTSUPP;
- return security_inode_init_security(inode, dir, qstr, &si->name,
- &si->value, &si->value_len);
+ if (si)
+ return security_old_inode_init_security(inode, dir, qstr,
+ &si->name, &si->value,
+ &si->value_len);
+
+ return security_inode_init_security(inode, dir, qstr,
+ &ocfs2_initxattrs, NULL);
}
int ocfs2_init_security_set(handle_t *handle,
diff --git a/fs/open.c b/fs/open.c
index f7119210945..22c41b543f2 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -685,6 +685,10 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
if (error)
goto cleanup_all;
+ error = break_lease(inode, f->f_flags);
+ if (error)
+ goto cleanup_all;
+
if (!open && f->f_op)
open = f->f_op->open;
if (open) {
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index a2a5bff774e..e4e0ff7962e 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -242,7 +242,7 @@ found:
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
inode->i_op = &openprom_inode_operations;
inode->i_fop = &openprom_operations;
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
break;
case op_inode_prop:
if (!strcmp(dp->name, "options") && (len == 17) &&
@@ -251,7 +251,7 @@ found:
else
inode->i_mode = S_IFREG | S_IRUGO;
inode->i_fop = &openpromfs_prop_ops;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_size = ent_oi->u.prop->length;
break;
}
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index af9fdf04676..bd8ae788f68 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -49,18 +49,20 @@
#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a)
#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a)
-__attribute__ ((format (printf, 3, 4)))
-static void _ldm_printk (const char *level, const char *function,
- const char *fmt, ...)
+static __printf(3, 4)
+void _ldm_printk(const char *level, const char *function, const char *fmt, ...)
{
- static char buf[128];
+ struct va_format vaf;
va_list args;
va_start (args, fmt);
- vsnprintf (buf, sizeof (buf), fmt, args);
- va_end (args);
- printk ("%s%s(): %s\n", level, function, buf);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk("%s%s(): %pV\n", level, function, &vaf);
+
+ va_end(args);
}
/**
diff --git a/fs/pipe.c b/fs/pipe.c
index 0e0be1dc0f8..4065f07366b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1254,6 +1254,7 @@ out:
static const struct super_operations pipefs_ops = {
.destroy_inode = free_inode_nonrcu,
+ .statfs = simple_statfs,
};
/*
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 10027b42b7e..cea4623f1ed 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -218,6 +218,8 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
const struct posix_acl_entry *pa, *pe, *mask_obj;
int found = 0;
+ want &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK;
+
FOREACH_ACL_ENTRY(pa, acl, pe) {
switch(pa->e_tag) {
case ACL_USER_OBJ:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5eb02069e1b..851ba3dcdc2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1107,13 +1107,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
goto err_sighand;
}
- if (oom_adjust != task->signal->oom_adj) {
- if (oom_adjust == OOM_DISABLE)
- atomic_inc(&task->mm->oom_disable_count);
- if (task->signal->oom_adj == OOM_DISABLE)
- atomic_dec(&task->mm->oom_disable_count);
- }
-
/*
* Warn that /proc/pid/oom_adj is deprecated, see
* Documentation/feature-removal-schedule.txt.
@@ -1215,12 +1208,6 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
goto err_sighand;
}
- if (oom_score_adj != task->signal->oom_score_adj) {
- if (oom_score_adj == OOM_SCORE_ADJ_MIN)
- atomic_inc(&task->mm->oom_disable_count);
- if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
- atomic_dec(&task->mm->oom_disable_count);
- }
task->signal->oom_score_adj = oom_score_adj;
if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
task->signal->oom_score_adj_min = oom_score_adj;
@@ -2261,7 +2248,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
ei = PROC_I(inode);
inode->i_mode = p->mode;
if (S_ISDIR(inode->i_mode))
- inode->i_nlink = 2; /* Use getattr to fix if necessary */
+ set_nlink(inode, 2); /* Use getattr to fix if necessary */
if (p->iop)
inode->i_op = p->iop;
if (p->fop)
@@ -2655,7 +2642,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
inode->i_mode = p->mode;
if (S_ISDIR(inode->i_mode))
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
if (S_ISLNK(inode->i_mode))
inode->i_size = 64;
if (p->iop)
@@ -2994,8 +2981,8 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
inode->i_fop = &proc_tgid_base_operations;
inode->i_flags|=S_IMMUTABLE;
- inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
- ARRAY_SIZE(tgid_base_stuff));
+ set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff,
+ ARRAY_SIZE(tgid_base_stuff)));
d_set_d_op(dentry, &pid_dentry_operations);
@@ -3246,8 +3233,8 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
inode->i_fop = &proc_tid_base_operations;
inode->i_flags|=S_IMMUTABLE;
- inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
- ARRAY_SIZE(tid_base_stuff));
+ set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff,
+ ARRAY_SIZE(tid_base_stuff)));
d_set_d_op(dentry, &pid_dentry_operations);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 9d99131d0d6..10090d9c7ad 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -283,7 +283,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct inode *inode = dentry->d_inode;
struct proc_dir_entry *de = PROC_I(inode)->pde;
if (de && de->nlink)
- inode->i_nlink = de->nlink;
+ set_nlink(inode, de->nlink);
generic_fillattr(inode, stat);
return 0;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 7ed72d6c1c6..7737c5468a4 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -445,7 +445,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
if (de->size)
inode->i_size = de->size;
if (de->nlink)
- inode->i_nlink = de->nlink;
+ set_nlink(inode, de->nlink);
if (de->proc_iops)
inode->i_op = de->proc_iops;
if (de->proc_fops) {
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 586174168e2..80e4645f799 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -131,12 +131,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(i.freeswap),
K(global_page_state(NR_FILE_DIRTY)),
K(global_page_state(NR_WRITEBACK)),
- K(global_page_state(NR_ANON_PAGES)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ K(global_page_state(NR_ANON_PAGES)
+ global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
- HPAGE_PMD_NR
+ HPAGE_PMD_NR),
+#else
+ K(global_page_state(NR_ANON_PAGES)),
#endif
- ),
K(global_page_state(NR_FILE_MAPPED)),
K(global_page_state(NR_SHMEM)),
K(global_page_state(NR_SLAB_RECLAIMABLE) +
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 1a77dbef226..a6b62173d4c 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -3,6 +3,7 @@
*/
#include <linux/init.h>
#include <linux/sysctl.h>
+#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/namei.h>
@@ -14,6 +15,15 @@ static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
+void proc_sys_poll_notify(struct ctl_table_poll *poll)
+{
+ if (!poll)
+ return;
+
+ atomic_inc(&poll->event);
+ wake_up_interruptible(&poll->wait);
+}
+
static struct inode *proc_sys_make_inode(struct super_block *sb,
struct ctl_table_header *head, struct ctl_table *table)
{
@@ -39,7 +49,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
inode->i_fop = &proc_sys_file_operations;
} else {
inode->i_mode |= S_IFDIR;
- inode->i_nlink = 0;
+ clear_nlink(inode);
inode->i_op = &proc_sys_dir_operations;
inode->i_fop = &proc_sys_dir_file_operations;
}
@@ -176,6 +186,39 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
}
+static int proc_sys_open(struct inode *inode, struct file *filp)
+{
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+ if (table->poll)
+ filp->private_data = proc_sys_poll_event(table->poll);
+
+ return 0;
+}
+
+static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
+{
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ unsigned long event = (unsigned long)filp->private_data;
+ unsigned int ret = DEFAULT_POLLMASK;
+
+ if (!table->proc_handler)
+ goto out;
+
+ if (!table->poll)
+ goto out;
+
+ poll_wait(filp, &table->poll->wait, wait);
+
+ if (event != atomic_read(&table->poll->event)) {
+ filp->private_data = proc_sys_poll_event(table->poll);
+ ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+ }
+
+out:
+ return ret;
+}
static int proc_sys_fill_cache(struct file *filp, void *dirent,
filldir_t filldir,
@@ -364,12 +407,15 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
}
static const struct file_operations proc_sys_file_operations = {
+ .open = proc_sys_open,
+ .poll = proc_sys_poll,
.read = proc_sys_read,
.write = proc_sys_write,
.llseek = default_llseek,
};
static const struct file_operations proc_sys_dir_file_operations = {
+ .read = generic_read_dir,
.readdir = proc_sys_readdir,
.llseek = generic_file_llseek,
};
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9a8a2b77b87..03102d97818 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -91,20 +91,18 @@ static struct file_system_type proc_fs_type = {
void __init proc_root_init(void)
{
- struct vfsmount *mnt;
int err;
proc_init_inodecache();
err = register_filesystem(&proc_fs_type);
if (err)
return;
- mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
- if (IS_ERR(mnt)) {
+ err = pid_ns_prepare_proc(&init_pid_ns);
+ if (err) {
unregister_filesystem(&proc_fs_type);
return;
}
- init_pid_ns.proc_mnt = mnt;
proc_symlink("mounts", NULL, "self/mounts");
proc_net_init();
@@ -209,5 +207,5 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
void pid_ns_release_proc(struct pid_namespace *ns)
{
- mntput(ns->proc_mnt);
+ kern_unmount(ns->proc_mnt);
}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 9758b654a1b..0855e6f2039 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -10,6 +10,7 @@
#include <linux/time.h>
#include <linux/irqnr.h>
#include <asm/cputime.h>
+#include <linux/tick.h>
#ifndef arch_irq_stat_cpu
#define arch_irq_stat_cpu(cpu) 0
@@ -21,6 +22,35 @@
#define arch_idle_time(cpu) 0
#endif
+static cputime64_t get_idle_time(int cpu)
+{
+ u64 idle_time = get_cpu_idle_time_us(cpu, NULL);
+ cputime64_t idle;
+
+ if (idle_time == -1ULL) {
+ /* !NO_HZ so we can rely on cpustat.idle */
+ idle = kstat_cpu(cpu).cpustat.idle;
+ idle = cputime64_add(idle, arch_idle_time(cpu));
+ } else
+ idle = usecs_to_cputime64(idle_time);
+
+ return idle;
+}
+
+static cputime64_t get_iowait_time(int cpu)
+{
+ u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL);
+ cputime64_t iowait;
+
+ if (iowait_time == -1ULL)
+ /* !NO_HZ so we can rely on cpustat.iowait */
+ iowait = kstat_cpu(cpu).cpustat.iowait;
+ else
+ iowait = usecs_to_cputime64(iowait_time);
+
+ return iowait;
+}
+
static int show_stat(struct seq_file *p, void *v)
{
int i, j;
@@ -42,9 +72,8 @@ static int show_stat(struct seq_file *p, void *v)
user = cputime64_add(user, kstat_cpu(i).cpustat.user);
nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
system = cputime64_add(system, kstat_cpu(i).cpustat.system);
- idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
- idle = cputime64_add(idle, arch_idle_time(i));
- iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
+ idle = cputime64_add(idle, get_idle_time(i));
+ iowait = cputime64_add(iowait, get_iowait_time(i));
irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
@@ -76,14 +105,12 @@ static int show_stat(struct seq_file *p, void *v)
(unsigned long long)cputime64_to_clock_t(guest),
(unsigned long long)cputime64_to_clock_t(guest_nice));
for_each_online_cpu(i) {
-
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
user = kstat_cpu(i).cpustat.user;
nice = kstat_cpu(i).cpustat.nice;
system = kstat_cpu(i).cpustat.system;
- idle = kstat_cpu(i).cpustat.idle;
- idle = cputime64_add(idle, arch_idle_time(i));
- iowait = kstat_cpu(i).cpustat.iowait;
+ idle = get_idle_time(i);
+ iowait = get_iowait_time(i);
irq = kstat_cpu(i).cpustat.irq;
softirq = kstat_cpu(i).cpustat.softirq;
steal = kstat_cpu(i).cpustat.steal;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5afaa58a863..e418c5abdb0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -44,6 +44,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
"VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n"
"VmLck:\t%8lu kB\n"
+ "VmPin:\t%8lu kB\n"
"VmHWM:\t%8lu kB\n"
"VmRSS:\t%8lu kB\n"
"VmData:\t%8lu kB\n"
@@ -55,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
hiwater_vm << (PAGE_SHIFT-10),
(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
+ mm->pinned_vm << (PAGE_SHIFT-10),
hiwater_rss << (PAGE_SHIFT-10),
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
@@ -1039,6 +1041,9 @@ static int show_numa_map(struct seq_file *m, void *v)
seq_printf(m, " stack");
}
+ if (is_vm_hugetlb_page(vma))
+ seq_printf(m, " huge");
+
walk_page_range(vma->vm_start, vma->vm_end, &walk);
if (!md->pages)
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cd99bf55765..b0f450a2bb7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
#include <linux/user.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
+#include <linux/export.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 893b961dcfd..379a02dc121 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -24,6 +24,7 @@
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
+#include <linux/list.h>
#include <linux/string.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
@@ -32,13 +33,18 @@
#include <linux/magic.h>
#include <linux/pstore.h>
#include <linux/slab.h>
+#include <linux/spinlock.h>
#include <linux/uaccess.h>
#include "internal.h"
#define PSTORE_NAMELEN 64
+static DEFINE_SPINLOCK(allpstore_lock);
+static LIST_HEAD(allpstore);
+
struct pstore_private {
+ struct list_head list;
struct pstore_info *psi;
enum pstore_type_id type;
u64 id;
@@ -81,8 +87,16 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
static void pstore_evict_inode(struct inode *inode)
{
+ struct pstore_private *p = inode->i_private;
+ unsigned long flags;
+
end_writeback(inode);
- kfree(inode->i_private);
+ if (p) {
+ spin_lock_irqsave(&allpstore_lock, flags);
+ list_del(&p->list);
+ spin_unlock_irqrestore(&allpstore_lock, flags);
+ kfree(p);
+ }
}
static const struct inode_operations pstore_dir_inode_operations = {
@@ -182,9 +196,23 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
struct dentry *root = pstore_sb->s_root;
struct dentry *dentry;
struct inode *inode;
- int rc;
+ int rc = 0;
char name[PSTORE_NAMELEN];
- struct pstore_private *private;
+ struct pstore_private *private, *pos;
+ unsigned long flags;
+
+ spin_lock_irqsave(&allpstore_lock, flags);
+ list_for_each_entry(pos, &allpstore, list) {
+ if (pos->type == type &&
+ pos->id == id &&
+ pos->psi == psi) {
+ rc = -EEXIST;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&allpstore_lock, flags);
+ if (rc)
+ return rc;
rc = -ENOMEM;
inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
@@ -229,6 +257,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
d_add(dentry, inode);
+ spin_lock_irqsave(&allpstore_lock, flags);
+ list_add(&private->list, &allpstore);
+ spin_unlock_irqrestore(&allpstore_lock, flags);
+
mutex_unlock(&root->d_inode->i_mutex);
return 0;
@@ -277,7 +309,7 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent)
goto fail;
}
- pstore_get_records();
+ pstore_get_records(0);
return 0;
fail:
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 611c1b3c46f..3bde461c3f3 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -1,5 +1,5 @@
extern void pstore_set_kmsg_bytes(int);
-extern void pstore_get_records(void);
+extern void pstore_get_records(int);
extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
char *data, size_t size,
struct timespec time, struct pstore_info *psi);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index c5300ec3169..57bbf9078ac 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -25,12 +25,30 @@
#include <linux/module.h>
#include <linux/pstore.h>
#include <linux/string.h>
+#include <linux/timer.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/hardirq.h>
+#include <linux/workqueue.h>
#include "internal.h"
/*
+ * We defer making "oops" entries appear in pstore - see
+ * whether the system is actually still running well enough
+ * to let someone see the entry
+ */
+#define PSTORE_INTERVAL (60 * HZ)
+
+static int pstore_new_entry;
+
+static void pstore_timefunc(unsigned long);
+static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0);
+
+static void pstore_dowork(struct work_struct *);
+static DECLARE_WORK(pstore_work, pstore_dowork);
+
+/*
* pstore_lock just protects "psinfo" during
* calls to pstore_register()
*/
@@ -69,15 +87,22 @@ static void pstore_dump(struct kmsg_dumper *dumper,
unsigned long size, total = 0;
char *dst, *why;
u64 id;
- int hsize;
+ int hsize, ret;
unsigned int part = 1;
+ unsigned long flags = 0;
+ int is_locked = 0;
if (reason < ARRAY_SIZE(reason_str))
why = reason_str[reason];
else
why = "Unknown";
- mutex_lock(&psinfo->buf_mutex);
+ if (in_nmi()) {
+ is_locked = spin_trylock(&psinfo->buf_lock);
+ if (!is_locked)
+ pr_err("pstore dump routine blocked in NMI, may corrupt error record\n");
+ } else
+ spin_lock_irqsave(&psinfo->buf_lock, flags);
oopscount++;
while (total < kmsg_bytes) {
dst = psinfo->buf;
@@ -97,18 +122,20 @@ static void pstore_dump(struct kmsg_dumper *dumper,
memcpy(dst, s1 + s1_start, l1_cpy);
memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
- id = psinfo->write(PSTORE_TYPE_DMESG, part,
+ ret = psinfo->write(PSTORE_TYPE_DMESG, &id, part,
hsize + l1_cpy + l2_cpy, psinfo);
- if (reason == KMSG_DUMP_OOPS && pstore_is_mounted())
- pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id,
- psinfo->buf, hsize + l1_cpy + l2_cpy,
- CURRENT_TIME, psinfo);
+ if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
+ pstore_new_entry = 1;
l1 -= l1_cpy;
l2 -= l2_cpy;
total += l1_cpy + l2_cpy;
part++;
}
- mutex_unlock(&psinfo->buf_mutex);
+ if (in_nmi()) {
+ if (is_locked)
+ spin_unlock(&psinfo->buf_lock);
+ } else
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}
static struct kmsg_dumper pstore_dumper = {
@@ -140,6 +167,7 @@ int pstore_register(struct pstore_info *psi)
}
psinfo = psi;
+ mutex_init(&psinfo->read_mutex);
spin_unlock(&pstore_lock);
if (owner && !try_module_get(owner)) {
@@ -148,21 +176,27 @@ int pstore_register(struct pstore_info *psi)
}
if (pstore_is_mounted())
- pstore_get_records();
+ pstore_get_records(0);
kmsg_dump_register(&pstore_dumper);
+ pstore_timer.expires = jiffies + PSTORE_INTERVAL;
+ add_timer(&pstore_timer);
+
return 0;
}
EXPORT_SYMBOL_GPL(pstore_register);
/*
- * Read all the records from the persistent store. Create and
- * file files in our filesystem.
+ * Read all the records from the persistent store. Create
+ * files in our filesystem. Don't warn about -EEXIST errors
+ * when we are re-scanning the backing store looking to add new
+ * error records.
*/
-void pstore_get_records(void)
+void pstore_get_records(int quiet)
{
struct pstore_info *psi = psinfo;
+ char *buf = NULL;
ssize_t size;
u64 id;
enum pstore_type_id type;
@@ -172,32 +206,52 @@ void pstore_get_records(void)
if (!psi)
return;
- mutex_lock(&psinfo->buf_mutex);
+ mutex_lock(&psi->read_mutex);
rc = psi->open(psi);
if (rc)
goto out;
- while ((size = psi->read(&id, &type, &time, psi)) > 0) {
- if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size,
- time, psi))
+ while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) {
+ rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size,
+ time, psi);
+ kfree(buf);
+ buf = NULL;
+ if (rc && (rc != -EEXIST || !quiet))
failed++;
}
psi->close(psi);
out:
- mutex_unlock(&psinfo->buf_mutex);
+ mutex_unlock(&psi->read_mutex);
if (failed)
printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
failed, psi->name);
}
+static void pstore_dowork(struct work_struct *work)
+{
+ pstore_get_records(1);
+}
+
+static void pstore_timefunc(unsigned long dummy)
+{
+ if (pstore_new_entry) {
+ pstore_new_entry = 0;
+ schedule_work(&pstore_work);
+ }
+
+ mod_timer(&pstore_timer, jiffies + PSTORE_INTERVAL);
+}
+
/*
* Call platform driver to write a record to the
* persistent store.
*/
int pstore_write(enum pstore_type_id type, char *buf, size_t size)
{
- u64 id;
+ u64 id;
+ int ret;
+ unsigned long flags;
if (!psinfo)
return -ENODEV;
@@ -205,13 +259,13 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size)
if (size > psinfo->bufsize)
return -EFBIG;
- mutex_lock(&psinfo->buf_mutex);
+ spin_lock_irqsave(&psinfo->buf_lock, flags);
memcpy(psinfo->buf, buf, size);
- id = psinfo->write(type, 0, size, psinfo);
- if (pstore_is_mounted())
+ ret = psinfo->write(type, &id, 0, size, psinfo);
+ if (ret == 0 && pstore_is_mounted())
pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf,
size, CURRENT_TIME, psinfo);
- mutex_unlock(&psinfo->buf_mutex);
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
return 0;
}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2b0646613f5..3bdd2141843 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -379,7 +379,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
inode->i_mode = le16_to_cpu(raw_inode->di_mode);
inode->i_uid = (uid_t)le16_to_cpu(raw_inode->di_uid);
inode->i_gid = (gid_t)le16_to_cpu(raw_inode->di_gid);
- inode->i_nlink = le16_to_cpu(raw_inode->di_nlink);
+ set_nlink(inode, le16_to_cpu(raw_inode->di_nlink));
inode->i_size = le32_to_cpu(raw_inode->di_size);
inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime);
inode->i_mtime.tv_nsec = 0;
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 10b6be3ca28..35f4b0ecdeb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
/* caller already holds s_umount */
if (sb->s_flags & MS_RDONLY)
return -EROFS;
- writeback_inodes_sb(sb);
+ writeback_inodes_sb(sb, WB_REASON_SYNC);
return 0;
default:
return -EINVAL;
@@ -363,12 +363,15 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
}
sb = quotactl_block(special);
- if (IS_ERR(sb))
- return PTR_ERR(sb);
+ if (IS_ERR(sb)) {
+ ret = PTR_ERR(sb);
+ goto out;
+ }
ret = do_quotactl(sb, type, cmds, id, addr, pathp);
drop_super(sb);
+out:
if (pathp && !IS_ERR(pathp))
path_put(pathp);
return ret;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index eacb166fb25..462ceb38fec 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -23,7 +23,6 @@
* caches is sufficient.
*/
-#include <linux/module.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
@@ -288,14 +287,7 @@ static int __init init_ramfs_fs(void)
{
return register_filesystem(&ramfs_fs_type);
}
-
-static void __exit exit_ramfs_fs(void)
-{
- unregister_filesystem(&ramfs_fs_type);
-}
-
module_init(init_ramfs_fs)
-module_exit(exit_ramfs_fs)
int __init init_rootfs(void)
{
@@ -311,5 +303,3 @@ int __init init_rootfs(void)
return err;
}
-
-MODULE_LICENSE("GPL");
diff --git a/fs/read_write.c b/fs/read_write.c
index 179f1c33ea5..5ad4248b0cd 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -35,23 +35,45 @@ static inline int unsigned_offsets(struct file *file)
return file->f_mode & FMODE_UNSIGNED_OFFSET;
}
+static loff_t lseek_execute(struct file *file, struct inode *inode,
+ loff_t offset, loff_t maxsize)
+{
+ if (offset < 0 && !unsigned_offsets(file))
+ return -EINVAL;
+ if (offset > maxsize)
+ return -EINVAL;
+
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+ return offset;
+}
+
/**
- * generic_file_llseek_unlocked - lockless generic llseek implementation
+ * generic_file_llseek_size - generic llseek implementation for regular files
* @file: file structure to seek on
* @offset: file offset to seek to
* @origin: type of seek
+ * @size: max size of file system
+ *
+ * This is a variant of generic_file_llseek that allows passing in a custom
+ * file size.
*
- * Updates the file offset to the value specified by @offset and @origin.
- * Locking must be provided by the caller.
+ * Synchronization:
+ * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
+ * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
+ * read/writes behave like SEEK_SET against seeks.
*/
loff_t
-generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
+generic_file_llseek_size(struct file *file, loff_t offset, int origin,
+ loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
switch (origin) {
case SEEK_END:
- offset += inode->i_size;
+ offset += i_size_read(inode);
break;
case SEEK_CUR:
/*
@@ -62,14 +84,22 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
*/
if (offset == 0)
return file->f_pos;
- offset += file->f_pos;
- break;
+ /*
+ * f_lock protects against read/modify/write race with other
+ * SEEK_CURs. Note that parallel writes and reads behave
+ * like SEEK_SET.
+ */
+ spin_lock(&file->f_lock);
+ offset = lseek_execute(file, inode, file->f_pos + offset,
+ maxsize);
+ spin_unlock(&file->f_lock);
+ return offset;
case SEEK_DATA:
/*
* In the generic case the entire file is data, so as long as
* offset isn't at the end of the file then the offset is data.
*/
- if (offset >= inode->i_size)
+ if (offset >= i_size_read(inode))
return -ENXIO;
break;
case SEEK_HOLE:
@@ -77,26 +107,15 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
* There is a virtual hole at the end of the file, so as long as
* offset isn't i_size or larger, return i_size.
*/
- if (offset >= inode->i_size)
+ if (offset >= i_size_read(inode))
return -ENXIO;
- offset = inode->i_size;
+ offset = i_size_read(inode);
break;
}
- if (offset < 0 && !unsigned_offsets(file))
- return -EINVAL;
- if (offset > inode->i_sb->s_maxbytes)
- return -EINVAL;
-
- /* Special lock needed here? */
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
-
- return offset;
+ return lseek_execute(file, inode, offset, maxsize);
}
-EXPORT_SYMBOL(generic_file_llseek_unlocked);
+EXPORT_SYMBOL(generic_file_llseek_size);
/**
* generic_file_llseek - generic llseek implementation for regular files
@@ -110,13 +129,10 @@ EXPORT_SYMBOL(generic_file_llseek_unlocked);
*/
loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
{
- loff_t rval;
-
- mutex_lock(&file->f_dentry->d_inode->i_mutex);
- rval = generic_file_llseek_unlocked(file, offset, origin);
- mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+ struct inode *inode = file->f_mapping->host;
- return rval;
+ return generic_file_llseek_size(file, offset, origin,
+ inode->i_sb->s_maxbytes);
}
EXPORT_SYMBOL(generic_file_llseek);
@@ -617,7 +633,8 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
unsigned long nr_segs, unsigned long fast_segs,
struct iovec *fast_pointer,
- struct iovec **ret_pointer)
+ struct iovec **ret_pointer,
+ int check_access)
{
unsigned long seg;
ssize_t ret;
@@ -673,7 +690,8 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
ret = -EINVAL;
goto out;
}
- if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
+ if (check_access
+ && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
ret = -EFAULT;
goto out;
}
@@ -705,7 +723,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
}
ret = rw_copy_check_uvector(type, uvector, nr_segs,
- ARRAY_SIZE(iovstack), iovstack, &iov);
+ ARRAY_SIZE(iovstack), iovstack, &iov, 1);
if (ret <= 0)
goto out;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9b0d4b78b4f..950f13af095 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1154,7 +1154,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
set_inode_item_key_version(inode, KEY_FORMAT_3_5);
set_inode_sd_version(inode, STAT_DATA_V1);
inode->i_mode = sd_v1_mode(sd);
- inode->i_nlink = sd_v1_nlink(sd);
+ set_nlink(inode, sd_v1_nlink(sd));
inode->i_uid = sd_v1_uid(sd);
inode->i_gid = sd_v1_gid(sd);
inode->i_size = sd_v1_size(sd);
@@ -1199,7 +1199,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);
inode->i_mode = sd_v2_mode(sd);
- inode->i_nlink = sd_v2_nlink(sd);
+ set_nlink(inode, sd_v2_nlink(sd));
inode->i_uid = sd_v2_uid(sd);
inode->i_size = sd_v2_size(sd);
inode->i_gid = sd_v2_gid(sd);
@@ -1444,7 +1444,7 @@ void reiserfs_read_locked_inode(struct inode *inode,
/* a stale NFS handle can trigger this without it being an error */
pathrelse(&path_to_sd);
reiserfs_make_bad_inode(inode);
- inode->i_nlink = 0;
+ clear_nlink(inode);
return;
}
@@ -1832,7 +1832,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
#endif
/* fill stat data */
- inode->i_nlink = (S_ISDIR(mode) ? 2 : 1);
+ set_nlink(inode, (S_ISDIR(mode) ? 2 : 1));
/* uid and gid must already be set by the caller for quota init */
@@ -1987,7 +1987,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
make_bad_inode(inode);
out_inserted_sd:
- inode->i_nlink = 0;
+ clear_nlink(inode);
th->t_trans_id = 0; /* so the caller can't use this handle later */
unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
iput(inode);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index a159ba5a35e..eb711060a6f 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -291,14 +291,13 @@ int reiserfs_allocate_list_bitmaps(struct super_block *sb,
for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
jb = jb_array + i;
jb->journal_list = NULL;
- jb->bitmaps = vmalloc(mem);
+ jb->bitmaps = vzalloc(mem);
if (!jb->bitmaps) {
reiserfs_warning(sb, "clm-2000", "unable to "
"allocate bitmaps for journal lists");
failed = 1;
break;
}
- memset(jb->bitmaps, 0, mem);
}
if (failed) {
free_list_bitmaps(sb, jb_array);
@@ -353,11 +352,10 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
if (num_cnodes <= 0) {
return NULL;
}
- head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode));
+ head = vzalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode));
if (!head) {
return NULL;
}
- memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode));
head[0].prev = NULL;
head[0].next = head + 1;
for (i = 1; i < num_cnodes; i++) {
@@ -2685,14 +2683,13 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
* dependency inversion warnings.
*/
reiserfs_write_unlock(sb);
- journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal));
+ journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
if (!journal) {
reiserfs_warning(sb, "journal-1256",
"unable to get memory for journal structure");
reiserfs_write_lock(sb);
return 1;
}
- memset(journal, 0, sizeof(struct reiserfs_journal));
INIT_LIST_HEAD(&journal->j_bitmap_nodes);
INIT_LIST_HEAD(&journal->j_prealloc_list);
INIT_LIST_HEAD(&journal->j_working_list);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ef392324bbf..80058e8ce36 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -19,7 +19,7 @@
#include <linux/reiserfs_xattr.h>
#include <linux/quotaops.h>
-#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
+#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); }
#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
// directory item contains array of entry headers. This performs
@@ -622,7 +622,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
dentry->d_name.len, inode, 1 /*visible */ );
if (retval) {
int err;
- inode->i_nlink--;
+ drop_nlink(inode);
reiserfs_update_sd(&th, inode);
err = journal_end(&th, dir->i_sb, jbegin_count);
if (err)
@@ -702,7 +702,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
dentry->d_name.len, inode, 1 /*visible */ );
if (retval) {
int err;
- inode->i_nlink--;
+ drop_nlink(inode);
reiserfs_update_sd(&th, inode);
err = journal_end(&th, dir->i_sb, jbegin_count);
if (err)
@@ -787,7 +787,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dentry->d_name.len, inode, 1 /*visible */ );
if (retval) {
int err;
- inode->i_nlink = 0;
+ clear_nlink(inode);
DEC_DIR_INODE_NLINK(dir);
reiserfs_update_sd(&th, inode);
err = journal_end(&th, dir->i_sb, jbegin_count);
@@ -964,7 +964,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
reiserfs_warning(inode->i_sb, "reiserfs-7042",
"deleting nonexistent file (%lu), %d",
inode->i_ino, inode->i_nlink);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
drop_nlink(inode);
@@ -1086,7 +1086,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
dentry->d_name.len, inode, 1 /*visible */ );
if (retval) {
int err;
- inode->i_nlink--;
+ drop_nlink(inode);
reiserfs_update_sd(&th, inode);
err = journal_end(&th, parent_dir->i_sb, jbegin_count);
if (err)
@@ -1129,7 +1129,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
retval = journal_begin(&th, dir->i_sb, jbegin_count);
if (retval) {
- inode->i_nlink--;
+ drop_nlink(inode);
reiserfs_write_unlock(dir->i_sb);
return retval;
}
@@ -1144,7 +1144,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
if (retval) {
int err;
- inode->i_nlink--;
+ drop_nlink(inode);
err = journal_end(&th, dir->i_sb, jbegin_count);
reiserfs_write_unlock(dir->i_sb);
return err ? err : retval;
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index b6b9b1fe33b..7483279b482 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -111,15 +111,13 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
/* allocate additional bitmap blocks, reallocate array of bitmap
* block pointers */
bitmap =
- vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
+ vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
if (!bitmap) {
/* Journal bitmaps are still supersized, but the memory isn't
* leaked, so I guess it's ok */
printk("reiserfs_resize: unable to allocate memory.\n");
return -ENOMEM;
}
- memset(bitmap, 0,
- sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
for (i = 0; i < bmap_nr; i++)
bitmap[i] = old_bitmap[i];
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index ef66c18a933..534668fa41b 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -66,8 +66,8 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
if (IS_PRIVATE(dir))
return 0;
- error = security_inode_init_security(inode, dir, qstr, &sec->name,
- &sec->value, &sec->length);
+ error = security_old_inode_init_security(inode, dir, qstr, &sec->name,
+ &sec->value, &sec->length);
if (error) {
if (error == -EOPNOTSUPP)
error = 0;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 2305e3121cb..8b4089f3040 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -337,7 +337,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
inode->i_dataoffset = pos + inode->i_metasize;
- i->i_nlink = 1; /* Hard to decide.. */
+ set_nlink(i, 1); /* Hard to decide.. */
i->i_size = be32_to_cpu(ri.size);
i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 05d6b0e78c9..dba43c3ea3a 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -449,8 +449,6 @@ EXPORT_SYMBOL(seq_path);
/*
* Same as seq_path, but relative to supplied root.
- *
- * root may be changed, see __d_path().
*/
int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
char *esc)
@@ -463,6 +461,8 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
char *p;
p = __d_path(path, root, buf, size);
+ if (!p)
+ return SEQ_SKIP;
res = PTR_ERR(p);
if (!IS_ERR(p)) {
char *end = mangle_path(buf, p, esc);
@@ -474,7 +474,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
}
seq_commit(m, res);
- return res < 0 ? res : 0;
+ return res < 0 && res != -ENAMETOOLONG ? res : 0;
}
/*
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 1360d4f88f4..c70111ebefd 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -19,9 +19,9 @@ config SQUASHFS
If you want to compile this as a module ( = code which can be
inserted in and removed from the running kernel whenever you want),
- say M here and read <file:Documentation/modules.txt>. The module
- will be called squashfs. Note that the root file system (the one
- containing the directory /) cannot be compiled as a module.
+ say M here. The module will be called squashfs. Note that the root
+ file system (the one containing the directory /) cannot be compiled
+ as a module.
If unsure, say N.
@@ -78,6 +78,28 @@ config SQUASHFS_XZ
If unsure, say N.
+config SQUASHFS_4K_DEVBLK_SIZE
+ bool "Use 4K device block size?"
+ depends on SQUASHFS
+ help
+ By default Squashfs sets the dev block size (sb_min_blocksize)
+ to 1K or the smallest block size supported by the block device
+ (if larger). This, because blocks are packed together and
+ unaligned in Squashfs, should reduce latency.
+
+ This, however, gives poor performance on MTD NAND devices where
+ the optimal I/O size is 4K (even though the devices can support
+ smaller block sizes).
+
+ Using a 4K device block size may also improve overall I/O
+ performance for some file access patterns (e.g. sequential
+ accesses of files in filesystem order) on all media.
+
+ Setting this option will force Squashfs to use a 4K device block
+ size by default.
+
+ If unsure, say N.
+
config SQUASHFS_EMBEDDED
bool "Additional option for memory-constrained systems"
depends on SQUASHFS
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 04bebcaa237..fd7b3b3bda1 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -159,7 +159,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
frag_offset = 0;
}
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_size = le32_to_cpu(sqsh_ino->file_size);
inode->i_fop = &generic_ro_fops;
inode->i_mode |= S_IFREG;
@@ -203,7 +203,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
}
xattr_id = le32_to_cpu(sqsh_ino->xattr);
- inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
inode->i_size = le64_to_cpu(sqsh_ino->file_size);
inode->i_op = &squashfs_inode_ops;
inode->i_fop = &generic_ro_fops;
@@ -232,7 +232,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
if (err < 0)
goto failed_read;
- inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
inode->i_size = le16_to_cpu(sqsh_ino->file_size);
inode->i_op = &squashfs_dir_inode_ops;
inode->i_fop = &squashfs_dir_ops;
@@ -257,7 +257,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
goto failed_read;
xattr_id = le32_to_cpu(sqsh_ino->xattr);
- inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
inode->i_size = le32_to_cpu(sqsh_ino->file_size);
inode->i_op = &squashfs_dir_inode_ops;
inode->i_fop = &squashfs_dir_ops;
@@ -284,7 +284,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
if (err < 0)
goto failed_read;
- inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
inode->i_op = &squashfs_symlink_inode_ops;
inode->i_data.a_ops = &squashfs_symlink_aops;
@@ -325,7 +325,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_mode |= S_IFCHR;
else
inode->i_mode |= S_IFBLK;
- inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
rdev = le32_to_cpu(sqsh_ino->rdev);
init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
@@ -349,7 +349,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_mode |= S_IFBLK;
xattr_id = le32_to_cpu(sqsh_ino->xattr);
inode->i_op = &squashfs_inode_ops;
- inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
rdev = le32_to_cpu(sqsh_ino->rdev);
init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
@@ -370,7 +370,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_mode |= S_IFIFO;
else
inode->i_mode |= S_IFSOCK;
- inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
init_special_inode(inode, inode->i_mode, 0);
break;
}
@@ -389,7 +389,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_mode |= S_IFSOCK;
xattr_id = le32_to_cpu(sqsh_ino->xattr);
inode->i_op = &squashfs_inode_ops;
- inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
init_special_inode(inode, inode->i_mode, 0);
break;
}
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index b4a4e539a08..e8e14645de9 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -36,6 +36,13 @@
#define SQUASHFS_FILE_SIZE 131072
#define SQUASHFS_FILE_LOG 17
+/* default size of block device I/O */
+#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE
+#define SQUASHFS_DEVBLK_SIZE 4096
+#else
+#define SQUASHFS_DEVBLK_SIZE 1024
+#endif
+
#define SQUASHFS_FILE_MAX_SIZE 1048576
#define SQUASHFS_FILE_MAX_LOG 20
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 7438850c62d..2da1715452a 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -95,7 +95,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
}
msblk = sb->s_fs_info;
- msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE);
+ msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
msblk->devblksize_log2 = ffz(~msblk->devblksize);
mutex_init(&msblk->read_data_mutex);
diff --git a/fs/stack.c b/fs/stack.c
index b4f2ab48a61..9c11519245a 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -71,6 +71,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src)
dest->i_ctime = src->i_ctime;
dest->i_blkbits = src->i_blkbits;
dest->i_flags = src->i_flags;
- dest->i_nlink = src->i_nlink;
+ set_nlink(dest, src->i_nlink);
}
EXPORT_SYMBOL_GPL(fsstack_copy_attr_all);
diff --git a/fs/stat.c b/fs/stat.c
index 78a3aa83c7e..8806b8997d2 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -294,15 +294,16 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
{
struct path path;
int error;
+ int empty = 0;
if (bufsiz <= 0)
return -EINVAL;
- error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path);
+ error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty);
if (!error) {
struct inode *inode = path.dentry->d_inode;
- error = -EINVAL;
+ error = empty ? -ENOENT : -EINVAL;
if (inode->i_op->readlink) {
error = security_inode_readlink(path.dentry);
if (!error) {
diff --git a/fs/statfs.c b/fs/statfs.c
index 8244924dec5..9cf04a11896 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -76,7 +76,7 @@ EXPORT_SYMBOL(vfs_statfs);
int user_statfs(const char __user *pathname, struct kstatfs *st)
{
struct path path;
- int error = user_path(pathname, &path);
+ int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
if (!error) {
error = vfs_statfs(&path, st);
path_put(&path);
diff --git a/fs/super.c b/fs/super.c
index 3f56a269a4f..afd0f1ad45e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -61,7 +61,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
return -1;
if (!grab_super_passive(sb))
- return -1;
+ return !sc->nr_to_scan ? 0 : -1;
if (sb->s_op && sb->s_op->nr_cached_objects)
fs_objects = sb->s_op->nr_cached_objects(sb);
@@ -727,8 +727,13 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
if (sb->s_op->remount_fs) {
retval = sb->s_op->remount_fs(sb, &flags, data);
- if (retval)
- return retval;
+ if (retval) {
+ if (!force)
+ return retval;
+ /* If forced remount, go ahead despite any errors */
+ WARN(1, "forced remount of a %s fs returned %i\n",
+ sb->s_type->name, retval);
+ }
}
sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
diff --git a/fs/sync.c b/fs/sync.c
index c98a7477edf..101b8ef901d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
if (wait)
sync_inodes_sb(sb);
else
- writeback_inodes_sb(sb);
+ writeback_inodes_sb(sb, WB_REASON_SYNC);
if (sb->s_op->sync_fs)
sb->s_op->sync_fs(sb, wait);
@@ -98,7 +98,7 @@ static void sync_filesystems(int wait)
*/
SYSCALL_DEFINE0(sync)
{
- wakeup_flusher_threads(0);
+ wakeup_flusher_threads(0, WB_REASON_SYNC);
sync_filesystems(0);
sync_filesystems(1);
if (unlikely(laptop_mode))
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index ea9120a830d..7fdf6a7b743 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,20 +43,48 @@ static DEFINE_IDA(sysfs_ino_ida);
static void sysfs_link_sibling(struct sysfs_dirent *sd)
{
struct sysfs_dirent *parent_sd = sd->s_parent;
- struct sysfs_dirent **pos;
- BUG_ON(sd->s_sibling);
-
- /* Store directory entries in order by ino. This allows
- * readdir to properly restart without having to add a
- * cursor into the s_dir.children list.
- */
- for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) {
- if (sd->s_ino < (*pos)->s_ino)
- break;
+ struct rb_node **p;
+ struct rb_node *parent;
+
+ if (sysfs_type(sd) == SYSFS_DIR)
+ parent_sd->s_dir.subdirs++;
+
+ p = &parent_sd->s_dir.inode_tree.rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+#define node rb_entry(parent, struct sysfs_dirent, inode_node)
+ if (sd->s_ino < node->s_ino) {
+ p = &node->inode_node.rb_left;
+ } else if (sd->s_ino > node->s_ino) {
+ p = &node->inode_node.rb_right;
+ } else {
+ printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n",
+ (unsigned long) sd->s_ino);
+ BUG();
+ }
+#undef node
}
- sd->s_sibling = *pos;
- *pos = sd;
+ rb_link_node(&sd->inode_node, parent, p);
+ rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree);
+
+ p = &parent_sd->s_dir.name_tree.rb_node;
+ parent = NULL;
+ while (*p) {
+ int c;
+ parent = *p;
+#define node rb_entry(parent, struct sysfs_dirent, name_node)
+ c = strcmp(sd->s_name, node->s_name);
+ if (c < 0) {
+ p = &node->name_node.rb_left;
+ } else {
+ p = &node->name_node.rb_right;
+ }
+#undef node
+ }
+ rb_link_node(&sd->name_node, parent, p);
+ rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree);
}
/**
@@ -71,16 +99,11 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd)
*/
static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
{
- struct sysfs_dirent **pos;
+ if (sysfs_type(sd) == SYSFS_DIR)
+ sd->s_parent->s_dir.subdirs--;
- for (pos = &sd->s_parent->s_dir.children; *pos;
- pos = &(*pos)->s_sibling) {
- if (*pos == sd) {
- *pos = sd->s_sibling;
- sd->s_sibling = NULL;
- break;
- }
- }
+ rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree);
+ rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree);
}
/**
@@ -126,7 +149,6 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
*/
void sysfs_put_active(struct sysfs_dirent *sd)
{
- struct completion *cmpl;
int v;
if (unlikely(!sd))
@@ -138,10 +160,9 @@ void sysfs_put_active(struct sysfs_dirent *sd)
return;
/* atomic_dec_return() is a mb(), we'll always see the updated
- * sd->s_sibling.
+ * sd->u.completion.
*/
- cmpl = (void *)sd->s_sibling;
- complete(cmpl);
+ complete(sd->u.completion);
}
/**
@@ -155,16 +176,16 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
DECLARE_COMPLETION_ONSTACK(wait);
int v;
- BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
+ BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));
if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
return;
- sd->s_sibling = (void *)&wait;
+ sd->u.completion = (void *)&wait;
rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
/* atomic_add_return() is a mb(), put_active() will always see
- * the updated sd->s_sibling.
+ * the updated sd->u.completion.
*/
v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
@@ -173,8 +194,6 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
wait_for_completion(&wait);
}
- sd->s_sibling = NULL;
-
lock_acquired(&sd->dep_map, _RET_IP_);
rwsem_release(&sd->dep_map, 1, _RET_IP_);
}
@@ -384,6 +403,13 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
{
struct sysfs_inode_attrs *ps_iattr;
+ if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) {
+ WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
+ sysfs_ns_type(acxt->parent_sd)? "required": "invalid",
+ acxt->parent_sd->s_name, sd->s_name);
+ return -EINVAL;
+ }
+
if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
return -EEXIST;
@@ -490,7 +516,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
}
sd->s_flags |= SYSFS_FLAG_REMOVED;
- sd->s_sibling = acxt->removed;
+ sd->u.removed_list = acxt->removed;
acxt->removed = sd;
}
@@ -514,8 +540,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
while (acxt->removed) {
struct sysfs_dirent *sd = acxt->removed;
- acxt->removed = sd->s_sibling;
- sd->s_sibling = NULL;
+ acxt->removed = sd->u.removed_list;
sysfs_deactivate(sd);
unmap_bin_file(sd);
@@ -540,15 +565,43 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
const void *ns,
const unsigned char *name)
{
- struct sysfs_dirent *sd;
+ struct rb_node *p = parent_sd->s_dir.name_tree.rb_node;
+ struct sysfs_dirent *found = NULL;
- for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
- if (ns && sd->s_ns && (sd->s_ns != ns))
- continue;
- if (!strcmp(sd->s_name, name))
- return sd;
+ if (!!sysfs_ns_type(parent_sd) != !!ns) {
+ WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
+ sysfs_ns_type(parent_sd)? "required": "invalid",
+ parent_sd->s_name, name);
+ return NULL;
}
- return NULL;
+
+ while (p) {
+ int c;
+#define node rb_entry(p, struct sysfs_dirent, name_node)
+ c = strcmp(name, node->s_name);
+ if (c < 0) {
+ p = node->name_node.rb_left;
+ } else if (c > 0) {
+ p = node->name_node.rb_right;
+ } else {
+ found = node;
+ p = node->name_node.rb_left;
+ }
+#undef node
+ }
+
+ if (found) {
+ while (found->s_ns != ns) {
+ p = rb_next(&found->name_node);
+ if (!p)
+ return NULL;
+ found = rb_entry(p, struct sysfs_dirent, name_node);
+ if (strcmp(name, found->s_name))
+ return NULL;
+ }
+ }
+
+ return found;
}
/**
@@ -744,21 +797,19 @@ void sysfs_remove_subdir(struct sysfs_dirent *sd)
static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
{
struct sysfs_addrm_cxt acxt;
- struct sysfs_dirent **pos;
+ struct rb_node *pos;
if (!dir_sd)
return;
pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
sysfs_addrm_start(&acxt, dir_sd);
- pos = &dir_sd->s_dir.children;
- while (*pos) {
- struct sysfs_dirent *sd = *pos;
-
+ pos = rb_first(&dir_sd->s_dir.inode_tree);
+ while (pos) {
+ struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node);
+ pos = rb_next(pos);
if (sysfs_type(sd) != SYSFS_DIR)
sysfs_remove_one(&acxt, sd);
- else
- pos = &(*pos)->s_sibling;
}
sysfs_addrm_finish(&acxt);
@@ -814,15 +865,13 @@ int sysfs_rename(struct sysfs_dirent *sd,
sd->s_name = new_name;
}
- /* Remove from old parent's list and insert into new parent's list. */
- if (sd->s_parent != new_parent_sd) {
- sysfs_unlink_sibling(sd);
- sysfs_get(new_parent_sd);
- sysfs_put(sd->s_parent);
- sd->s_parent = new_parent_sd;
- sysfs_link_sibling(sd);
- }
+ /* Move to the appropriate place in the appropriate directories rbtree. */
+ sysfs_unlink_sibling(sd);
+ sysfs_get(new_parent_sd);
+ sysfs_put(sd->s_parent);
sd->s_ns = new_ns;
+ sd->s_parent = new_parent_sd;
+ sysfs_link_sibling(sd);
error = 0;
out:
@@ -881,12 +930,28 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
pos = NULL;
}
if (!pos && (ino > 1) && (ino < INT_MAX)) {
- pos = parent_sd->s_dir.children;
- while (pos && (ino > pos->s_ino))
- pos = pos->s_sibling;
+ struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node;
+ while (p) {
+#define node rb_entry(p, struct sysfs_dirent, inode_node)
+ if (ino < node->s_ino) {
+ pos = node;
+ p = node->inode_node.rb_left;
+ } else if (ino > node->s_ino) {
+ p = node->inode_node.rb_right;
+ } else {
+ pos = node;
+ break;
+ }
+#undef node
+ }
+ }
+ while (pos && pos->s_ns != ns) {
+ struct rb_node *p = rb_next(&pos->inode_node);
+ if (!p)
+ pos = NULL;
+ else
+ pos = rb_entry(p, struct sysfs_dirent, inode_node);
}
- while (pos && pos->s_ns && pos->s_ns != ns)
- pos = pos->s_sibling;
return pos;
}
@@ -894,10 +959,13 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
{
pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
- if (pos)
- pos = pos->s_sibling;
- while (pos && pos->s_ns && pos->s_ns != ns)
- pos = pos->s_sibling;
+ if (pos) do {
+ struct rb_node *p = rb_next(&pos->inode_node);
+ if (!p)
+ pos = NULL;
+ else
+ pos = rb_entry(p, struct sysfs_dirent, inode_node);
+ } while (pos && pos->s_ns != ns);
return pos;
}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1ad8c93c1b8..d4e6080b4b2 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -466,9 +466,6 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
mutex_lock(&sysfs_mutex);
if (sd && dir)
- /* Only directories are tagged, so no need to pass
- * a tag explicitly.
- */
sd = sysfs_find_dirent(sd, NULL, dir);
if (sd && attr)
sd = sysfs_find_dirent(sd, NULL, attr);
@@ -488,17 +485,56 @@ const struct file_operations sysfs_file_operations = {
.poll = sysfs_poll,
};
+int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
+ const void **pns)
+{
+ struct sysfs_dirent *dir_sd = kobj->sd;
+ const struct sysfs_ops *ops;
+ const void *ns = NULL;
+ int err;
+
+ err = 0;
+ if (!sysfs_ns_type(dir_sd))
+ goto out;
+
+ err = -EINVAL;
+ if (!kobj->ktype)
+ goto out;
+ ops = kobj->ktype->sysfs_ops;
+ if (!ops)
+ goto out;
+ if (!ops->namespace)
+ goto out;
+
+ err = 0;
+ ns = ops->namespace(kobj, attr);
+out:
+ if (err) {
+ WARN(1, KERN_ERR "missing sysfs namespace attribute operation for "
+ "kobject: %s\n", kobject_name(kobj));
+ }
+ *pns = ns;
+ return err;
+}
+
int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
const struct attribute *attr, int type, mode_t amode)
{
umode_t mode = (amode & S_IALLUGO) | S_IFREG;
struct sysfs_addrm_cxt acxt;
struct sysfs_dirent *sd;
+ const void *ns;
int rc;
+ rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns);
+ if (rc)
+ return rc;
+
sd = sysfs_new_dirent(attr->name, mode, type);
if (!sd)
return -ENOMEM;
+
+ sd->s_ns = ns;
sd->s_attr.attr = (void *)attr;
sysfs_dirent_init_lockdep(sd);
@@ -586,12 +622,17 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
{
struct sysfs_dirent *sd;
struct iattr newattrs;
+ const void *ns;
int rc;
+ rc = sysfs_attr_ns(kobj, attr, &ns);
+ if (rc)
+ return rc;
+
mutex_lock(&sysfs_mutex);
rc = -ENOENT;
- sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
+ sd = sysfs_find_dirent(kobj->sd, ns, attr->name);
if (!sd)
goto out;
@@ -616,7 +657,12 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
{
- sysfs_hash_and_remove(kobj->sd, NULL, attr->name);
+ const void *ns;
+
+ if (sysfs_attr_ns(kobj, attr, &ns))
+ return;
+
+ sysfs_hash_and_remove(kobj->sd, ns, attr->name);
}
void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e3f091a81c7..c81b22f3ace 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -202,18 +202,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
inode->i_ctime = iattr->ia_ctime;
}
-static int sysfs_count_nlink(struct sysfs_dirent *sd)
-{
- struct sysfs_dirent *child;
- int nr = 0;
-
- for (child = sd->s_dir.children; child; child = child->s_sibling)
- if (sysfs_type(child) == SYSFS_DIR)
- nr++;
-
- return nr + 2;
-}
-
static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
{
struct sysfs_inode_attrs *iattrs = sd->s_iattr;
@@ -230,7 +218,7 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
}
if (sysfs_type(sd) == SYSFS_DIR)
- inode->i_nlink = sysfs_count_nlink(sd);
+ set_nlink(inode, sd->s_dir.subdirs + 2);
}
int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -336,8 +324,6 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
sysfs_addrm_start(&acxt, dir_sd);
sd = sysfs_find_dirent(dir_sd, ns, name);
- if (sd && (sd->s_ns != ns))
- sd = NULL;
if (sd)
sysfs_remove_one(&acxt, sd);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 845ab3ad229..ce29e28b766 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -11,14 +11,18 @@
#include <linux/lockdep.h>
#include <linux/kobject_ns.h>
#include <linux/fs.h>
+#include <linux/rbtree.h>
struct sysfs_open_dirent;
/* type-specific structures for sysfs_dirent->s_* union members */
struct sysfs_elem_dir {
struct kobject *kobj;
- /* children list starts here and goes through sd->s_sibling */
- struct sysfs_dirent *children;
+
+ unsigned long subdirs;
+
+ struct rb_root inode_tree;
+ struct rb_root name_tree;
};
struct sysfs_elem_symlink {
@@ -56,9 +60,16 @@ struct sysfs_dirent {
struct lockdep_map dep_map;
#endif
struct sysfs_dirent *s_parent;
- struct sysfs_dirent *s_sibling;
const char *s_name;
+ struct rb_node inode_node;
+ struct rb_node name_node;
+
+ union {
+ struct completion *completion;
+ struct sysfs_dirent *removed_list;
+ } u;
+
const void *s_ns; /* namespace tag */
union {
struct sysfs_elem_dir s_dir;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 0630eb969a2..25ffb3e9a3f 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -219,7 +219,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode);
inode->i_uid = (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid);
inode->i_gid = (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid);
- inode->i_nlink = fs16_to_cpu(sbi, raw_inode->i_nlink);
+ set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink));
inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime);
inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime);
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 315de66e52b..bc4f94b2870 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -63,7 +63,7 @@
static void shrink_liability(struct ubifs_info *c, int nr_to_write)
{
down_read(&c->vfs_sb->s_umount);
- writeback_inodes_sb(c->vfs_sb);
+ writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE);
up_read(&c->vfs_sb->s_umount);
}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index eef109a1a92..b09ba2dd8b6 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -870,6 +870,22 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
spin_unlock(&dbg_lock);
}
+void dbg_dump_sleb(const struct ubifs_info *c,
+ const struct ubifs_scan_leb *sleb, int offs)
+{
+ struct ubifs_scan_node *snod;
+
+ printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n",
+ current->pid, sleb->lnum, offs);
+
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ cond_resched();
+ printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum,
+ snod->offs, snod->len);
+ dbg_dump_node(c, snod->node);
+ }
+}
+
void dbg_dump_leb(const struct ubifs_info *c, int lnum)
{
struct ubifs_scan_leb *sleb;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index feb361e252a..8d9c4681018 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -269,6 +269,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
void dbg_dump_lprops(struct ubifs_info *c);
void dbg_dump_lpt_info(struct ubifs_info *c);
void dbg_dump_leb(const struct ubifs_info *c, int lnum);
+void dbg_dump_sleb(const struct ubifs_info *c,
+ const struct ubifs_scan_leb *sleb, int offs);
void dbg_dump_znode(const struct ubifs_info *c,
const struct ubifs_znode *znode);
void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat);
@@ -387,6 +389,9 @@ static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; }
static inline void dbg_dump_leb(const struct ubifs_info *c,
int lnum) { return; }
static inline void
+dbg_dump_sleb(const struct ubifs_info *c,
+ const struct ubifs_scan_leb *sleb, int offs) { return; }
+static inline void
dbg_dump_znode(const struct ubifs_info *c,
const struct ubifs_znode *znode) { return; }
static inline void dbg_dump_heap(struct ubifs_info *c,
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index af02790d932..ee4f43f4bb9 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -983,7 +983,7 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf)
}
/**
- * clean_an_unclean_leb - read and write a LEB to remove corruption.
+ * clean_an_unclean_leb - read and write a LEB to remove corruption.
* @c: UBIFS file-system description object
* @ucleb: unclean LEB information
* @sbuf: LEB-sized buffer to use
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 93d938ad3d2..6094c5a5d7a 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c)
mst->total_dirty = cpu_to_le64(tmp64);
/* The indexing LEB does not contribute to dark space */
- tmp64 = (c->main_lebs - 1) * c->dark_wm;
+ tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm);
mst->total_dark = cpu_to_le64(tmp64);
mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b28121278d4..ae0e76bb6eb 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -129,7 +129,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
goto out_ino;
inode->i_flags |= (S_NOCMTIME | S_NOATIME);
- inode->i_nlink = le32_to_cpu(ino->nlink);
+ set_nlink(inode, le32_to_cpu(ino->nlink));
inode->i_uid = le32_to_cpu(ino->uid);
inode->i_gid = le32_to_cpu(ino->gid);
inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec);
@@ -2264,19 +2264,12 @@ static int __init ubifs_init(void)
return -EINVAL;
}
- err = register_filesystem(&ubifs_fs_type);
- if (err) {
- ubifs_err("cannot register file system, error %d", err);
- return err;
- }
-
- err = -ENOMEM;
ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
sizeof(struct ubifs_inode), 0,
SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
&inode_slab_ctor);
if (!ubifs_inode_slab)
- goto out_reg;
+ return -ENOMEM;
register_shrinker(&ubifs_shrinker_info);
@@ -2288,15 +2281,20 @@ static int __init ubifs_init(void)
if (err)
goto out_compr;
+ err = register_filesystem(&ubifs_fs_type);
+ if (err) {
+ ubifs_err("cannot register file system, error %d", err);
+ goto out_dbg;
+ }
return 0;
+out_dbg:
+ dbg_debugfs_exit();
out_compr:
ubifs_compressors_exit();
out_shrinker:
unregister_shrinker(&ubifs_shrinker_info);
kmem_cache_destroy(ubifs_inode_slab);
-out_reg:
- unregister_filesystem(&ubifs_fs_type);
return err;
}
/* late_initcall to let compressors initialize first */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 16f19f55e63..bf18f7a0454 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -558,10 +558,10 @@ int ubifs_removexattr(struct dentry *dentry, const char *name)
}
ubifs_assert(inode->i_nlink == 1);
- inode->i_nlink = 0;
+ clear_nlink(inode);
err = remove_xattr(c, host, inode, &nm);
if (err)
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
/* If @i_nlink is 0, 'iput()' will delete the inode */
iput(inode);
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 95518a9f589..987585bb0a1 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -59,8 +59,8 @@ static int __load_block_bitmap(struct super_block *sb,
int nr_groups = bitmap->s_nr_groups;
if (block_group >= nr_groups) {
- udf_debug("block_group (%d) > nr_groups (%d)\n", block_group,
- nr_groups);
+ udf_debug("block_group (%d) > nr_groups (%d)\n",
+ block_group, nr_groups);
}
if (bitmap->s_block_bitmap[block_group]) {
@@ -126,8 +126,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
if (bloc->logicalBlockNum + count < count ||
(bloc->logicalBlockNum + count) > partmap->s_partition_len) {
udf_debug("%d < %d || %d + %d > %d\n",
- bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
- count, partmap->s_partition_len);
+ bloc->logicalBlockNum, 0,
+ bloc->logicalBlockNum, count,
+ partmap->s_partition_len);
goto error_return;
}
@@ -155,7 +156,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
if (udf_set_bit(bit + i, bh->b_data)) {
udf_debug("bit %ld already set\n", bit + i);
udf_debug("byte=%2x\n",
- ((char *)bh->b_data)[(bit + i) >> 3]);
+ ((char *)bh->b_data)[(bit + i) >> 3]);
}
}
udf_add_free_space(sb, sbi->s_partition, count);
@@ -369,7 +370,8 @@ static void udf_table_free_blocks(struct super_block *sb,
if (bloc->logicalBlockNum + count < count ||
(bloc->logicalBlockNum + count) > partmap->s_partition_len) {
udf_debug("%d < %d || %d + %d > %d\n",
- bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
+ bloc->logicalBlockNum, 0,
+ bloc->logicalBlockNum, count,
partmap->s_partition_len);
goto error_return;
}
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 2ffdb6733af..3e44f575fb9 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -162,8 +162,8 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
int padlen;
if ((!buffer) || (!offset)) {
- udf_debug("invalidparms\n, buffer=%p, offset=%p\n", buffer,
- offset);
+ udf_debug("invalidparms, buffer=%p, offset=%p\n",
+ buffer, offset);
return NULL;
}
@@ -201,7 +201,7 @@ struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offs
struct short_ad *sa;
if ((!ptr) || (!offset)) {
- printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n");
+ pr_err("%s: invalidparms\n", __func__);
return NULL;
}
@@ -223,7 +223,7 @@ struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset
struct long_ad *la;
if ((!ptr) || (!offset)) {
- printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n");
+ pr_err("%s: invalidparms\n", __func__);
return NULL;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 1d1358ed80c..4fd1d809738 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -37,6 +37,7 @@
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/crc-itu-t.h>
+#include <linux/mpage.h>
#include "udf_i.h"
#include "udf_sb.h"
@@ -83,12 +84,10 @@ void udf_evict_inode(struct inode *inode)
end_writeback(inode);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
inode->i_size != iinfo->i_lenExtents) {
- printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
- "inode size %llu different from extent length %llu. "
- "Filesystem need not be standards compliant.\n",
- inode->i_sb->s_id, inode->i_ino, inode->i_mode,
- (unsigned long long)inode->i_size,
- (unsigned long long)iinfo->i_lenExtents);
+ udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n",
+ inode->i_ino, inode->i_mode,
+ (unsigned long long)inode->i_size,
+ (unsigned long long)iinfo->i_lenExtents);
}
kfree(iinfo->i_ext.i_data);
iinfo->i_ext.i_data = NULL;
@@ -104,7 +103,13 @@ static int udf_writepage(struct page *page, struct writeback_control *wbc)
static int udf_readpage(struct file *file, struct page *page)
{
- return block_read_full_page(page, udf_get_block);
+ return mpage_readpage(page, udf_get_block);
+}
+
+static int udf_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, udf_get_block);
}
static int udf_write_begin(struct file *file, struct address_space *mapping,
@@ -139,6 +144,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations udf_aops = {
.readpage = udf_readpage,
+ .readpages = udf_readpages,
.writepage = udf_writepage,
.write_begin = udf_write_begin,
.write_end = generic_write_end,
@@ -1169,16 +1175,15 @@ static void __udf_read_inode(struct inode *inode)
*/
bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
if (!bh) {
- printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n",
- inode->i_ino);
+ udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino);
make_bad_inode(inode);
return;
}
if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE &&
ident != TAG_IDENT_USE) {
- printk(KERN_ERR "udf: udf_read_inode(ino %ld) "
- "failed ident=%d\n", inode->i_ino, ident);
+ udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n",
+ inode->i_ino, ident);
brelse(bh);
make_bad_inode(inode);
return;
@@ -1218,8 +1223,8 @@ static void __udf_read_inode(struct inode *inode)
}
brelse(ibh);
} else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
- printk(KERN_ERR "udf: unsupported strategy type: %d\n",
- le16_to_cpu(fe->icbTag.strategyType));
+ udf_err(inode->i_sb, "unsupported strategy type: %d\n",
+ le16_to_cpu(fe->icbTag.strategyType));
brelse(bh);
make_bad_inode(inode);
return;
@@ -1236,6 +1241,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
int offset;
struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
struct udf_inode_info *iinfo = UDF_I(inode);
+ unsigned int link_count;
fe = (struct fileEntry *)bh->b_data;
efe = (struct extendedFileEntry *)bh->b_data;
@@ -1318,9 +1324,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
inode->i_mode &= ~sbi->s_umask;
read_unlock(&sbi->s_cred_lock);
- inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
- if (!inode->i_nlink)
- inode->i_nlink = 1;
+ link_count = le16_to_cpu(fe->fileLinkCount);
+ if (!link_count)
+ link_count = 1;
+ set_nlink(inode, link_count);
inode->i_size = le64_to_cpu(fe->informationLength);
iinfo->i_lenExtents = inode->i_size;
@@ -1413,9 +1420,8 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
udf_debug("METADATA BITMAP FILE-----\n");
break;
default:
- printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown "
- "file type=%d\n", inode->i_ino,
- fe->icbTag.fileType);
+ udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n",
+ inode->i_ino, fe->icbTag.fileType);
make_bad_inode(inode);
return;
}
@@ -1438,8 +1444,8 @@ static int udf_alloc_i_data(struct inode *inode, size_t size)
iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL);
if (!iinfo->i_ext.i_data) {
- printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) "
- "no free memory\n", inode->i_ino);
+ udf_err(inode->i_sb, "(ino %ld) no free memory\n",
+ inode->i_ino);
return -ENOMEM;
}
@@ -1689,9 +1695,8 @@ out:
if (do_sync) {
sync_dirty_buffer(bh);
if (buffer_write_io_error(bh)) {
- printk(KERN_WARNING "IO error syncing udf inode "
- "[%s:%08lx]\n", inode->i_sb->s_id,
- inode->i_ino);
+ udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n",
+ inode->i_ino);
err = -EIO;
}
}
@@ -1982,8 +1987,7 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
*elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK;
break;
default:
- udf_debug("alloc_type = %d unsupported\n",
- iinfo->i_alloc_type);
+ udf_debug("alloc_type = %d unsupported\n", iinfo->i_alloc_type);
return -1;
}
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 43e24a3b8e1..6583fe9b064 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -38,7 +38,7 @@ unsigned int udf_get_last_session(struct super_block *sb)
if (i == 0) {
udf_debug("XA disk: %s, vol_desc_start=%d\n",
- (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba);
+ ms_info.xa_flag ? "yes" : "no", ms_info.addr.lba);
if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
vol_desc_start = ms_info.addr.lba;
} else {
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 9215700c00a..c175b4dabc1 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -204,6 +204,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
{
struct tag *tag_p;
struct buffer_head *bh = NULL;
+ u8 checksum;
/* Read the block */
if (block == 0xFFFFFFFF)
@@ -211,8 +212,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
bh = udf_tread(sb, block);
if (!bh) {
- udf_debug("block=%d, location=%d: read failed\n",
- block, location);
+ udf_err(sb, "read failed, block=%u, location=%d\n",
+ block, location);
return NULL;
}
@@ -227,16 +228,18 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
}
/* Verify the tag checksum */
- if (udf_tag_checksum(tag_p) != tag_p->tagChecksum) {
- printk(KERN_ERR "udf: tag checksum failed block %d\n", block);
+ checksum = udf_tag_checksum(tag_p);
+ if (checksum != tag_p->tagChecksum) {
+ udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n",
+ block, checksum, tag_p->tagChecksum);
goto error_out;
}
/* Verify the tag version */
if (tag_p->descVersion != cpu_to_le16(0x0002U) &&
tag_p->descVersion != cpu_to_le16(0x0003U)) {
- udf_debug("tag version 0x%04x != 0x0002 || 0x0003 block %d\n",
- le16_to_cpu(tag_p->descVersion), block);
+ udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n",
+ le16_to_cpu(tag_p->descVersion), block);
goto error_out;
}
@@ -248,8 +251,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
return bh;
udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block,
- le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength));
-
+ le16_to_cpu(tag_p->descCRC),
+ le16_to_cpu(tag_p->descCRCLength));
error_out:
brelse(bh);
return NULL;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f1dce848ef9..4639e137222 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -577,8 +577,7 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
if (!fi) {
- inode->i_nlink--;
- mark_inode_dirty(inode);
+ inode_dec_link_count(inode);
iput(inode);
return err;
}
@@ -618,8 +617,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
init_special_inode(inode, mode, rdev);
fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
if (!fi) {
- inode->i_nlink--;
- mark_inode_dirty(inode);
+ inode_dec_link_count(inode);
iput(inode);
return err;
}
@@ -665,12 +663,11 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
inode->i_fop = &udf_dir_operations;
fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err);
if (!fi) {
- inode->i_nlink--;
- mark_inode_dirty(inode);
+ inode_dec_link_count(inode);
iput(inode);
goto out;
}
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location);
*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
@@ -683,7 +680,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
if (!fi) {
- inode->i_nlink = 0;
+ clear_nlink(inode);
mark_inode_dirty(inode);
iput(inode);
goto out;
@@ -799,9 +796,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
if (retval)
goto end_rmdir;
if (inode->i_nlink != 2)
- udf_warning(inode->i_sb, "udf_rmdir",
- "empty directory has nlink != 2 (%d)",
- inode->i_nlink);
+ udf_warn(inode->i_sb, "empty directory has nlink != 2 (%d)\n",
+ inode->i_nlink);
clear_nlink(inode);
inode->i_size = 0;
inode_dec_link_count(dir);
@@ -840,7 +836,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
if (!inode->i_nlink) {
udf_debug("Deleting nonexistent file (%lu), %d\n",
inode->i_ino, inode->i_nlink);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
retval = udf_delete_entry(dir, fi, &fibh, &cfi);
if (retval)
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index a71090ea0e0..d6caf01a209 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -33,8 +33,8 @@ uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
struct udf_sb_info *sbi = UDF_SB(sb);
struct udf_part_map *map;
if (partition >= sbi->s_partitions) {
- udf_debug("block=%d, partition=%d, offset=%d: "
- "invalid partition\n", block, partition, offset);
+ udf_debug("block=%d, partition=%d, offset=%d: invalid partition\n",
+ block, partition, offset);
return 0xFFFFFFFF;
}
map = &sbi->s_partmaps[partition];
@@ -60,8 +60,8 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
vdata = &map->s_type_specific.s_virtual;
if (block > vdata->s_num_entries) {
- udf_debug("Trying to access block beyond end of VAT "
- "(%d max %d)\n", block, vdata->s_num_entries);
+ udf_debug("Trying to access block beyond end of VAT (%d max %d)\n",
+ block, vdata->s_num_entries);
return 0xFFFFFFFF;
}
@@ -321,9 +321,14 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block,
/* We shouldn't mount such media... */
BUG_ON(!inode);
retblk = udf_try_read_meta(inode, block, partition, offset);
- if (retblk == 0xFFFFFFFF) {
- udf_warning(sb, __func__, "error reading from METADATA, "
- "trying to read from MIRROR");
+ if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) {
+ udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n");
+ if (!(mdata->s_flags & MF_MIRROR_FE_LOADED)) {
+ mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
+ mdata->s_mirror_file_loc, map->s_partition_num);
+ mdata->s_flags |= MF_MIRROR_FE_LOADED;
+ }
+
inode = mdata->s_mirror_fe;
if (!inode)
return 0xFFFFFFFF;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 7b27b063ff6..e185253470d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -75,8 +75,6 @@
#define UDF_DEFAULT_BLOCKSIZE 2048
-static char error_buf[1024];
-
/* These are the "meat" - everything else is stuffing */
static int udf_fill_super(struct super_block *, void *, int);
static void udf_put_super(struct super_block *);
@@ -92,8 +90,6 @@ static void udf_close_lvid(struct super_block *);
static unsigned int udf_count_free(struct super_block *);
static int udf_statfs(struct dentry *, struct kstatfs *);
static int udf_show_options(struct seq_file *, struct vfsmount *);
-static void udf_error(struct super_block *sb, const char *function,
- const char *fmt, ...);
struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
{
@@ -244,9 +240,8 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count)
sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map),
GFP_KERNEL);
if (!sbi->s_partmaps) {
- udf_error(sb, __func__,
- "Unable to allocate space for %d partition maps",
- count);
+ udf_err(sb, "Unable to allocate space for %d partition maps\n",
+ count);
sbi->s_partitions = 0;
return -ENOMEM;
}
@@ -550,8 +545,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
uopt->dmode = option & 0777;
break;
default:
- printk(KERN_ERR "udf: bad mount option \"%s\" "
- "or missing value\n", p);
+ pr_err("bad mount option \"%s\" or missing value\n", p);
return 0;
}
}
@@ -645,20 +639,16 @@ static loff_t udf_check_vsd(struct super_block *sb)
udf_debug("ISO9660 Boot Record found\n");
break;
case 1:
- udf_debug("ISO9660 Primary Volume Descriptor "
- "found\n");
+ udf_debug("ISO9660 Primary Volume Descriptor found\n");
break;
case 2:
- udf_debug("ISO9660 Supplementary Volume "
- "Descriptor found\n");
+ udf_debug("ISO9660 Supplementary Volume Descriptor found\n");
break;
case 3:
- udf_debug("ISO9660 Volume Partition Descriptor "
- "found\n");
+ udf_debug("ISO9660 Volume Partition Descriptor found\n");
break;
case 255:
- udf_debug("ISO9660 Volume Descriptor Set "
- "Terminator found\n");
+ udf_debug("ISO9660 Volume Descriptor Set Terminator found\n");
break;
default:
udf_debug("ISO9660 VRS (%u) found\n",
@@ -809,8 +799,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
pvoldesc->recordingDateAndTime)) {
#ifdef UDFFS_DEBUG
struct timestamp *ts = &pvoldesc->recordingDateAndTime;
- udf_debug("recording time %04u/%02u/%02u"
- " %02u:%02u (%x)\n",
+ udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n",
le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
ts->minute, le16_to_cpu(ts->typeAndTimezone));
#endif
@@ -821,7 +810,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
outstr->u_len > 31 ? 31 : outstr->u_len);
udf_debug("volIdent[] = '%s'\n",
- UDF_SB(sb)->s_volume_ident);
+ UDF_SB(sb)->s_volume_ident);
}
if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
@@ -837,64 +826,57 @@ out1:
return ret;
}
+struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
+ u32 meta_file_loc, u32 partition_num)
+{
+ struct kernel_lb_addr addr;
+ struct inode *metadata_fe;
+
+ addr.logicalBlockNum = meta_file_loc;
+ addr.partitionReferenceNum = partition_num;
+
+ metadata_fe = udf_iget(sb, &addr);
+
+ if (metadata_fe == NULL)
+ udf_warn(sb, "metadata inode efe not found\n");
+ else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
+ udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n");
+ iput(metadata_fe);
+ metadata_fe = NULL;
+ }
+
+ return metadata_fe;
+}
+
static int udf_load_metadata_files(struct super_block *sb, int partition)
{
struct udf_sb_info *sbi = UDF_SB(sb);
struct udf_part_map *map;
struct udf_meta_data *mdata;
struct kernel_lb_addr addr;
- int fe_error = 0;
map = &sbi->s_partmaps[partition];
mdata = &map->s_type_specific.s_metadata;
/* metadata address */
- addr.logicalBlockNum = mdata->s_meta_file_loc;
- addr.partitionReferenceNum = map->s_partition_num;
-
udf_debug("Metadata file location: block = %d part = %d\n",
- addr.logicalBlockNum, addr.partitionReferenceNum);
+ mdata->s_meta_file_loc, map->s_partition_num);
- mdata->s_metadata_fe = udf_iget(sb, &addr);
+ mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb,
+ mdata->s_meta_file_loc, map->s_partition_num);
if (mdata->s_metadata_fe == NULL) {
- udf_warning(sb, __func__, "metadata inode efe not found, "
- "will try mirror inode.");
- fe_error = 1;
- } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type !=
- ICBTAG_FLAG_AD_SHORT) {
- udf_warning(sb, __func__, "metadata inode efe does not have "
- "short allocation descriptors!");
- fe_error = 1;
- iput(mdata->s_metadata_fe);
- mdata->s_metadata_fe = NULL;
- }
+ /* mirror file entry */
+ udf_debug("Mirror metadata file location: block = %d part = %d\n",
+ mdata->s_mirror_file_loc, map->s_partition_num);
- /* mirror file entry */
- addr.logicalBlockNum = mdata->s_mirror_file_loc;
- addr.partitionReferenceNum = map->s_partition_num;
-
- udf_debug("Mirror metadata file location: block = %d part = %d\n",
- addr.logicalBlockNum, addr.partitionReferenceNum);
+ mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
+ mdata->s_mirror_file_loc, map->s_partition_num);
- mdata->s_mirror_fe = udf_iget(sb, &addr);
-
- if (mdata->s_mirror_fe == NULL) {
- if (fe_error) {
- udf_error(sb, __func__, "mirror inode efe not found "
- "and metadata inode is missing too, exiting...");
- goto error_exit;
- } else
- udf_warning(sb, __func__, "mirror inode efe not found,"
- " but metadata inode is OK");
- } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type !=
- ICBTAG_FLAG_AD_SHORT) {
- udf_warning(sb, __func__, "mirror inode efe does not have "
- "short allocation descriptors!");
- iput(mdata->s_mirror_fe);
- mdata->s_mirror_fe = NULL;
- if (fe_error)
+ if (mdata->s_mirror_fe == NULL) {
+ udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
goto error_exit;
+ }
}
/*
@@ -907,18 +889,15 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
addr.partitionReferenceNum = map->s_partition_num;
udf_debug("Bitmap file location: block = %d part = %d\n",
- addr.logicalBlockNum, addr.partitionReferenceNum);
+ addr.logicalBlockNum, addr.partitionReferenceNum);
mdata->s_bitmap_fe = udf_iget(sb, &addr);
if (mdata->s_bitmap_fe == NULL) {
if (sb->s_flags & MS_RDONLY)
- udf_warning(sb, __func__, "bitmap inode efe "
- "not found but it's ok since the disc"
- " is mounted read-only");
+ udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
else {
- udf_error(sb, __func__, "bitmap inode efe not "
- "found and attempted read-write mount");
+ udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
goto error_exit;
}
}
@@ -971,9 +950,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
if (bitmap == NULL) {
- udf_error(sb, __func__,
- "Unable to allocate space for bitmap "
- "and %d buffer_head pointers", nr_groups);
+ udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n",
+ nr_groups);
return NULL;
}
@@ -1003,10 +981,9 @@ static int udf_fill_partdesc_info(struct super_block *sb,
if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE))
map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE;
- udf_debug("Partition (%d type %x) starts at physical %d, "
- "block length %d\n", p_index,
- map->s_partition_type, map->s_partition_root,
- map->s_partition_len);
+ udf_debug("Partition (%d type %x) starts at physical %d, block length %d\n",
+ p_index, map->s_partition_type,
+ map->s_partition_root, map->s_partition_len);
if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
@@ -1023,12 +1000,12 @@ static int udf_fill_partdesc_info(struct super_block *sb,
map->s_uspace.s_table = udf_iget(sb, &loc);
if (!map->s_uspace.s_table) {
udf_debug("cannot load unallocSpaceTable (part %d)\n",
- p_index);
+ p_index);
return 1;
}
map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
udf_debug("unallocSpaceTable (part %d) @ %ld\n",
- p_index, map->s_uspace.s_table->i_ino);
+ p_index, map->s_uspace.s_table->i_ino);
}
if (phd->unallocSpaceBitmap.extLength) {
@@ -1041,8 +1018,8 @@ static int udf_fill_partdesc_info(struct super_block *sb,
bitmap->s_extPosition = le32_to_cpu(
phd->unallocSpaceBitmap.extPosition);
map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
- udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index,
- bitmap->s_extPosition);
+ udf_debug("unallocSpaceBitmap (part %d) @ %d\n",
+ p_index, bitmap->s_extPosition);
}
if (phd->partitionIntegrityTable.extLength)
@@ -1058,13 +1035,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
map->s_fspace.s_table = udf_iget(sb, &loc);
if (!map->s_fspace.s_table) {
udf_debug("cannot load freedSpaceTable (part %d)\n",
- p_index);
+ p_index);
return 1;
}
map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
udf_debug("freedSpaceTable (part %d) @ %ld\n",
- p_index, map->s_fspace.s_table->i_ino);
+ p_index, map->s_fspace.s_table->i_ino);
}
if (phd->freedSpaceBitmap.extLength) {
@@ -1077,8 +1054,8 @@ static int udf_fill_partdesc_info(struct super_block *sb,
bitmap->s_extPosition = le32_to_cpu(
phd->freedSpaceBitmap.extPosition);
map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
- udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index,
- bitmap->s_extPosition);
+ udf_debug("freedSpaceBitmap (part %d) @ %d\n",
+ p_index, bitmap->s_extPosition);
}
return 0;
}
@@ -1118,11 +1095,9 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
if (!sbi->s_vat_inode &&
sbi->s_last_block != blocks - 1) {
- printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the"
- " last recorded block (%lu), retrying with the last "
- "block of the device (%lu).\n",
- (unsigned long)sbi->s_last_block,
- (unsigned long)blocks - 1);
+ pr_notice("Failed to read VAT inode from the last recorded block (%lu), retrying with the last block of the device (%lu).\n",
+ (unsigned long)sbi->s_last_block,
+ (unsigned long)blocks - 1);
udf_find_vat_block(sb, p_index, type1_index, blocks - 1);
}
if (!sbi->s_vat_inode)
@@ -1220,8 +1195,8 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
if (map->s_partition_type == UDF_METADATA_MAP25) {
ret = udf_load_metadata_files(sb, i);
if (ret) {
- printk(KERN_ERR "UDF-fs: error loading MetaData "
- "partition map %d\n", i);
+ udf_err(sb, "error loading MetaData partition map %d\n",
+ i);
goto out_bh;
}
} else {
@@ -1234,9 +1209,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
* overwrite blocks instead of relocating them).
*/
sb->s_flags |= MS_RDONLY;
- printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only "
- "because writing to pseudooverwrite partition is "
- "not implemented.\n");
+ pr_notice("Filesystem marked read-only because writing to pseudooverwrite partition is not implemented\n");
}
out_bh:
/* In case loading failed, we handle cleanup in udf_fill_super */
@@ -1344,9 +1317,8 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
struct metadataPartitionMap *mdm =
(struct metadataPartitionMap *)
&(lvd->partitionMaps[offset]);
- udf_debug("Parsing Logical vol part %d "
- "type %d id=%s\n", i, type,
- UDF_ID_METADATA);
+ udf_debug("Parsing Logical vol part %d type %d id=%s\n",
+ i, type, UDF_ID_METADATA);
map->s_partition_type = UDF_METADATA_MAP25;
map->s_partition_func = udf_get_pblock_meta25;
@@ -1361,25 +1333,24 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
le32_to_cpu(mdm->allocUnitSize);
mdata->s_align_unit_size =
le16_to_cpu(mdm->alignUnitSize);
- mdata->s_dup_md_flag =
- mdm->flags & 0x01;
+ if (mdm->flags & 0x01)
+ mdata->s_flags |= MF_DUPLICATE_MD;
udf_debug("Metadata Ident suffix=0x%x\n",
- (le16_to_cpu(
- ((__le16 *)
- mdm->partIdent.identSuffix)[0])));
+ le16_to_cpu(*(__le16 *)
+ mdm->partIdent.identSuffix));
udf_debug("Metadata part num=%d\n",
- le16_to_cpu(mdm->partitionNum));
+ le16_to_cpu(mdm->partitionNum));
udf_debug("Metadata part alloc unit size=%d\n",
- le32_to_cpu(mdm->allocUnitSize));
+ le32_to_cpu(mdm->allocUnitSize));
udf_debug("Metadata file loc=%d\n",
- le32_to_cpu(mdm->metadataFileLoc));
+ le32_to_cpu(mdm->metadataFileLoc));
udf_debug("Mirror file loc=%d\n",
- le32_to_cpu(mdm->metadataMirrorFileLoc));
+ le32_to_cpu(mdm->metadataMirrorFileLoc));
udf_debug("Bitmap file loc=%d\n",
- le32_to_cpu(mdm->metadataBitmapFileLoc));
- udf_debug("Duplicate Flag: %d %d\n",
- mdata->s_dup_md_flag, mdm->flags);
+ le32_to_cpu(mdm->metadataBitmapFileLoc));
+ udf_debug("Flags: %d %d\n",
+ mdata->s_flags, mdm->flags);
} else {
udf_debug("Unknown ident: %s\n",
upm2->partIdent.ident);
@@ -1389,16 +1360,15 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
map->s_partition_num = le16_to_cpu(upm2->partitionNum);
}
udf_debug("Partition (%d:%d) type %d on volume %d\n",
- i, map->s_partition_num, type,
- map->s_volumeseqnum);
+ i, map->s_partition_num, type, map->s_volumeseqnum);
}
if (fileset) {
struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
*fileset = lelb_to_cpu(la->extLocation);
- udf_debug("FileSet found in LogicalVolDesc at block=%d, "
- "partition=%d\n", fileset->logicalBlockNum,
+ udf_debug("FileSet found in LogicalVolDesc at block=%d, partition=%d\n",
+ fileset->logicalBlockNum,
fileset->partitionReferenceNum);
}
if (lvd->integritySeqExt.extLength)
@@ -1478,9 +1448,9 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
bh = udf_read_tagged(sb, block, block, &ident);
if (!bh) {
- printk(KERN_ERR "udf: Block %Lu of volume descriptor "
- "sequence is corrupted or we could not read "
- "it.\n", (unsigned long long)block);
+ udf_err(sb,
+ "Block %llu of volume descriptor sequence is corrupted or we could not read it\n",
+ (unsigned long long)block);
return 1;
}
@@ -1553,7 +1523,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
* in a suitable order
*/
if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) {
- printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n");
+ udf_err(sb, "Primary Volume Descriptor not found!\n");
return 1;
}
if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block))
@@ -1740,7 +1710,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
if (!sb_set_blocksize(sb, uopt->blocksize)) {
if (!silent)
- printk(KERN_WARNING "UDF-fs: Bad block size\n");
+ udf_warn(sb, "Bad block size\n");
return 0;
}
sbi->s_last_block = uopt->lastblock;
@@ -1749,12 +1719,11 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
nsr_off = udf_check_vsd(sb);
if (!nsr_off) {
if (!silent)
- printk(KERN_WARNING "UDF-fs: No VRS found\n");
+ udf_warn(sb, "No VRS found\n");
return 0;
}
if (nsr_off == -1)
- udf_debug("Failed to read byte 32768. Assuming open "
- "disc. Skipping validity check\n");
+ udf_debug("Failed to read byte 32768. Assuming open disc. Skipping validity check\n");
if (!sbi->s_last_block)
sbi->s_last_block = udf_get_last_block(sb);
} else {
@@ -1765,7 +1734,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
sbi->s_anchor = uopt->anchor;
if (!udf_find_anchor(sb, fileset)) {
if (!silent)
- printk(KERN_WARNING "UDF-fs: No anchor found\n");
+ udf_warn(sb, "No anchor found\n");
return 0;
}
return 1;
@@ -1937,8 +1906,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
if (uopt.flags & (1 << UDF_FLAG_UTF8) &&
uopt.flags & (1 << UDF_FLAG_NLS_MAP)) {
- udf_error(sb, "udf_read_super",
- "utf8 cannot be combined with iocharset\n");
+ udf_err(sb, "utf8 cannot be combined with iocharset\n");
goto error_out;
}
#ifdef CONFIG_UDF_NLS
@@ -1987,15 +1955,14 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
ret = udf_load_vrs(sb, &uopt, silent, &fileset);
if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
if (!silent)
- printk(KERN_NOTICE
- "UDF-fs: Rescanning with blocksize "
- "%d\n", UDF_DEFAULT_BLOCKSIZE);
+ pr_notice("Rescanning with blocksize %d\n",
+ UDF_DEFAULT_BLOCKSIZE);
uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
ret = udf_load_vrs(sb, &uopt, silent, &fileset);
}
}
if (!ret) {
- printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
+ udf_warn(sb, "No partition found (1)\n");
goto error_out;
}
@@ -2010,10 +1977,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
le16_to_cpu(lvidiu->maxUDFWriteRev); */
if (minUDFReadRev > UDF_MAX_READ_VERSION) {
- printk(KERN_ERR "UDF-fs: minUDFReadRev=%x "
- "(max is %x)\n",
- le16_to_cpu(lvidiu->minUDFReadRev),
- UDF_MAX_READ_VERSION);
+ udf_err(sb, "minUDFReadRev=%x (max is %x)\n",
+ le16_to_cpu(lvidiu->minUDFReadRev),
+ UDF_MAX_READ_VERSION);
goto error_out;
} else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION)
sb->s_flags |= MS_RDONLY;
@@ -2027,28 +1993,27 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
}
if (!sbi->s_partitions) {
- printk(KERN_WARNING "UDF-fs: No partition found (2)\n");
+ udf_warn(sb, "No partition found (2)\n");
goto error_out;
}
if (sbi->s_partmaps[sbi->s_partition].s_partition_flags &
UDF_PART_FLAG_READ_ONLY) {
- printk(KERN_NOTICE "UDF-fs: Partition marked readonly; "
- "forcing readonly mount\n");
+ pr_notice("Partition marked readonly; forcing readonly mount\n");
sb->s_flags |= MS_RDONLY;
}
if (udf_find_fileset(sb, &fileset, &rootdir)) {
- printk(KERN_WARNING "UDF-fs: No fileset found\n");
+ udf_warn(sb, "No fileset found\n");
goto error_out;
}
if (!silent) {
struct timestamp ts;
udf_time_to_disk_stamp(&ts, sbi->s_record_time);
- udf_info("UDF: Mounting volume '%s', "
- "timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
- sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day,
+ udf_info("Mounting volume '%s', timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
+ sbi->s_volume_ident,
+ le16_to_cpu(ts.year), ts.month, ts.day,
ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone));
}
if (!(sb->s_flags & MS_RDONLY))
@@ -2059,8 +2024,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
/* perhaps it's not extensible enough, but for now ... */
inode = udf_iget(sb, &rootdir);
if (!inode) {
- printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, "
- "partition=%d\n",
+ udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
goto error_out;
}
@@ -2068,7 +2032,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
/* Allocate a dentry for the root inode */
sb->s_root = d_alloc_root(inode);
if (!sb->s_root) {
- printk(KERN_ERR "UDF-fs: Couldn't allocate root dentry\n");
+ udf_err(sb, "Couldn't allocate root dentry\n");
iput(inode);
goto error_out;
}
@@ -2096,32 +2060,40 @@ error_out:
return -EINVAL;
}
-static void udf_error(struct super_block *sb, const char *function,
- const char *fmt, ...)
+void _udf_err(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
- if (!(sb->s_flags & MS_RDONLY)) {
- /* mark sb error */
+ /* mark sb error */
+ if (!(sb->s_flags & MS_RDONLY))
sb->s_dirt = 1;
- }
+
va_start(args, fmt);
- vsnprintf(error_buf, sizeof(error_buf), fmt, args);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_err("error (device %s): %s: %pV", sb->s_id, function, &vaf);
+
va_end(args);
- printk(KERN_CRIT "UDF-fs error (device %s): %s: %s\n",
- sb->s_id, function, error_buf);
}
-void udf_warning(struct super_block *sb, const char *function,
- const char *fmt, ...)
+void _udf_warn(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
- vsnprintf(error_buf, sizeof(error_buf), fmt, args);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_warn("warning (device %s): %s: %pV", sb->s_id, function, &vaf);
+
va_end(args);
- printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s\n",
- sb->s_id, function, error_buf);
}
static void udf_put_super(struct super_block *sb)
@@ -2213,11 +2185,11 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
bh = udf_read_ptagged(sb, &loc, 0, &ident);
if (!bh) {
- printk(KERN_ERR "udf: udf_count_free failed\n");
+ udf_err(sb, "udf_count_free failed\n");
goto out;
} else if (ident != TAG_IDENT_SBD) {
brelse(bh);
- printk(KERN_ERR "udf: udf_count_free failed\n");
+ udf_err(sb, "udf_count_free failed\n");
goto out;
}
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 8424308db4b..4b98fee8e16 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -95,23 +95,21 @@ void udf_truncate_tail_extent(struct inode *inode)
lbcount += elen;
if (lbcount > inode->i_size) {
if (lbcount - inode->i_size >= inode->i_sb->s_blocksize)
- printk(KERN_WARNING
- "udf_truncate_tail_extent(): Too long "
- "extent after EOF in inode %u: i_size: "
- "%Ld lbcount: %Ld extent %u+%u\n",
- (unsigned)inode->i_ino,
- (long long)inode->i_size,
- (long long)lbcount,
- (unsigned)eloc.logicalBlockNum,
- (unsigned)elen);
+ udf_warn(inode->i_sb,
+ "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld extent %u+%u\n",
+ (unsigned)inode->i_ino,
+ (long long)inode->i_size,
+ (long long)lbcount,
+ (unsigned)eloc.logicalBlockNum,
+ (unsigned)elen);
nelen = elen - (lbcount - inode->i_size);
epos.offset -= adsize;
extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
epos.offset += adsize;
if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
- printk(KERN_ERR "udf_truncate_tail_extent(): "
- "Extent after EOF in inode %u.\n",
- (unsigned)inode->i_ino);
+ udf_err(inode->i_sb,
+ "Extent after EOF in inode %u\n",
+ (unsigned)inode->i_ino);
break;
}
}
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 4858c191242..5142a82e327 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -54,13 +54,16 @@
#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
+#define MF_DUPLICATE_MD 0x01
+#define MF_MIRROR_FE_LOADED 0x02
+
struct udf_meta_data {
__u32 s_meta_file_loc;
__u32 s_mirror_file_loc;
__u32 s_bitmap_file_loc;
__u32 s_alloc_unit_size;
__u16 s_align_unit_size;
- __u8 s_dup_md_flag;
+ int s_flags;
struct inode *s_metadata_fe;
struct inode *s_mirror_fe;
struct inode *s_bitmap_fe;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index dbd52d4b5ee..f34e6fc0cda 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -1,6 +1,8 @@
#ifndef __UDF_DECL_H
#define __UDF_DECL_H
+#define pr_fmt(fmt) "UDF-fs: " fmt
+
#include "ecma_167.h"
#include "osta_udf.h"
@@ -16,23 +18,30 @@
#define UDF_PREALLOCATE
#define UDF_DEFAULT_PREALLOC_BLOCKS 8
+extern __printf(3, 4) void _udf_err(struct super_block *sb,
+ const char *function, const char *fmt, ...);
+#define udf_err(sb, fmt, ...) \
+ _udf_err(sb, __func__, fmt, ##__VA_ARGS__)
+
+extern __printf(3, 4) void _udf_warn(struct super_block *sb,
+ const char *function, const char *fmt, ...);
+#define udf_warn(sb, fmt, ...) \
+ _udf_warn(sb, __func__, fmt, ##__VA_ARGS__)
+
+#define udf_info(fmt, ...) \
+ pr_info("INFO " fmt, ##__VA_ARGS__)
+
#undef UDFFS_DEBUG
#ifdef UDFFS_DEBUG
-#define udf_debug(f, a...) \
-do { \
- printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \
- __FILE__, __LINE__, __func__); \
- printk(f, ##a); \
-} while (0)
+#define udf_debug(fmt, ...) \
+ printk(KERN_DEBUG pr_fmt("%s:%d:%s: " fmt), \
+ __FILE__, __LINE__, __func__, ##__VA_ARGS__)
#else
-#define udf_debug(f, a...) /**/
+#define udf_debug(fmt, ...) \
+ no_printk(fmt, ##__VA_ARGS__)
#endif
-#define udf_info(f, a...) \
- printk(KERN_INFO "UDF-fs INFO " f, ##a);
-
-
#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) )
#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) )
@@ -112,8 +121,6 @@ struct extent_position {
/* super.c */
-__attribute__((format(printf, 3, 4)))
-extern void udf_warning(struct super_block *, const char *, const char *, ...);
static inline void udf_updated_lvid(struct super_block *sb)
{
struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
@@ -126,6 +133,8 @@ static inline void udf_updated_lvid(struct super_block *sb)
UDF_SB(sb)->s_lvid_dirty = 1;
}
extern u64 lvid_get_unique_id(struct super_block *sb);
+struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
+ u32 meta_file_loc, u32 partition_num);
/* namei.c */
extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index b8c828c4d20..1f11483eba6 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -34,9 +34,10 @@
* http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm
*/
+#include "udfdecl.h"
+
#include <linux/types.h>
#include <linux/kernel.h>
-#include "udfdecl.h"
#define EPOCH_YEAR 1970
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index d03a90b6ad6..44b815e57f9 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -114,7 +114,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
cmp_id = ocu_i->u_cmpID;
if (cmp_id != 8 && cmp_id != 16) {
memset(utf_o, 0, sizeof(struct ustr));
- printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
+ pr_err("unknown compression code (%d) stri=%s\n",
cmp_id, ocu_i->u_name);
return 0;
}
@@ -242,7 +242,7 @@ try_again:
if (utf_cnt) {
error_out:
ocu[++u_len] = '?';
- printk(KERN_DEBUG "udf: bad UTF-8 character\n");
+ printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n"));
}
ocu[length - 1] = (uint8_t)u_len + 1;
@@ -267,7 +267,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
cmp_id = ocu_i->u_cmpID;
if (cmp_id != 8 && cmp_id != 16) {
memset(utf_o, 0, sizeof(struct ustr));
- printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
+ pr_err("unknown compression code (%d) stri=%s\n",
cmp_id, ocu_i->u_name);
return 0;
}
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 2eabf04af3d..78a4c70d46b 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -341,7 +341,7 @@ cg_found:
fail_remove_inode:
unlock_super(sb);
- inode->i_nlink = 0;
+ clear_nlink(inode);
iput(inode);
UFSD("EXIT (FAILED): err %d\n", err);
return ERR_PTR(err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index b4d791a8320..879b13436fa 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -589,7 +589,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
* Copy data to the in-core inode.
*/
inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode);
- inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink);
+ set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink));
if (inode->i_nlink == 0) {
ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
return -1;
@@ -637,7 +637,7 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
* Copy data to the in-core inode.
*/
inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode);
- inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink);
+ set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink));
if (inode->i_nlink == 0) {
ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
return -1;
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 5be2755dd71..c26f2bcec26 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -117,9 +117,12 @@ extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buf
extern const struct file_operations ufs_dir_operations;
/* super.c */
-extern void ufs_warning (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4)));
-extern void ufs_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4)));
-extern void ufs_panic (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4)));
+extern __printf(3, 4)
+void ufs_warning(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ufs_error(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ufs_panic(struct super_block *, const char *, const char *, ...);
/* symlink.c */
extern const struct inode_operations ufs_fast_symlink_inode_operations;
diff --git a/fs/xattr.c b/fs/xattr.c
index f060663ab70..67583de8218 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -14,6 +14,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/security.h>
+#include <linux/evm.h>
#include <linux/syscalls.h>
#include <linux/module.h>
#include <linux/fsnotify.h>
@@ -166,6 +167,64 @@ out_noalloc:
}
EXPORT_SYMBOL_GPL(xattr_getsecurity);
+/*
+ * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr
+ *
+ * Allocate memory, if not already allocated, or re-allocate correct size,
+ * before retrieving the extended attribute.
+ *
+ * Returns the result of alloc, if failed, or the getxattr operation.
+ */
+ssize_t
+vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
+ size_t xattr_size, gfp_t flags)
+{
+ struct inode *inode = dentry->d_inode;
+ char *value = *xattr_value;
+ int error;
+
+ error = xattr_permission(inode, name, MAY_READ);
+ if (error)
+ return error;
+
+ if (!inode->i_op->getxattr)
+ return -EOPNOTSUPP;
+
+ error = inode->i_op->getxattr(dentry, name, NULL, 0);
+ if (error < 0)
+ return error;
+
+ if (!value || (error > xattr_size)) {
+ value = krealloc(*xattr_value, error + 1, flags);
+ if (!value)
+ return -ENOMEM;
+ memset(value, 0, error + 1);
+ }
+
+ error = inode->i_op->getxattr(dentry, name, value, error);
+ *xattr_value = value;
+ return error;
+}
+
+/* Compare an extended attribute value with the given value */
+int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name,
+ const char *value, size_t size, gfp_t flags)
+{
+ char *xattr_value = NULL;
+ int rc;
+
+ rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags);
+ if (rc < 0)
+ return rc;
+
+ if ((rc != size) || (memcmp(xattr_value, value, rc) != 0))
+ rc = -EINVAL;
+ else
+ rc = 0;
+ kfree(xattr_value);
+ return rc;
+}
+
ssize_t
vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
{
@@ -243,8 +302,10 @@ vfs_removexattr(struct dentry *dentry, const char *name)
error = inode->i_op->removexattr(dentry, name);
mutex_unlock(&inode->i_mutex);
- if (!error)
+ if (!error) {
fsnotify_xattr(dentry);
+ evm_inode_post_removexattr(dentry, name);
+ }
return error;
}
EXPORT_SYMBOL_GPL(vfs_removexattr);
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index f7c8f7a9ea6..292eff19803 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -61,12 +61,7 @@ extern void kmem_free(const void *);
static inline void *kmem_zalloc_large(size_t size)
{
- void *ptr;
-
- ptr = vmalloc(size);
- if (ptr)
- memset(ptr, 0, size);
- return ptr;
+ return vzalloc(size);
}
static inline void kmem_free_large(void *ptr)
{
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index b6c4b3795c4..76e4266d2e7 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -42,6 +42,8 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
int count, i;
count = be32_to_cpu(aclp->acl_cnt);
+ if (count > XFS_ACL_MAX_ENTRIES)
+ return ERR_PTR(-EFSCORRUPTED);
acl = posix_acl_alloc(count, GFP_KERNEL);
if (!acl)
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index bdd9cb54d63..ce84ffd0264 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -452,7 +452,7 @@ xfs_alloc_read_agfl(
if (error)
return error;
ASSERT(!xfs_buf_geterror(bp));
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF);
+ xfs_buf_set_ref(bp, XFS_AGFL_REF);
*bpp = bp;
return 0;
}
@@ -2139,7 +2139,7 @@ xfs_read_agf(
xfs_trans_brelse(tp, *bpp);
return XFS_ERROR(EFSCORRUPTED);
}
- XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
+ xfs_buf_set_ref(*bpp, XFS_AGF_REF);
return 0;
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 8c37dde4c52..574d4ee9b62 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -38,40 +38,6 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>
-
-/*
- * Prime number of hash buckets since address is used as the key.
- */
-#define NVSYNC 37
-#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
-static wait_queue_head_t xfs_ioend_wq[NVSYNC];
-
-void __init
-xfs_ioend_init(void)
-{
- int i;
-
- for (i = 0; i < NVSYNC; i++)
- init_waitqueue_head(&xfs_ioend_wq[i]);
-}
-
-void
-xfs_ioend_wait(
- xfs_inode_t *ip)
-{
- wait_queue_head_t *wq = to_ioend_wq(ip);
-
- wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
-}
-
-STATIC void
-xfs_ioend_wake(
- xfs_inode_t *ip)
-{
- if (atomic_dec_and_test(&ip->i_iocount))
- wake_up(to_ioend_wq(ip));
-}
-
void
xfs_count_page_state(
struct page *page,
@@ -115,25 +81,20 @@ xfs_destroy_ioend(
xfs_ioend_t *ioend)
{
struct buffer_head *bh, *next;
- struct xfs_inode *ip = XFS_I(ioend->io_inode);
for (bh = ioend->io_buffer_head; bh; bh = next) {
next = bh->b_private;
bh->b_end_io(bh, !ioend->io_error);
}
- /*
- * Volume managers supporting multiple paths can send back ENODEV
- * when the final path disappears. In this case continuing to fill
- * the page cache with dirty data which cannot be written out is
- * evil, so prevent that.
- */
- if (unlikely(ioend->io_error == -ENODEV)) {
- xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
- __FILE__, __LINE__);
+ if (ioend->io_iocb) {
+ if (ioend->io_isasync) {
+ aio_complete(ioend->io_iocb, ioend->io_error ?
+ ioend->io_error : ioend->io_result, 0);
+ }
+ inode_dio_done(ioend->io_inode);
}
- xfs_ioend_wake(ip);
mempool_free(ioend, xfs_ioend_pool);
}
@@ -156,6 +117,15 @@ xfs_ioend_new_eof(
}
/*
+ * Fast and loose check if this write could update the on-disk inode size.
+ */
+static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
+{
+ return ioend->io_offset + ioend->io_size >
+ XFS_I(ioend->io_inode)->i_d.di_size;
+}
+
+/*
* Update on-disk file size now that data has been written to disk. The
* current in-memory file size is i_size. If a write is beyond eof i_new_size
* will be the intended file size until i_size is updated. If this write does
@@ -173,9 +143,6 @@ xfs_setfilesize(
xfs_inode_t *ip = XFS_I(ioend->io_inode);
xfs_fsize_t isize;
- if (unlikely(ioend->io_error))
- return 0;
-
if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
return EAGAIN;
@@ -192,6 +159,9 @@ xfs_setfilesize(
/*
* Schedule IO completion handling on the final put of an ioend.
+ *
+ * If there is no work to do we might as well call it a day and free the
+ * ioend right now.
*/
STATIC void
xfs_finish_ioend(
@@ -200,8 +170,10 @@ xfs_finish_ioend(
if (atomic_dec_and_test(&ioend->io_remaining)) {
if (ioend->io_type == IO_UNWRITTEN)
queue_work(xfsconvertd_workqueue, &ioend->io_work);
- else
+ else if (xfs_ioend_is_append(ioend))
queue_work(xfsdatad_workqueue, &ioend->io_work);
+ else
+ xfs_destroy_ioend(ioend);
}
}
@@ -216,17 +188,24 @@ xfs_end_io(
struct xfs_inode *ip = XFS_I(ioend->io_inode);
int error = 0;
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ ioend->io_error = -EIO;
+ goto done;
+ }
+ if (ioend->io_error)
+ goto done;
+
/*
* For unwritten extents we need to issue transactions to convert a
* range to normal written extens after the data I/O has finished.
*/
- if (ioend->io_type == IO_UNWRITTEN &&
- likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
-
+ if (ioend->io_type == IO_UNWRITTEN) {
error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
ioend->io_size);
- if (error)
- ioend->io_error = error;
+ if (error) {
+ ioend->io_error = -error;
+ goto done;
+ }
}
/*
@@ -236,6 +215,7 @@ xfs_end_io(
error = xfs_setfilesize(ioend);
ASSERT(!error || error == EAGAIN);
+done:
/*
* If we didn't complete processing of the ioend, requeue it to the
* tail of the workqueue for another attempt later. Otherwise destroy
@@ -247,8 +227,6 @@ xfs_end_io(
/* ensure we don't spin on blocked ioends */
delay(1);
} else {
- if (ioend->io_iocb)
- aio_complete(ioend->io_iocb, ioend->io_result, 0);
xfs_destroy_ioend(ioend);
}
}
@@ -285,13 +263,13 @@ xfs_alloc_ioend(
* all the I/O from calling the completion routine too early.
*/
atomic_set(&ioend->io_remaining, 1);
+ ioend->io_isasync = 0;
ioend->io_error = 0;
ioend->io_list = NULL;
ioend->io_type = type;
ioend->io_inode = inode;
ioend->io_buffer_head = NULL;
ioend->io_buffer_tail = NULL;
- atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
ioend->io_offset = 0;
ioend->io_size = 0;
ioend->io_iocb = NULL;
@@ -337,8 +315,8 @@ xfs_map_blocks(
count = mp->m_maxioffset - offset;
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
- error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
- bmapi_flags, NULL, 0, imap, &nimaps, NULL);
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+ imap, &nimaps, bmapi_flags);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (error)
@@ -551,7 +529,6 @@ xfs_cancel_ioend(
unlock_buffer(bh);
} while ((bh = next_bh) != NULL);
- xfs_ioend_wake(XFS_I(ioend->io_inode));
mempool_free(ioend, xfs_ioend_pool);
} while ((ioend = next) != NULL);
}
@@ -925,11 +902,11 @@ xfs_vm_writepage(
* random callers for direct reclaim or memcg reclaim. We explicitly
* allow reclaim from kswapd as the stack usage there is relatively low.
*
- * This should really be done by the core VM, but until that happens
- * filesystems like XFS, btrfs and ext4 have to take care of this
- * by themselves.
+ * This should never happen except in the case of a VM regression so
+ * warn about it.
*/
- if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
+ if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
+ PF_MEMALLOC))
goto redirty;
/*
@@ -1161,8 +1138,8 @@ __xfs_get_blocks(
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
- error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
- XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+ &imap, &nimaps, XFS_BMAPI_ENTIRE);
if (error)
goto out_unlock;
@@ -1300,7 +1277,6 @@ xfs_end_io_direct_write(
bool is_async)
{
struct xfs_ioend *ioend = iocb->private;
- struct inode *inode = ioend->io_inode;
/*
* blockdev_direct_IO can return an error even after the I/O
@@ -1311,28 +1287,17 @@ xfs_end_io_direct_write(
ioend->io_offset = offset;
ioend->io_size = size;
+ ioend->io_iocb = iocb;
+ ioend->io_result = ret;
if (private && size > 0)
ioend->io_type = IO_UNWRITTEN;
if (is_async) {
- /*
- * If we are converting an unwritten extent we need to delay
- * the AIO completion until after the unwrittent extent
- * conversion has completed, otherwise do it ASAP.
- */
- if (ioend->io_type == IO_UNWRITTEN) {
- ioend->io_iocb = iocb;
- ioend->io_result = ret;
- } else {
- aio_complete(iocb, ret, 0);
- }
+ ioend->io_isasync = 1;
xfs_finish_ioend(ioend);
} else {
xfs_finish_ioend_sync(ioend);
}
-
- /* XXX: probably should move into the real I/O completion handler */
- inode_dio_done(inode);
}
STATIC ssize_t
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 71f721e1a71..116dd5c3703 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -47,6 +47,7 @@ typedef struct xfs_ioend {
unsigned int io_type; /* delalloc / unwritten */
int io_error; /* I/O error code */
atomic_t io_remaining; /* hold count */
+ unsigned int io_isasync : 1; /* needs aio_complete */
struct inode *io_inode; /* file being written to */
struct buffer_head *io_buffer_head;/* buffer linked list head */
struct buffer_head *io_buffer_tail;/* buffer linked list tail */
@@ -60,9 +61,6 @@ typedef struct xfs_ioend {
extern const struct address_space_operations xfs_address_space_operations;
extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
-extern void xfs_ioend_init(void);
-extern void xfs_ioend_wait(struct xfs_inode *);
-
extern void xfs_count_page_state(struct page *, int *, int *);
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 160bcdc34a6..1e5d97f86ea 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -319,7 +319,7 @@ xfs_attr_set_int(
return (error);
}
- xfs_trans_ijoin(args.trans, dp);
+ xfs_trans_ijoin(args.trans, dp, 0);
/*
* If the attribute list is non-existent or a shortform list,
@@ -389,7 +389,7 @@ xfs_attr_set_int(
* a new one. We need the inode to be in all transactions.
*/
if (committed)
- xfs_trans_ijoin(args.trans, dp);
+ xfs_trans_ijoin(args.trans, dp, 0);
/*
* Commit the leaf transformation. We'll need another (linked)
@@ -537,7 +537,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
* No need to make quota reservations here. We expect to release some
* blocks not allocate in the common case.
*/
- xfs_trans_ijoin(args.trans, dp);
+ xfs_trans_ijoin(args.trans, dp, 0);
/*
* Decide on what work routines to call based on the inode size.
@@ -809,7 +809,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
* No need to make quota reservations here. We expect to release some
* blocks, not allocate, in the common case.
*/
- xfs_trans_ijoin(trans, dp);
+ xfs_trans_ijoin(trans, dp, 0);
/*
* Decide on what work routines to call based on the inode size.
@@ -823,18 +823,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
if (error)
goto out;
- /*
- * Signal synchronous inactive transactions unless this is a
- * synchronous mount filesystem in which case we know that we're here
- * because we've been called out of xfs_inactive which means that the
- * last reference is gone and the unlink transaction has already hit
- * the disk so async inactive transactions are safe.
- */
- if (!(mp->m_flags & XFS_MOUNT_WSYNC)) {
- if (dp->i_d.di_anextents > 0)
- xfs_trans_set_sync(trans);
- }
-
error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
if (error)
goto out;
@@ -973,7 +961,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* a new one. We need the inode to be in all transactions.
*/
if (committed)
- xfs_trans_ijoin(args->trans, dp);
+ xfs_trans_ijoin(args->trans, dp, 0);
/*
* Commit the current trans (including the inode) and start
@@ -1075,7 +1063,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* in all transactions.
*/
if (committed)
- xfs_trans_ijoin(args->trans, dp);
+ xfs_trans_ijoin(args->trans, dp, 0);
} else
xfs_da_buf_done(bp);
@@ -1149,7 +1137,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
* a new one. We need the inode to be in all transactions.
*/
if (committed)
- xfs_trans_ijoin(args->trans, dp);
+ xfs_trans_ijoin(args->trans, dp, 0);
} else
xfs_da_buf_done(bp);
return(0);
@@ -1303,7 +1291,7 @@ restart:
* in all transactions.
*/
if (committed)
- xfs_trans_ijoin(args->trans, dp);
+ xfs_trans_ijoin(args->trans, dp, 0);
/*
* Commit the node conversion and start the next
@@ -1340,7 +1328,7 @@ restart:
* a new one. We need the inode to be in all transactions.
*/
if (committed)
- xfs_trans_ijoin(args->trans, dp);
+ xfs_trans_ijoin(args->trans, dp, 0);
} else {
/*
* Addition succeeded, update Btree hashvals.
@@ -1452,7 +1440,7 @@ restart:
* in all transactions.
*/
if (committed)
- xfs_trans_ijoin(args->trans, dp);
+ xfs_trans_ijoin(args->trans, dp, 0);
}
/*
@@ -1584,7 +1572,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
* a new one. We need the inode to be in all transactions.
*/
if (committed)
- xfs_trans_ijoin(args->trans, dp);
+ xfs_trans_ijoin(args->trans, dp, 0);
/*
* Commit the Btree join operation and start a new trans.
@@ -1635,7 +1623,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
* in all transactions.
*/
if (committed)
- xfs_trans_ijoin(args->trans, dp);
+ xfs_trans_ijoin(args->trans, dp, 0);
} else
xfs_da_brelse(args->trans, bp);
}
@@ -1975,10 +1963,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
lblkno = args->rmtblkno;
while (valuelen > 0) {
nmap = ATTR_RMTVALUE_MAPSIZE;
- error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- NULL, 0, map, &nmap, NULL);
+ error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+ args->rmtblkcnt, map, &nmap,
+ XFS_BMAPI_ATTRFORK);
if (error)
return(error);
ASSERT(nmap >= 1);
@@ -2052,10 +2039,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
*/
xfs_bmap_init(args->flist, args->firstblock);
nmap = 1;
- error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno,
+ error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
blkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
- XFS_BMAPI_WRITE,
+ XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
args->firstblock, args->total, &map, &nmap,
args->flist);
if (!error) {
@@ -2074,7 +2060,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
* a new one. We need the inode to be in all transactions.
*/
if (committed)
- xfs_trans_ijoin(args->trans, dp);
+ xfs_trans_ijoin(args->trans, dp, 0);
ASSERT(nmap == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -2104,14 +2090,11 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
*/
xfs_bmap_init(args->flist, args->firstblock);
nmap = 1;
- error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- args->firstblock, 0, &map, &nmap,
- NULL);
- if (error) {
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
+ args->rmtblkcnt, &map, &nmap,
+ XFS_BMAPI_ATTRFORK);
+ if (error)
return(error);
- }
ASSERT(nmap == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
(map.br_startblock != HOLESTARTBLOCK));
@@ -2121,16 +2104,17 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
XBF_LOCK | XBF_DONT_BLOCK);
- ASSERT(!xfs_buf_geterror(bp));
-
+ if (!bp)
+ return ENOMEM;
tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
XFS_BUF_SIZE(bp);
xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
if (tmp < XFS_BUF_SIZE(bp))
xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
- if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
- return (error);
- }
+ error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
+ xfs_buf_relse(bp);
+ if (error)
+ return error;
src += tmp;
valuelen -= tmp;
@@ -2166,16 +2150,12 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
/*
* Try to remember where we decided to put the value.
*/
- xfs_bmap_init(args->flist, args->firstblock);
nmap = 1;
- error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- args->firstblock, 0, &map, &nmap,
- args->flist);
- if (error) {
+ error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+ args->rmtblkcnt, &map, &nmap,
+ XFS_BMAPI_ATTRFORK);
+ if (error)
return(error);
- }
ASSERT(nmap == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
(map.br_startblock != HOLESTARTBLOCK));
@@ -2188,8 +2168,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
*/
bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
if (bp) {
- XFS_BUF_STALE(bp);
- XFS_BUF_UNDELAYWRITE(bp);
+ xfs_buf_stale(bp);
xfs_buf_relse(bp);
bp = NULL;
}
@@ -2227,7 +2206,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
* a new one. We need the inode to be in all transactions.
*/
if (committed)
- xfs_trans_ijoin(args->trans, args->dp);
+ xfs_trans_ijoin(args->trans, args->dp, 0);
/*
* Close out trans and start the next one in the chain.
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 8fad9602542..c1b55e59655 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -110,6 +110,7 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
/*
* Query whether the requested number of additional bytes of extended
* attribute space will be able to fit inline.
+ *
* Returns zero if not, else the di_forkoff fork offset to be used in the
* literal area for attribute data once the new bytes have been added.
*
@@ -122,7 +123,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
int offset;
int minforkoff; /* lower limit on valid forkoff locations */
int maxforkoff; /* upper limit on valid forkoff locations */
- int dsize;
+ int dsize;
xfs_mount_t *mp = dp->i_mount;
offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */
@@ -136,47 +137,60 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
return (offset >= minforkoff) ? minforkoff : 0;
}
- if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
- if (bytes <= XFS_IFORK_ASIZE(dp))
- return dp->i_d.di_forkoff;
+ /*
+ * If the requested numbers of bytes is smaller or equal to the
+ * current attribute fork size we can always proceed.
+ *
+ * Note that if_bytes in the data fork might actually be larger than
+ * the current data fork size is due to delalloc extents. In that
+ * case either the extent count will go down when they are converted
+ * to real extents, or the delalloc conversion will take care of the
+ * literal area rebalancing.
+ */
+ if (bytes <= XFS_IFORK_ASIZE(dp))
+ return dp->i_d.di_forkoff;
+
+ /*
+ * For attr2 we can try to move the forkoff if there is space in the
+ * literal area, but for the old format we are done if there is no
+ * space in the fixed attribute fork.
+ */
+ if (!(mp->m_flags & XFS_MOUNT_ATTR2))
return 0;
- }
dsize = dp->i_df.if_bytes;
-
+
switch (dp->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- /*
+ /*
* If there is no attr fork and the data fork is extents,
- * determine if creating the default attr fork will result
- * in the extents form migrating to btree. If so, the
- * minimum offset only needs to be the space required for
+ * determine if creating the default attr fork will result
+ * in the extents form migrating to btree. If so, the
+ * minimum offset only needs to be the space required for
* the btree root.
- */
+ */
if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
xfs_default_attroffset(dp))
dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
break;
-
case XFS_DINODE_FMT_BTREE:
/*
- * If have data btree then keep forkoff if we have one,
- * otherwise we are adding a new attr, so then we set
- * minforkoff to where the btree root can finish so we have
+ * If we have a data btree then keep forkoff if we have one,
+ * otherwise we are adding a new attr, so then we set
+ * minforkoff to where the btree root can finish so we have
* plenty of room for attrs
*/
if (dp->i_d.di_forkoff) {
- if (offset < dp->i_d.di_forkoff)
+ if (offset < dp->i_d.di_forkoff)
return 0;
- else
- return dp->i_d.di_forkoff;
- } else
- dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
+ return dp->i_d.di_forkoff;
+ }
+ dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
break;
}
-
- /*
- * A data fork btree root must have space for at least
+
+ /*
+ * A data fork btree root must have space for at least
* MINDBTPTRS key/ptr pairs if the data fork is small or empty.
*/
minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
@@ -186,10 +200,10 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
maxforkoff = maxforkoff >> 3; /* rounded down */
- if (offset >= minforkoff && offset < maxforkoff)
- return offset;
if (offset >= maxforkoff)
return maxforkoff;
+ if (offset >= minforkoff)
+ return offset;
return 0;
}
@@ -2926,9 +2940,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
* Try to remember where we decided to put the value.
*/
nmap = 1;
- error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- NULL, 0, &map, &nmap, NULL);
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
+ &map, &nmap, XFS_BMAPI_ATTRFORK);
if (error) {
return(error);
}
@@ -2948,6 +2961,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
bp = xfs_trans_get_buf(*trans,
dp->i_mount->m_ddev_targp,
dblkno, dblkcnt, XBF_LOCK);
+ if (!bp)
+ return ENOMEM;
xfs_trans_binval(*trans, bp);
/*
* Roll to next transaction.
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 452a291383a..d0ab7883705 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -50,17 +50,22 @@
#include "xfs_trace.h"
-#ifdef DEBUG
-STATIC void
-xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork);
-#endif
-
kmem_zone_t *xfs_bmap_free_item_zone;
/*
* Prototypes for internal bmap routines.
*/
+#ifdef DEBUG
+STATIC void
+xfs_bmap_check_leaf_extents(
+ struct xfs_btree_cur *cur,
+ struct xfs_inode *ip,
+ int whichfork);
+#else
+#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
+#endif
+
/*
* Called from xfs_bmap_add_attrfork to handle extents format files.
@@ -85,58 +90,6 @@ xfs_bmap_add_attrfork_local(
int *flags); /* inode logging flags */
/*
- * Called by xfs_bmap_add_extent to handle cases converting a delayed
- * allocation to a real allocation.
- */
-STATIC int /* error */
-xfs_bmap_add_extent_delay_real(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t *idx, /* extent number to update/insert */
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to add to file extents */
- xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
- xfs_fsblock_t *first, /* pointer to firstblock variable */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- int *logflagsp); /* inode logging flags */
-
-/*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a delayed allocation.
- */
-STATIC int /* error */
-xfs_bmap_add_extent_hole_delay(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t *idx, /* extent number to update/insert */
- xfs_bmbt_irec_t *new, /* new data to add to file extents */
- int *logflagsp); /* inode logging flags */
-
-/*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a real allocation.
- */
-STATIC int /* error */
-xfs_bmap_add_extent_hole_real(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t *idx, /* extent number to update/insert */
- xfs_btree_cur_t *cur, /* if null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to add to file extents */
- int *logflagsp, /* inode logging flags */
- int whichfork); /* data or attr fork */
-
-/*
- * Called by xfs_bmap_add_extent to handle cases converting an unwritten
- * allocation to a real allocation or vice versa.
- */
-STATIC int /* error */
-xfs_bmap_add_extent_unwritten_real(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t *idx, /* extent number to update/insert */
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to add to file extents */
- int *logflagsp); /* inode logging flags */
-
-/*
* xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
* It figures out where to ask the underlying allocator to put the new extent.
*/
@@ -215,19 +168,6 @@ xfs_bmap_search_extents(
xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */
/*
- * Check the last inode extent to determine whether this allocation will result
- * in blocks being allocated at the end of the file. When we allocate new data
- * blocks at the end of the file which do not start at the previous data block,
- * we will try to align the new blocks at stripe unit boundaries.
- */
-STATIC int /* error */
-xfs_bmap_isaeof(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t off, /* file offset in fsblocks */
- int whichfork, /* data or attribute fork */
- char *aeof); /* return value */
-
-/*
* Compute the worst-case number of indirect blocks that will be used
* for ip's delayed extent of length "len".
*/
@@ -431,188 +371,13 @@ xfs_bmap_add_attrfork_local(
}
/*
- * Called by xfs_bmapi to update file extent records and the btree
- * after allocating space (or doing a delayed allocation).
- */
-STATIC int /* error */
-xfs_bmap_add_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t *idx, /* extent number to update/insert */
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to add to file extents */
- xfs_fsblock_t *first, /* pointer to firstblock variable */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
-{
- xfs_btree_cur_t *cur; /* btree cursor or null */
- xfs_filblks_t da_new; /* new count del alloc blocks used */
- xfs_filblks_t da_old; /* old count del alloc blocks used */
- int error; /* error return value */
- xfs_ifork_t *ifp; /* inode fork ptr */
- int logflags; /* returned value */
- xfs_extnum_t nextents; /* number of extents in file now */
-
- XFS_STATS_INC(xs_add_exlist);
-
- cur = *curp;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- da_old = da_new = 0;
- error = 0;
-
- ASSERT(*idx >= 0);
- ASSERT(*idx <= nextents);
-
- /*
- * This is the first extent added to a new/empty file.
- * Special case this one, so other routines get to assume there are
- * already extents in the list.
- */
- if (nextents == 0) {
- xfs_iext_insert(ip, *idx, 1, new,
- whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
-
- ASSERT(cur == NULL);
-
- if (!isnullstartblock(new->br_startblock)) {
- XFS_IFORK_NEXT_SET(ip, whichfork, 1);
- logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
- } else
- logflags = 0;
- }
- /*
- * Any kind of new delayed allocation goes here.
- */
- else if (isnullstartblock(new->br_startblock)) {
- if (cur)
- ASSERT((cur->bc_private.b.flags &
- XFS_BTCUR_BPRV_WASDEL) == 0);
- error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
- &logflags);
- }
- /*
- * Real allocation off the end of the file.
- */
- else if (*idx == nextents) {
- if (cur)
- ASSERT((cur->bc_private.b.flags &
- XFS_BTCUR_BPRV_WASDEL) == 0);
- error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
- &logflags, whichfork);
- } else {
- xfs_bmbt_irec_t prev; /* old extent at offset idx */
-
- /*
- * Get the record referred to by idx.
- */
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev);
- /*
- * If it's a real allocation record, and the new allocation ends
- * after the start of the referred to record, then we're filling
- * in a delayed or unwritten allocation with a real one, or
- * converting real back to unwritten.
- */
- if (!isnullstartblock(new->br_startblock) &&
- new->br_startoff + new->br_blockcount > prev.br_startoff) {
- if (prev.br_state != XFS_EXT_UNWRITTEN &&
- isnullstartblock(prev.br_startblock)) {
- da_old = startblockval(prev.br_startblock);
- if (cur)
- ASSERT(cur->bc_private.b.flags &
- XFS_BTCUR_BPRV_WASDEL);
- error = xfs_bmap_add_extent_delay_real(tp, ip,
- idx, &cur, new, &da_new,
- first, flist, &logflags);
- } else {
- ASSERT(new->br_state == XFS_EXT_NORM ||
- new->br_state == XFS_EXT_UNWRITTEN);
-
- error = xfs_bmap_add_extent_unwritten_real(ip,
- idx, &cur, new, &logflags);
- if (error)
- goto done;
- }
- }
- /*
- * Otherwise we're filling in a hole with an allocation.
- */
- else {
- if (cur)
- ASSERT((cur->bc_private.b.flags &
- XFS_BTCUR_BPRV_WASDEL) == 0);
- error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
- new, &logflags, whichfork);
- }
- }
-
- if (error)
- goto done;
- ASSERT(*curp == cur || *curp == NULL);
-
- /*
- * Convert to a btree if necessary.
- */
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
- int tmp_logflags; /* partial log flag return val */
-
- ASSERT(cur == NULL);
- error = xfs_bmap_extents_to_btree(tp, ip, first,
- flist, &cur, da_old > 0, &tmp_logflags, whichfork);
- logflags |= tmp_logflags;
- if (error)
- goto done;
- }
- /*
- * Adjust for changes in reserved delayed indirect blocks.
- * Nothing to do for disk quotas here.
- */
- if (da_old || da_new) {
- xfs_filblks_t nblks;
-
- nblks = da_new;
- if (cur)
- nblks += cur->bc_private.b.allocated;
- ASSERT(nblks <= da_old);
- if (nblks < da_old)
- xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
- (int64_t)(da_old - nblks), 0);
- }
- /*
- * Clear out the allocated field, done with it now in any case.
- */
- if (cur) {
- cur->bc_private.b.allocated = 0;
- *curp = cur;
- }
-done:
-#ifdef DEBUG
- if (!error)
- xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
-#endif
- *logflagsp = logflags;
- return error;
-}
-
-/*
- * Called by xfs_bmap_add_extent to handle cases converting a delayed
- * allocation to a real allocation.
+ * Convert a delayed allocation to a real allocation.
*/
STATIC int /* error */
xfs_bmap_add_extent_delay_real(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t *idx, /* extent number to update/insert */
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to add to file extents */
- xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
- xfs_fsblock_t *first, /* pointer to firstblock variable */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- int *logflagsp) /* inode logging flags */
+ struct xfs_bmalloca *bma)
{
- xfs_btree_cur_t *cur; /* btree cursor */
+ struct xfs_bmbt_irec *new = &bma->got;
int diff; /* temp value */
xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
int error; /* error return value */
@@ -623,10 +388,22 @@ xfs_bmap_add_extent_delay_real(
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
int state = 0;/* state bits, accessed thru macros */
- xfs_filblks_t temp=0; /* value for dnew calculations */
- xfs_filblks_t temp2=0;/* value for dnew calculations */
+ xfs_filblks_t da_new; /* new count del alloc blocks used */
+ xfs_filblks_t da_old; /* old count del alloc blocks used */
+ xfs_filblks_t temp=0; /* value for da_new calculations */
+ xfs_filblks_t temp2=0;/* value for da_new calculations */
int tmp_rval; /* partial logging flags */
+ ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
+
+ ASSERT(bma->idx >= 0);
+ ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(!isnullstartblock(new->br_startblock));
+ ASSERT(!bma->cur ||
+ (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+ XFS_STATS_INC(xs_add_exlist);
+
#define LEFT r[0]
#define RIGHT r[1]
#define PREV r[2]
@@ -634,14 +411,15 @@ xfs_bmap_add_extent_delay_real(
/*
* Set up a bunch of variables to make the tests simpler.
*/
- cur = *curp;
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- ep = xfs_iext_get_ext(ifp, *idx);
+ ep = xfs_iext_get_ext(ifp, bma->idx);
xfs_bmbt_get_all(ep, &PREV);
new_endoff = new->br_startoff + new->br_blockcount;
ASSERT(PREV.br_startoff <= new->br_startoff);
ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+ da_old = startblockval(PREV.br_startblock);
+ da_new = 0;
+
/*
* Set flags determining what part of the previous delayed allocation
* extent is being replaced by a real allocation.
@@ -655,9 +433,9 @@ xfs_bmap_add_extent_delay_real(
* Check and set flags if this segment has a left neighbor.
* Don't set contiguous if the combined extent would be too large.
*/
- if (*idx > 0) {
+ if (bma->idx > 0) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
if (isnullstartblock(LEFT.br_startblock))
state |= BMAP_LEFT_DELAY;
@@ -675,9 +453,9 @@ xfs_bmap_add_extent_delay_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
if (isnullstartblock(RIGHT.br_startblock))
state |= BMAP_RIGHT_DELAY;
@@ -708,38 +486,41 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The left and right neighbors are both contiguous with new.
*/
- --*idx;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+ bma->idx--;
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
LEFT.br_blockcount + PREV.br_blockcount +
RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_iext_remove(ip, *idx + 1, 2, state);
- ip->i_d.di_nextents--;
- if (cur == NULL)
+ xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
+ bma->ip->i_d.di_nextents--;
+ if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
RIGHT.br_startblock,
- RIGHT.br_blockcount, &i)))
+ RIGHT.br_blockcount, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_btree_delete(cur, &i)))
+ error = xfs_btree_delete(bma->cur, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_btree_decrement(cur, 0, &i)))
+ error = xfs_btree_decrement(bma->cur, 0, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+ error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount +
PREV.br_blockcount +
- RIGHT.br_blockcount, LEFT.br_state)))
+ RIGHT.br_blockcount, LEFT.br_state);
+ if (error)
goto done;
}
- *dnew = 0;
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -747,30 +528,31 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The left neighbor is contiguous, the right is not.
*/
- --*idx;
+ bma->idx--;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
LEFT.br_blockcount + PREV.br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_iext_remove(ip, *idx + 1, 1, state);
- if (cur == NULL)
+ xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+ if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
LEFT.br_startblock, LEFT.br_blockcount,
- &i)))
+ &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+ error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount +
- PREV.br_blockcount, LEFT.br_state)))
+ PREV.br_blockcount, LEFT.br_state);
+ if (error)
goto done;
}
- *dnew = 0;
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -778,30 +560,30 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The right neighbor is contiguous, the left is not.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_bmbt_set_startblock(ep, new->br_startblock);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount + RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_iext_remove(ip, *idx + 1, 1, state);
- if (cur == NULL)
+ xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+ if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
RIGHT.br_startblock,
- RIGHT.br_blockcount, &i)))
+ RIGHT.br_blockcount, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+ error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
new->br_startblock,
PREV.br_blockcount +
- RIGHT.br_blockcount, PREV.br_state)))
+ RIGHT.br_blockcount, PREV.br_state);
+ if (error)
goto done;
}
-
- *dnew = 0;
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -810,27 +592,27 @@ xfs_bmap_add_extent_delay_real(
* Neither the left nor right neighbors are contiguous with
* the new one.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_bmbt_set_startblock(ep, new->br_startblock);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- ip->i_d.di_nextents++;
- if (cur == NULL)
+ bma->ip->i_d.di_nextents++;
+ if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
new->br_startblock, new->br_blockcount,
- &i)))
+ &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 0, done);
- cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_btree_insert(cur, &i)))
+ bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
-
- *dnew = 0;
break;
case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -838,39 +620,40 @@ xfs_bmap_add_extent_delay_real(
* Filling in the first part of a previous delayed allocation.
* The left neighbor is contiguous.
*/
- trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
LEFT.br_blockcount + new->br_blockcount);
xfs_bmbt_set_startoff(ep,
PREV.br_startoff + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- if (cur == NULL)
+ if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
LEFT.br_startblock, LEFT.br_blockcount,
- &i)))
+ &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+ error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount +
new->br_blockcount,
- LEFT.br_state)))
+ LEFT.br_state);
+ if (error)
goto done;
}
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock));
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- --*idx;
- *dnew = temp;
+ bma->idx--;
break;
case BMAP_LEFT_FILLING:
@@ -878,43 +661,43 @@ xfs_bmap_add_extent_delay_real(
* Filling in the first part of a previous delayed allocation.
* The left neighbor is not contiguous.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_bmbt_set_startoff(ep, new_endoff);
temp = PREV.br_blockcount - new->br_blockcount;
xfs_bmbt_set_blockcount(ep, temp);
- xfs_iext_insert(ip, *idx, 1, new, state);
- ip->i_d.di_nextents++;
- if (cur == NULL)
+ xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+ bma->ip->i_d.di_nextents++;
+ if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
new->br_startblock, new->br_blockcount,
- &i)))
+ &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 0, done);
- cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_btree_insert(cur, &i)))
+ bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_nextents > ip->i_df.if_ext_max) {
- error = xfs_bmap_extents_to_btree(tp, ip,
- first, flist, &cur, 1, &tmp_rval,
- XFS_DATA_FORK);
+ if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ bma->firstblock, bma->flist,
+ &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
rval |= tmp_rval;
if (error)
goto done;
}
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (cur ? cur->bc_private.b.allocated : 0));
- ep = xfs_iext_get_ext(ifp, *idx + 1);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
-
- *dnew = temp;
+ (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+ ep = xfs_iext_get_ext(ifp, bma->idx + 1);
+ xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+ trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
break;
case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -923,38 +706,39 @@ xfs_bmap_add_extent_delay_real(
* The right neighbor is contiguous with the new allocation.
*/
temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1),
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
new->br_startoff, new->br_startblock,
new->br_blockcount + RIGHT.br_blockcount,
RIGHT.br_state);
- trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
- if (cur == NULL)
+ trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+ if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
RIGHT.br_startblock,
- RIGHT.br_blockcount, &i)))
+ RIGHT.br_blockcount, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_update(cur, new->br_startoff,
+ error = xfs_bmbt_update(bma->cur, new->br_startoff,
new->br_startblock,
new->br_blockcount +
RIGHT.br_blockcount,
- RIGHT.br_state)))
+ RIGHT.br_state);
+ if (error)
goto done;
}
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock));
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- ++*idx;
- *dnew = temp;
+ bma->idx++;
break;
case BMAP_RIGHT_FILLING:
@@ -963,42 +747,43 @@ xfs_bmap_add_extent_delay_real(
* The right neighbor is not contiguous.
*/
temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- xfs_iext_insert(ip, *idx + 1, 1, new, state);
- ip->i_d.di_nextents++;
- if (cur == NULL)
+ xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
+ bma->ip->i_d.di_nextents++;
+ if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
new->br_startblock, new->br_blockcount,
- &i)))
+ &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 0, done);
- cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_btree_insert(cur, &i)))
+ bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_nextents > ip->i_df.if_ext_max) {
- error = xfs_bmap_extents_to_btree(tp, ip,
- first, flist, &cur, 1, &tmp_rval,
- XFS_DATA_FORK);
+ if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ bma->firstblock, bma->flist, &bma->cur, 1,
+ &tmp_rval, XFS_DATA_FORK);
rval |= tmp_rval;
if (error)
goto done;
}
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (cur ? cur->bc_private.b.allocated : 0));
- ep = xfs_iext_get_ext(ifp, *idx);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+ ep = xfs_iext_get_ext(ifp, bma->idx);
+ xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- ++*idx;
- *dnew = temp;
+ bma->idx++;
break;
case 0:
@@ -1024,82 +809,65 @@ xfs_bmap_add_extent_delay_real(
*/
temp = new->br_startoff - PREV.br_startoff;
temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
- trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
LEFT = *new;
RIGHT.br_state = PREV.br_state;
RIGHT.br_startblock = nullstartblock(
- (int)xfs_bmap_worst_indlen(ip, temp2));
+ (int)xfs_bmap_worst_indlen(bma->ip, temp2));
RIGHT.br_startoff = new_endoff;
RIGHT.br_blockcount = temp2;
/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
- xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state);
- ip->i_d.di_nextents++;
- if (cur == NULL)
+ xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
+ bma->ip->i_d.di_nextents++;
+ if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
new->br_startblock, new->br_blockcount,
- &i)))
+ &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 0, done);
- cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_btree_insert(cur, &i)))
+ bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_nextents > ip->i_df.if_ext_max) {
- error = xfs_bmap_extents_to_btree(tp, ip,
- first, flist, &cur, 1, &tmp_rval,
- XFS_DATA_FORK);
+ if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ bma->firstblock, bma->flist, &bma->cur,
+ 1, &tmp_rval, XFS_DATA_FORK);
rval |= tmp_rval;
if (error)
goto done;
}
- temp = xfs_bmap_worst_indlen(ip, temp);
- temp2 = xfs_bmap_worst_indlen(ip, temp2);
+ temp = xfs_bmap_worst_indlen(bma->ip, temp);
+ temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
- (cur ? cur->bc_private.b.allocated : 0));
- if (diff > 0 &&
- xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
- -((int64_t)diff), 0)) {
- /*
- * Ick gross gag me with a spoon.
- */
- ASSERT(0); /* want to see if this ever happens! */
- while (diff > 0) {
- if (temp) {
- temp--;
- diff--;
- if (!diff ||
- !xfs_icsb_modify_counters(ip->i_mount,
- XFS_SBS_FDBLOCKS,
- -((int64_t)diff), 0))
- break;
- }
- if (temp2) {
- temp2--;
- diff--;
- if (!diff ||
- !xfs_icsb_modify_counters(ip->i_mount,
- XFS_SBS_FDBLOCKS,
- -((int64_t)diff), 0))
- break;
- }
- }
+ (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+ if (diff > 0) {
+ error = xfs_icsb_modify_counters(bma->ip->i_mount,
+ XFS_SBS_FDBLOCKS,
+ -((int64_t)diff), 0);
+ ASSERT(!error);
+ if (error)
+ goto done;
}
- ep = xfs_iext_get_ext(ifp, *idx);
+
+ ep = xfs_iext_get_ext(ifp, bma->idx);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_);
- xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2),
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
nullstartblock((int)temp2));
- trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
- ++*idx;
- *dnew = temp + temp2;
+ bma->idx++;
+ da_new = temp + temp2;
break;
case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1114,9 +882,40 @@ xfs_bmap_add_extent_delay_real(
*/
ASSERT(0);
}
- *curp = cur;
+
+ /* convert to a btree if necessary */
+ if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(bma->cur == NULL);
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ bma->firstblock, bma->flist, &bma->cur,
+ da_old > 0, &tmp_logflags, XFS_DATA_FORK);
+ bma->logflags |= tmp_logflags;
+ if (error)
+ goto done;
+ }
+
+ /* adjust for changes in reserved delayed indirect blocks */
+ if (da_old || da_new) {
+ temp = da_new;
+ if (bma->cur)
+ temp += bma->cur->bc_private.b.allocated;
+ ASSERT(temp <= da_old);
+ if (temp < da_old)
+ xfs_icsb_modify_counters(bma->ip->i_mount,
+ XFS_SBS_FDBLOCKS,
+ (int64_t)(da_old - temp), 0);
+ }
+
+ /* clear out the allocated field, done with it now in any case. */
+ if (bma->cur)
+ bma->cur->bc_private.b.allocated = 0;
+
+ xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
done:
- *logflagsp = rval;
+ bma->logflags |= rval;
return error;
#undef LEFT
#undef RIGHT
@@ -1124,15 +923,17 @@ done:
}
/*
- * Called by xfs_bmap_add_extent to handle cases converting an unwritten
- * allocation to a real allocation or vice versa.
+ * Convert an unwritten allocation to a real allocation or vice versa.
*/
STATIC int /* error */
xfs_bmap_add_extent_unwritten_real(
+ struct xfs_trans *tp,
xfs_inode_t *ip, /* incore inode pointer */
xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
+ xfs_fsblock_t *first, /* pointer to firstblock variable */
+ xfs_bmap_free_t *flist, /* list of extents to be freed */
int *logflagsp) /* inode logging flags */
{
xfs_btree_cur_t *cur; /* btree cursor */
@@ -1148,15 +949,25 @@ xfs_bmap_add_extent_unwritten_real(
int rval=0; /* return value (logging flags) */
int state = 0;/* state bits, accessed thru macros */
+ *logflagsp = 0;
+
+ cur = *curp;
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+
+ ASSERT(*idx >= 0);
+ ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(!isnullstartblock(new->br_startblock));
+
+ XFS_STATS_INC(xs_add_exlist);
+
#define LEFT r[0]
#define RIGHT r[1]
#define PREV r[2]
+
/*
* Set up a bunch of variables to make the tests simpler.
*/
error = 0;
- cur = *curp;
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_get_all(ep, &PREV);
newext = new->br_state;
@@ -1406,10 +1217,11 @@ xfs_bmap_add_extent_unwritten_real(
goto done;
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- if (xfs_bmbt_update(cur, LEFT.br_startoff,
+ error = xfs_bmbt_update(cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount + new->br_blockcount,
- LEFT.br_state))
+ LEFT.br_state);
+ if (error)
goto done;
}
break;
@@ -1607,9 +1419,29 @@ xfs_bmap_add_extent_unwritten_real(
*/
ASSERT(0);
}
- *curp = cur;
+
+ /* convert to a btree if necessary */
+ if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
+ 0, &tmp_logflags, XFS_DATA_FORK);
+ *logflagsp |= tmp_logflags;
+ if (error)
+ goto done;
+ }
+
+ /* clear out the allocated field, done with it now in any case. */
+ if (cur) {
+ cur->bc_private.b.allocated = 0;
+ *curp = cur;
+ }
+
+ xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
done:
- *logflagsp = rval;
+ *logflagsp |= rval;
return error;
#undef LEFT
#undef RIGHT
@@ -1617,16 +1449,13 @@ done:
}
/*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a delayed allocation.
+ * Convert a hole to a delayed allocation.
*/
-/*ARGSUSED*/
-STATIC int /* error */
+STATIC void
xfs_bmap_add_extent_hole_delay(
xfs_inode_t *ip, /* incore inode pointer */
xfs_extnum_t *idx, /* extent number to update/insert */
- xfs_bmbt_irec_t *new, /* new data to add to file extents */
- int *logflagsp) /* inode logging flags */
+ xfs_bmbt_irec_t *new) /* new data to add to file extents */
{
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
@@ -1761,23 +1590,17 @@ xfs_bmap_add_extent_hole_delay(
* Nothing to do for disk quota accounting here.
*/
}
- *logflagsp = 0;
- return 0;
}
/*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a real allocation.
+ * Convert a hole to a real allocation.
*/
STATIC int /* error */
xfs_bmap_add_extent_hole_real(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t *idx, /* extent number to update/insert */
- xfs_btree_cur_t *cur, /* if null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to add to file extents */
- int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
+ struct xfs_bmalloca *bma,
+ int whichfork)
{
+ struct xfs_bmbt_irec *new = &bma->got;
int error; /* error return value */
int i; /* temp state */
xfs_ifork_t *ifp; /* inode fork pointer */
@@ -1786,19 +1609,26 @@ xfs_bmap_add_extent_hole_real(
int rval=0; /* return value (logging flags) */
int state; /* state bits, accessed thru macros */
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
- state = 0;
+ ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+
+ ASSERT(bma->idx >= 0);
+ ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(!isnullstartblock(new->br_startblock));
+ ASSERT(!bma->cur ||
+ !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+ XFS_STATS_INC(xs_add_exlist);
+ state = 0;
if (whichfork == XFS_ATTR_FORK)
state |= BMAP_ATTRFORK;
/*
* Check and set flags if this segment has a left neighbor.
*/
- if (*idx > 0) {
+ if (bma->idx > 0) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -1807,9 +1637,9 @@ xfs_bmap_add_extent_hole_real(
* Check and set flags if this segment has a current value.
* Not true if we're inserting into the "hole" at eof.
*/
- if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -1846,39 +1676,42 @@ xfs_bmap_add_extent_hole_real(
* left and on the right.
* Merge all three into a single extent record.
*/
- --*idx;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+ --bma->idx;
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
left.br_blockcount + new->br_blockcount +
right.br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_iext_remove(ip, *idx + 1, 1, state);
+ xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
- if (cur == NULL) {
+ XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+ XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
+ if (bma->cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur,
- right.br_startoff,
- right.br_startblock,
- right.br_blockcount, &i)))
+ error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
+ right.br_startblock, right.br_blockcount,
+ &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_btree_delete(cur, &i)))
+ error = xfs_btree_delete(bma->cur, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_btree_decrement(cur, 0, &i)))
+ error = xfs_btree_decrement(bma->cur, 0, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_update(cur, left.br_startoff,
+ error = xfs_bmbt_update(bma->cur, left.br_startoff,
left.br_startblock,
left.br_blockcount +
new->br_blockcount +
right.br_blockcount,
- left.br_state)))
+ left.br_state);
+ if (error)
goto done;
}
break;
@@ -1889,27 +1722,28 @@ xfs_bmap_add_extent_hole_real(
* on the left.
* Merge the new allocation with the left neighbor.
*/
- --*idx;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+ --bma->idx;
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
left.br_blockcount + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- if (cur == NULL) {
+ if (bma->cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur,
- left.br_startoff,
- left.br_startblock,
- left.br_blockcount, &i)))
+ error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
+ left.br_startblock, left.br_blockcount,
+ &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_update(cur, left.br_startoff,
+ error = xfs_bmbt_update(bma->cur, left.br_startoff,
left.br_startblock,
left.br_blockcount +
new->br_blockcount,
- left.br_state)))
+ left.br_state);
+ if (error)
goto done;
}
break;
@@ -1920,28 +1754,30 @@ xfs_bmap_add_extent_hole_real(
* on the right.
* Merge the new allocation with the right neighbor.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
new->br_startoff, new->br_startblock,
new->br_blockcount + right.br_blockcount,
right.br_state);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- if (cur == NULL) {
+ if (bma->cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur,
+ error = xfs_bmbt_lookup_eq(bma->cur,
right.br_startoff,
right.br_startblock,
- right.br_blockcount, &i)))
+ right.br_blockcount, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_update(cur, new->br_startoff,
+ error = xfs_bmbt_update(bma->cur, new->br_startoff,
new->br_startblock,
new->br_blockcount +
right.br_blockcount,
- right.br_state)))
+ right.br_state);
+ if (error)
goto done;
}
break;
@@ -1952,28 +1788,50 @@ xfs_bmap_add_extent_hole_real(
* real allocation.
* Insert a new entry.
*/
- xfs_iext_insert(ip, *idx, 1, new, state);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
- if (cur == NULL) {
+ xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+ XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+ XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
+ if (bma->cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur,
+ error = xfs_bmbt_lookup_eq(bma->cur,
new->br_startoff,
new->br_startblock,
- new->br_blockcount, &i)))
+ new->br_blockcount, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 0, done);
- cur->bc_rec.b.br_state = new->br_state;
- if ((error = xfs_btree_insert(cur, &i)))
+ bma->cur->bc_rec.b.br_state = new->br_state;
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
break;
}
+
+ /* convert to a btree if necessary */
+ if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(bma->cur == NULL);
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ bma->firstblock, bma->flist, &bma->cur,
+ 0, &tmp_logflags, whichfork);
+ bma->logflags |= tmp_logflags;
+ if (error)
+ goto done;
+ }
+
+ /* clear out the allocated field, done with it now in any case. */
+ if (bma->cur)
+ bma->cur->bc_private.b.allocated = 0;
+
+ xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
done:
- *logflagsp = rval;
+ bma->logflags |= rval;
return error;
}
@@ -2160,26 +2018,26 @@ xfs_bmap_adjacent(
XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
mp = ap->ip->i_mount;
- nullfb = ap->firstblock == NULLFSBLOCK;
+ nullfb = *ap->firstblock == NULLFSBLOCK;
rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
- fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
+ fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
/*
* If allocating at eof, and there's a previous real block,
* try to use its last block as our starting point.
*/
- if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
- !isnullstartblock(ap->prevp->br_startblock) &&
- ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount,
- ap->prevp->br_startblock)) {
- ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
+ if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
+ !isnullstartblock(ap->prev.br_startblock) &&
+ ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
+ ap->prev.br_startblock)) {
+ ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
/*
* Adjust for the gap between prevp and us.
*/
- adjust = ap->off -
- (ap->prevp->br_startoff + ap->prevp->br_blockcount);
+ adjust = ap->offset -
+ (ap->prev.br_startoff + ap->prev.br_blockcount);
if (adjust &&
- ISVALID(ap->rval + adjust, ap->prevp->br_startblock))
- ap->rval += adjust;
+ ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
+ ap->blkno += adjust;
}
/*
* If not at eof, then compare the two neighbor blocks.
@@ -2196,17 +2054,17 @@ xfs_bmap_adjacent(
* If there's a previous (left) block, select a requested
* start block based on it.
*/
- if (ap->prevp->br_startoff != NULLFILEOFF &&
- !isnullstartblock(ap->prevp->br_startblock) &&
- (prevbno = ap->prevp->br_startblock +
- ap->prevp->br_blockcount) &&
- ISVALID(prevbno, ap->prevp->br_startblock)) {
+ if (ap->prev.br_startoff != NULLFILEOFF &&
+ !isnullstartblock(ap->prev.br_startblock) &&
+ (prevbno = ap->prev.br_startblock +
+ ap->prev.br_blockcount) &&
+ ISVALID(prevbno, ap->prev.br_startblock)) {
/*
* Calculate gap to end of previous block.
*/
- adjust = prevdiff = ap->off -
- (ap->prevp->br_startoff +
- ap->prevp->br_blockcount);
+ adjust = prevdiff = ap->offset -
+ (ap->prev.br_startoff +
+ ap->prev.br_blockcount);
/*
* Figure the startblock based on the previous block's
* end and the gap size.
@@ -2215,9 +2073,9 @@ xfs_bmap_adjacent(
* allocating, or using it gives us an invalid block
* number, then just use the end of the previous block.
*/
- if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+ if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
ISVALID(prevbno + prevdiff,
- ap->prevp->br_startblock))
+ ap->prev.br_startblock))
prevbno += adjust;
else
prevdiff += adjust;
@@ -2238,16 +2096,16 @@ xfs_bmap_adjacent(
* If there's a following (right) block, select a requested
* start block based on it.
*/
- if (!isnullstartblock(ap->gotp->br_startblock)) {
+ if (!isnullstartblock(ap->got.br_startblock)) {
/*
* Calculate gap to start of next block.
*/
- adjust = gotdiff = ap->gotp->br_startoff - ap->off;
+ adjust = gotdiff = ap->got.br_startoff - ap->offset;
/*
* Figure the startblock based on the next block's
* start and the gap size.
*/
- gotbno = ap->gotp->br_startblock;
+ gotbno = ap->got.br_startblock;
/*
* Heuristic!
* If the gap is large relative to the piece we're
@@ -2255,12 +2113,12 @@ xfs_bmap_adjacent(
* number, then just use the start of the next block
* offset by our length.
*/
- if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+ if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
ISVALID(gotbno - gotdiff, gotbno))
gotbno -= adjust;
- else if (ISVALID(gotbno - ap->alen, gotbno)) {
- gotbno -= ap->alen;
- gotdiff += adjust - ap->alen;
+ else if (ISVALID(gotbno - ap->length, gotbno)) {
+ gotbno -= ap->length;
+ gotdiff += adjust - ap->length;
} else
gotdiff += adjust;
/*
@@ -2278,14 +2136,14 @@ xfs_bmap_adjacent(
gotbno = NULLFSBLOCK;
/*
* If both valid, pick the better one, else the only good
- * one, else ap->rval is already set (to 0 or the inode block).
+ * one, else ap->blkno is already set (to 0 or the inode block).
*/
if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
- ap->rval = prevdiff <= gotdiff ? prevbno : gotbno;
+ ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
else if (prevbno != NULLFSBLOCK)
- ap->rval = prevbno;
+ ap->blkno = prevbno;
else if (gotbno != NULLFSBLOCK)
- ap->rval = gotbno;
+ ap->blkno = gotbno;
}
#undef ISVALID
}
@@ -2305,24 +2163,24 @@ xfs_bmap_rtalloc(
mp = ap->ip->i_mount;
align = xfs_get_extsz_hint(ap->ip);
prod = align / mp->m_sb.sb_rextsize;
- error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp,
+ error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 1, ap->eof, 0,
- ap->conv, &ap->off, &ap->alen);
+ ap->conv, &ap->offset, &ap->length);
if (error)
return error;
- ASSERT(ap->alen);
- ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0);
+ ASSERT(ap->length);
+ ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
/*
* If the offset & length are not perfectly aligned
* then kill prod, it will just get us in trouble.
*/
- if (do_mod(ap->off, align) || ap->alen % align)
+ if (do_mod(ap->offset, align) || ap->length % align)
prod = 1;
/*
* Set ralen to be the actual requested length in rtextents.
*/
- ralen = ap->alen / mp->m_sb.sb_rextsize;
+ ralen = ap->length / mp->m_sb.sb_rextsize;
/*
* If the old value was close enough to MAXEXTLEN that
* we rounded up to it, cut it back so it's valid again.
@@ -2337,21 +2195,21 @@ xfs_bmap_rtalloc(
* Lock out other modifications to the RT bitmap inode.
*/
xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
/*
* If it's an allocation to an empty file at offset 0,
* pick an extent that will space things out in the rt area.
*/
- if (ap->eof && ap->off == 0) {
+ if (ap->eof && ap->offset == 0) {
xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
if (error)
return error;
- ap->rval = rtx * mp->m_sb.sb_rextsize;
+ ap->blkno = rtx * mp->m_sb.sb_rextsize;
} else {
- ap->rval = 0;
+ ap->blkno = 0;
}
xfs_bmap_adjacent(ap);
@@ -2359,23 +2217,23 @@ xfs_bmap_rtalloc(
/*
* Realtime allocation, done through xfs_rtallocate_extent.
*/
- atype = ap->rval == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
- do_div(ap->rval, mp->m_sb.sb_rextsize);
- rtb = ap->rval;
- ap->alen = ralen;
- if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen,
+ atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
+ do_div(ap->blkno, mp->m_sb.sb_rextsize);
+ rtb = ap->blkno;
+ ap->length = ralen;
+ if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
&ralen, atype, ap->wasdel, prod, &rtb)))
return error;
if (rtb == NULLFSBLOCK && prod > 1 &&
- (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1,
- ap->alen, &ralen, atype,
+ (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
+ ap->length, &ralen, atype,
ap->wasdel, 1, &rtb)))
return error;
- ap->rval = rtb;
- if (ap->rval != NULLFSBLOCK) {
- ap->rval *= mp->m_sb.sb_rextsize;
+ ap->blkno = rtb;
+ if (ap->blkno != NULLFSBLOCK) {
+ ap->blkno *= mp->m_sb.sb_rextsize;
ralen *= mp->m_sb.sb_rextsize;
- ap->alen = ralen;
+ ap->length = ralen;
ap->ip->i_d.di_nblocks += ralen;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel)
@@ -2388,7 +2246,7 @@ xfs_bmap_rtalloc(
ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
} else {
- ap->alen = 0;
+ ap->length = 0;
}
return 0;
}
@@ -2503,7 +2361,7 @@ xfs_bmap_btalloc_nullfb(
* AG as the stream may have moved.
*/
if (xfs_inode_is_filestream(ap->ip))
- ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
+ ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
return 0;
}
@@ -2525,55 +2383,57 @@ xfs_bmap_btalloc(
int tryagain;
int error;
+ ASSERT(ap->length);
+
mp = ap->ip->i_mount;
align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
if (unlikely(align)) {
- error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp,
+ error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 0, ap->eof, 0, ap->conv,
- &ap->off, &ap->alen);
+ &ap->offset, &ap->length);
ASSERT(!error);
- ASSERT(ap->alen);
+ ASSERT(ap->length);
}
- nullfb = ap->firstblock == NULLFSBLOCK;
- fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
+ nullfb = *ap->firstblock == NULLFSBLOCK;
+ fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
if (nullfb) {
if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
ag = xfs_filestream_lookup_ag(ap->ip);
ag = (ag != NULLAGNUMBER) ? ag : 0;
- ap->rval = XFS_AGB_TO_FSB(mp, ag, 0);
+ ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
} else {
- ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
+ ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
}
} else
- ap->rval = ap->firstblock;
+ ap->blkno = *ap->firstblock;
xfs_bmap_adjacent(ap);
/*
- * If allowed, use ap->rval; otherwise must use firstblock since
+ * If allowed, use ap->blkno; otherwise must use firstblock since
* it's in the right allocation group.
*/
- if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno)
+ if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno)
;
else
- ap->rval = ap->firstblock;
+ ap->blkno = *ap->firstblock;
/*
* Normal allocation, done through xfs_alloc_vextent.
*/
tryagain = isaligned = 0;
args.tp = ap->tp;
args.mp = mp;
- args.fsbno = ap->rval;
+ args.fsbno = ap->blkno;
/* Trim the allocation back to the maximum an AG can fit. */
- args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
- args.firstblock = ap->firstblock;
+ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
+ args.firstblock = *ap->firstblock;
blen = 0;
if (nullfb) {
error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
if (error)
return error;
- } else if (ap->low) {
+ } else if (ap->flist->xbf_low) {
if (xfs_inode_is_filestream(ap->ip))
args.type = XFS_ALLOCTYPE_FIRST_AG;
else
@@ -2587,14 +2447,14 @@ xfs_bmap_btalloc(
/* apply extent size hints if obtained earlier */
if (unlikely(align)) {
args.prod = align;
- if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod)))
+ if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
args.mod = (xfs_extlen_t)(args.prod - args.mod);
} else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
args.prod = 1;
args.mod = 0;
} else {
args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
- if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod))))
+ if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
args.mod = (xfs_extlen_t)(args.prod - args.mod);
}
/*
@@ -2606,8 +2466,8 @@ xfs_bmap_btalloc(
* is >= the stripe unit and the allocation offset is
* at the end of file.
*/
- if (!ap->low && ap->aeof) {
- if (!ap->off) {
+ if (!ap->flist->xbf_low && ap->aeof) {
+ if (!ap->offset) {
args.alignment = mp->m_dalign;
atype = args.type;
isaligned = 1;
@@ -2660,7 +2520,7 @@ xfs_bmap_btalloc(
* turned on.
*/
args.type = atype;
- args.fsbno = ap->rval;
+ args.fsbno = ap->blkno;
args.alignment = mp->m_dalign;
args.minlen = nextminlen;
args.minalignslop = 0;
@@ -2674,7 +2534,7 @@ xfs_bmap_btalloc(
* try again.
*/
args.type = atype;
- args.fsbno = ap->rval;
+ args.fsbno = ap->blkno;
args.alignment = 0;
if ((error = xfs_alloc_vextent(&args)))
return error;
@@ -2683,7 +2543,7 @@ xfs_bmap_btalloc(
args.minlen > ap->minlen) {
args.minlen = ap->minlen;
args.type = XFS_ALLOCTYPE_START_BNO;
- args.fsbno = ap->rval;
+ args.fsbno = ap->blkno;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
@@ -2694,13 +2554,26 @@ xfs_bmap_btalloc(
args.minleft = 0;
if ((error = xfs_alloc_vextent(&args)))
return error;
- ap->low = 1;
+ ap->flist->xbf_low = 1;
}
if (args.fsbno != NULLFSBLOCK) {
- ap->firstblock = ap->rval = args.fsbno;
+ /*
+ * check the allocation happened at the same or higher AG than
+ * the first block that was allocated.
+ */
+ ASSERT(*ap->firstblock == NULLFSBLOCK ||
+ XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
+ XFS_FSB_TO_AGNO(mp, args.fsbno) ||
+ (ap->flist->xbf_low &&
+ XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
+ XFS_FSB_TO_AGNO(mp, args.fsbno)));
+
+ ap->blkno = args.fsbno;
+ if (*ap->firstblock == NULLFSBLOCK)
+ *ap->firstblock = args.fsbno;
ASSERT(nullfb || fb_agno == args.agno ||
- (ap->low && fb_agno < args.agno));
- ap->alen = args.len;
+ (ap->flist->xbf_low && fb_agno < args.agno));
+ ap->length = args.len;
ap->ip->i_d.di_nblocks += args.len;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel)
@@ -2714,8 +2587,8 @@ xfs_bmap_btalloc(
XFS_TRANS_DQ_BCOUNT,
(long) args.len);
} else {
- ap->rval = NULLFSBLOCK;
- ap->alen = 0;
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
}
return 0;
}
@@ -3589,7 +3462,7 @@ xfs_bmap_add_attrfork(
}
ASSERT(ip->i_d.di_anextents == 0);
- xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
switch (ip->i_d.di_format) {
@@ -3782,19 +3655,11 @@ xfs_bmap_compute_maxlevels(
* Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
* caller. Frees all the extents that need freeing, which must be done
* last due to locking considerations. We never free any extents in
- * the first transaction. This is to allow the caller to make the first
- * transaction a synchronous one so that the pointers to the data being
- * broken in this transaction will be permanent before the data is actually
- * freed. This is necessary to prevent blocks from being reallocated
- * and written to before the free and reallocation are actually permanent.
- * We do not just make the first transaction synchronous here, because
- * there are more efficient ways to gain the same protection in some cases
- * (see the file truncation code).
+ * the first transaction.
*
* Return 1 if the given transaction was committed and a new one
* started, and 0 otherwise in the committed parameter.
*/
-/*ARGSUSED*/
int /* error */
xfs_bmap_finish(
xfs_trans_t **tp, /* transaction pointer addr */
@@ -3994,42 +3859,122 @@ xfs_bmap_last_before(
return 0;
}
+STATIC int
+xfs_bmap_last_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *rec,
+ int *is_empty)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ int error;
+ int nextents;
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+ }
+
+ nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ if (nextents == 0) {
+ *is_empty = 1;
+ return 0;
+ }
+
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
+ *is_empty = 0;
+ return 0;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ *
+ * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be
+ * at, or past the EOF.
+ */
+STATIC int
+xfs_bmap_isaeof(
+ struct xfs_bmalloca *bma,
+ int whichfork)
+{
+ struct xfs_bmbt_irec rec;
+ int is_empty;
+ int error;
+
+ bma->aeof = 0;
+ error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
+ &is_empty);
+ if (error || is_empty)
+ return error;
+
+ /*
+ * Check if we are allocation or past the last extent, or at least into
+ * the last delayed allocated extent.
+ */
+ bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
+ (bma->offset >= rec.br_startoff &&
+ isnullstartblock(rec.br_startblock));
+ return 0;
+}
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary. All offsets are considered outside
+ * the end of file for an empty fork, so 1 is returned in *eof in that case.
+ */
+int
+xfs_bmap_eof(
+ struct xfs_inode *ip,
+ xfs_fileoff_t endoff,
+ int whichfork,
+ int *eof)
+{
+ struct xfs_bmbt_irec rec;
+ int error;
+
+ error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
+ if (error || *eof)
+ return error;
+
+ *eof = endoff >= rec.br_startoff + rec.br_blockcount;
+ return 0;
+}
+
/*
* Returns the file-relative block number of the first block past eof in
* the file. This is not based on i_size, it is based on the extent records.
* Returns 0 for local files, as they do not have extent records.
*/
-int /* error */
+int
xfs_bmap_last_offset(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_fileoff_t *last_block, /* last block */
- int whichfork) /* data or attr fork */
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t *last_block,
+ int whichfork)
{
- xfs_bmbt_rec_host_t *ep; /* pointer to last extent */
- int error; /* error return value */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t nextents; /* number of extent entries */
+ struct xfs_bmbt_irec rec;
+ int is_empty;
+ int error;
+
+ *last_block = 0;
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
+ return 0;
if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
return XFS_ERROR(EIO);
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- *last_block = 0;
- return 0;
- }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
+
+ error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
+ if (error || is_empty)
return error;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- if (!nextents) {
- *last_block = 0;
- return 0;
- }
- ep = xfs_iext_get_ext(ifp, nextents - 1);
- *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep);
+
+ *last_block = rec.br_startoff + rec.br_blockcount;
return 0;
}
@@ -4159,7 +4104,6 @@ xfs_bmap_read_extents(
xfs_extnum_t num_recs;
xfs_extnum_t start;
-
num_recs = xfs_btree_get_numrecs(block);
if (unlikely(i + num_recs > room)) {
ASSERT(i + num_recs <= room);
@@ -4282,9 +4226,8 @@ xfs_bmap_validate_ret(
ASSERT(i == 0 ||
mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
mval[i].br_startoff);
- if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY))
- ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
- mval[i].br_startblock != HOLESTARTBLOCK);
+ ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+ mval[i].br_startblock != HOLESTARTBLOCK);
ASSERT(mval[i].br_state == XFS_EXT_NORM ||
mval[i].br_state == XFS_EXT_UNWRITTEN);
}
@@ -4293,66 +4236,611 @@ xfs_bmap_validate_ret(
/*
- * Map file blocks to filesystem blocks.
- * File range is given by the bno/len pair.
- * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
- * into a hole or past eof.
- * Only allocates blocks from a single allocation group,
- * to avoid locking problems.
+ * Trim the returned map to the required bounds
+ */
+STATIC void
+xfs_bmapi_trim_map(
+ struct xfs_bmbt_irec *mval,
+ struct xfs_bmbt_irec *got,
+ xfs_fileoff_t *bno,
+ xfs_filblks_t len,
+ xfs_fileoff_t obno,
+ xfs_fileoff_t end,
+ int n,
+ int flags)
+{
+ if ((flags & XFS_BMAPI_ENTIRE) ||
+ got->br_startoff + got->br_blockcount <= obno) {
+ *mval = *got;
+ if (isnullstartblock(got->br_startblock))
+ mval->br_startblock = DELAYSTARTBLOCK;
+ return;
+ }
+
+ if (obno > *bno)
+ *bno = obno;
+ ASSERT((*bno >= obno) || (n == 0));
+ ASSERT(*bno < end);
+ mval->br_startoff = *bno;
+ if (isnullstartblock(got->br_startblock))
+ mval->br_startblock = DELAYSTARTBLOCK;
+ else
+ mval->br_startblock = got->br_startblock +
+ (*bno - got->br_startoff);
+ /*
+ * Return the minimum of what we got and what we asked for for
+ * the length. We can use the len variable here because it is
+ * modified below and we could have been there before coming
+ * here if the first part of the allocation didn't overlap what
+ * was asked for.
+ */
+ mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
+ got->br_blockcount - (*bno - got->br_startoff));
+ mval->br_state = got->br_state;
+ ASSERT(mval->br_blockcount <= len);
+ return;
+}
+
+/*
+ * Update and validate the extent map to return
+ */
+STATIC void
+xfs_bmapi_update_map(
+ struct xfs_bmbt_irec **map,
+ xfs_fileoff_t *bno,
+ xfs_filblks_t *len,
+ xfs_fileoff_t obno,
+ xfs_fileoff_t end,
+ int *n,
+ int flags)
+{
+ xfs_bmbt_irec_t *mval = *map;
+
+ ASSERT((flags & XFS_BMAPI_ENTIRE) ||
+ ((mval->br_startoff + mval->br_blockcount) <= end));
+ ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) ||
+ (mval->br_startoff < obno));
+
+ *bno = mval->br_startoff + mval->br_blockcount;
+ *len = end - *bno;
+ if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) {
+ /* update previous map with new information */
+ ASSERT(mval->br_startblock == mval[-1].br_startblock);
+ ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
+ ASSERT(mval->br_state == mval[-1].br_state);
+ mval[-1].br_blockcount = mval->br_blockcount;
+ mval[-1].br_state = mval->br_state;
+ } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
+ mval[-1].br_startblock != DELAYSTARTBLOCK &&
+ mval[-1].br_startblock != HOLESTARTBLOCK &&
+ mval->br_startblock == mval[-1].br_startblock +
+ mval[-1].br_blockcount &&
+ ((flags & XFS_BMAPI_IGSTATE) ||
+ mval[-1].br_state == mval->br_state)) {
+ ASSERT(mval->br_startoff ==
+ mval[-1].br_startoff + mval[-1].br_blockcount);
+ mval[-1].br_blockcount += mval->br_blockcount;
+ } else if (*n > 0 &&
+ mval->br_startblock == DELAYSTARTBLOCK &&
+ mval[-1].br_startblock == DELAYSTARTBLOCK &&
+ mval->br_startoff ==
+ mval[-1].br_startoff + mval[-1].br_blockcount) {
+ mval[-1].br_blockcount += mval->br_blockcount;
+ mval[-1].br_state = mval->br_state;
+ } else if (!((*n == 0) &&
+ ((mval->br_startoff + mval->br_blockcount) <=
+ obno))) {
+ mval++;
+ (*n)++;
+ }
+ *map = mval;
+}
+
+/*
+ * Map file blocks to filesystem blocks without allocation.
+ */
+int
+xfs_bmapi_read(
+ struct xfs_inode *ip,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ struct xfs_bmbt_irec *mval,
+ int *nmap,
+ int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp;
+ struct xfs_bmbt_irec got;
+ struct xfs_bmbt_irec prev;
+ xfs_fileoff_t obno;
+ xfs_fileoff_t end;
+ xfs_extnum_t lastx;
+ int error;
+ int eof;
+ int n = 0;
+ int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+
+ ASSERT(*nmap >= 1);
+ ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
+ XFS_BMAPI_IGSTATE)));
+
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ XFS_STATS_INC(xs_blk_mapr);
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(ifp->if_ext_max ==
+ XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, whichfork);
+ if (error)
+ return error;
+ }
+
+ xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
+ end = bno + len;
+ obno = bno;
+
+ while (bno < end && n < *nmap) {
+ /* Reading past eof, act as though there's a hole up to end. */
+ if (eof)
+ got.br_startoff = end;
+ if (got.br_startoff > bno) {
+ /* Reading in a hole. */
+ mval->br_startoff = bno;
+ mval->br_startblock = HOLESTARTBLOCK;
+ mval->br_blockcount =
+ XFS_FILBLKS_MIN(len, got.br_startoff - bno);
+ mval->br_state = XFS_EXT_NORM;
+ bno += mval->br_blockcount;
+ len -= mval->br_blockcount;
+ mval++;
+ n++;
+ continue;
+ }
+
+ /* set up the extent map to return. */
+ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+ xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+ /* If we're done, stop now. */
+ if (bno >= end || n >= *nmap)
+ break;
+
+ /* Else go on to the next record. */
+ if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+ else
+ eof = 1;
+ }
+ *nmap = n;
+ return 0;
+}
+
+STATIC int
+xfs_bmapi_reserve_delalloc(
+ struct xfs_inode *ip,
+ xfs_fileoff_t aoff,
+ xfs_filblks_t len,
+ struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *prev,
+ xfs_extnum_t *lastx,
+ int eof)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ xfs_extlen_t alen;
+ xfs_extlen_t indlen;
+ char rt = XFS_IS_REALTIME_INODE(ip);
+ xfs_extlen_t extsz;
+ int error;
+
+ alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
+ if (!eof)
+ alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+
+ /* Figure out the extent size, adjust alen */
+ extsz = xfs_get_extsz_hint(ip);
+ if (extsz) {
+ /*
+ * Make sure we don't exceed a single extent length when we
+ * align the extent by reducing length we are going to
+ * allocate by the maximum amount extent size aligment may
+ * require.
+ */
+ alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1));
+ error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
+ 1, 0, &aoff, &alen);
+ ASSERT(!error);
+ }
+
+ if (rt)
+ extsz = alen / mp->m_sb.sb_rextsize;
+
+ /*
+ * Make a transaction-less quota reservation for delayed allocation
+ * blocks. This number gets adjusted later. We return if we haven't
+ * allocated blocks already inside this loop.
+ */
+ error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
+ rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ return error;
+
+ /*
+ * Split changing sb for alen and indlen since they could be coming
+ * from different places.
+ */
+ indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
+ ASSERT(indlen > 0);
+
+ if (rt) {
+ error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
+ -((int64_t)extsz), 0);
+ } else {
+ error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+ -((int64_t)alen), 0);
+ }
+
+ if (error)
+ goto out_unreserve_quota;
+
+ error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+ -((int64_t)indlen), 0);
+ if (error)
+ goto out_unreserve_blocks;
+
+
+ ip->i_delayed_blks += alen;
+
+ got->br_startoff = aoff;
+ got->br_startblock = nullstartblock(indlen);
+ got->br_blockcount = alen;
+ got->br_state = XFS_EXT_NORM;
+ xfs_bmap_add_extent_hole_delay(ip, lastx, got);
+
+ /*
+ * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
+ * might have merged it into one of the neighbouring ones.
+ */
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
+
+ ASSERT(got->br_startoff <= aoff);
+ ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
+ ASSERT(isnullstartblock(got->br_startblock));
+ ASSERT(got->br_state == XFS_EXT_NORM);
+ return 0;
+
+out_unreserve_blocks:
+ if (rt)
+ xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0);
+ else
+ xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
+out_unreserve_quota:
+ if (XFS_IS_QUOTA_ON(mp))
+ xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ?
+ XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+ return error;
+}
+
+/*
+ * Map file blocks to filesystem blocks, adding delayed allocations as needed.
+ */
+int
+xfs_bmapi_delay(
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t bno, /* starting file offs. mapped */
+ xfs_filblks_t len, /* length to map in file */
+ struct xfs_bmbt_irec *mval, /* output: map values */
+ int *nmap, /* i/o: mval size/count */
+ int flags) /* XFS_BMAPI_... */
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_bmbt_irec got; /* current file extent record */
+ struct xfs_bmbt_irec prev; /* previous file extent record */
+ xfs_fileoff_t obno; /* old block number (offset) */
+ xfs_fileoff_t end; /* end of mapped file region */
+ xfs_extnum_t lastx; /* last useful extent number */
+ int eof; /* we've hit the end of extents */
+ int n = 0; /* current extent index */
+ int error = 0;
+
+ ASSERT(*nmap >= 1);
+ ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+ ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
+
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ XFS_STATS_INC(xs_blk_mapw);
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
+
+ xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
+ end = bno + len;
+ obno = bno;
+
+ while (bno < end && n < *nmap) {
+ if (eof || got.br_startoff > bno) {
+ error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
+ &prev, &lastx, eof);
+ if (error) {
+ if (n == 0) {
+ *nmap = 0;
+ return error;
+ }
+ break;
+ }
+ }
+
+ /* set up the extent map to return. */
+ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+ xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+ /* If we're done, stop now. */
+ if (bno >= end || n >= *nmap)
+ break;
+
+ /* Else go on to the next record. */
+ prev = got;
+ if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+ else
+ eof = 1;
+ }
+
+ *nmap = n;
+ return 0;
+}
+
+
+STATIC int
+xfs_bmapi_allocate(
+ struct xfs_bmalloca *bma,
+ int flags)
+{
+ struct xfs_mount *mp = bma->ip->i_mount;
+ int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ int tmp_logflags = 0;
+ int error;
+ int rt;
+
+ ASSERT(bma->length > 0);
+
+ rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip);
+
+ /*
+ * For the wasdelay case, we could also just allocate the stuff asked
+ * for in this bmap call but that wouldn't be as good.
+ */
+ if (bma->wasdel) {
+ bma->length = (xfs_extlen_t)bma->got.br_blockcount;
+ bma->offset = bma->got.br_startoff;
+ if (bma->idx != NULLEXTNUM && bma->idx) {
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
+ &bma->prev);
+ }
+ } else {
+ bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
+ if (!bma->eof)
+ bma->length = XFS_FILBLKS_MIN(bma->length,
+ bma->got.br_startoff - bma->offset);
+ }
+
+ /*
+ * Indicate if this is the first user data in the file, or just any
+ * user data.
+ */
+ if (!(flags & XFS_BMAPI_METADATA)) {
+ bma->userdata = (bma->offset == 0) ?
+ XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+ }
+
+ bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
+
+ /*
+ * Only want to do the alignment at the eof if it is userdata and
+ * allocation length is larger than a stripe unit.
+ */
+ if (mp->m_dalign && bma->length >= mp->m_dalign &&
+ !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
+ error = xfs_bmap_isaeof(bma, whichfork);
+ if (error)
+ return error;
+ }
+
+ error = xfs_bmap_alloc(bma);
+ if (error)
+ return error;
+
+ if (bma->flist->xbf_low)
+ bma->minleft = 0;
+ if (bma->cur)
+ bma->cur->bc_private.b.firstblock = *bma->firstblock;
+ if (bma->blkno == NULLFSBLOCK)
+ return 0;
+ if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
+ bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
+ bma->cur->bc_private.b.firstblock = *bma->firstblock;
+ bma->cur->bc_private.b.flist = bma->flist;
+ }
+ /*
+ * Bump the number of extents we've allocated
+ * in this call.
+ */
+ bma->nallocs++;
+
+ if (bma->cur)
+ bma->cur->bc_private.b.flags =
+ bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+
+ bma->got.br_startoff = bma->offset;
+ bma->got.br_startblock = bma->blkno;
+ bma->got.br_blockcount = bma->length;
+ bma->got.br_state = XFS_EXT_NORM;
+
+ /*
+ * A wasdelay extent has been initialized, so shouldn't be flagged
+ * as unwritten.
+ */
+ if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) &&
+ xfs_sb_version_hasextflgbit(&mp->m_sb))
+ bma->got.br_state = XFS_EXT_UNWRITTEN;
+
+ if (bma->wasdel)
+ error = xfs_bmap_add_extent_delay_real(bma);
+ else
+ error = xfs_bmap_add_extent_hole_real(bma, whichfork);
+
+ bma->logflags |= tmp_logflags;
+ if (error)
+ return error;
+
+ /*
+ * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
+ * or xfs_bmap_add_extent_hole_real might have merged it into one of
+ * the neighbouring ones.
+ */
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+
+ ASSERT(bma->got.br_startoff <= bma->offset);
+ ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
+ bma->offset + bma->length);
+ ASSERT(bma->got.br_state == XFS_EXT_NORM ||
+ bma->got.br_state == XFS_EXT_UNWRITTEN);
+ return 0;
+}
+
+STATIC int
+xfs_bmapi_convert_unwritten(
+ struct xfs_bmalloca *bma,
+ struct xfs_bmbt_irec *mval,
+ xfs_filblks_t len,
+ int flags)
+{
+ int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ int tmp_logflags = 0;
+ int error;
+
+ /* check if we need to do unwritten->real conversion */
+ if (mval->br_state == XFS_EXT_UNWRITTEN &&
+ (flags & XFS_BMAPI_PREALLOC))
+ return 0;
+
+ /* check if we need to do real->unwritten conversion */
+ if (mval->br_state == XFS_EXT_NORM &&
+ (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
+ (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
+ return 0;
+
+ /*
+ * Modify (by adding) the state flag, if writing.
+ */
+ ASSERT(mval->br_blockcount <= len);
+ if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
+ bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
+ bma->ip, whichfork);
+ bma->cur->bc_private.b.firstblock = *bma->firstblock;
+ bma->cur->bc_private.b.flist = bma->flist;
+ }
+ mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
+ ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+
+ error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
+ &bma->cur, mval, bma->firstblock, bma->flist,
+ &tmp_logflags);
+ bma->logflags |= tmp_logflags;
+ if (error)
+ return error;
+
+ /*
+ * Update our extent pointer, given that
+ * xfs_bmap_add_extent_unwritten_real might have merged it into one
+ * of the neighbouring ones.
+ */
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+
+ /*
+ * We may have combined previously unwritten space with written space,
+ * so generate another request.
+ */
+ if (mval->br_blockcount < len)
+ return EAGAIN;
+ return 0;
+}
+
+/*
+ * Map file blocks to filesystem blocks, and allocate blocks or convert the
+ * extent state if necessary. Details behaviour is controlled by the flags
+ * parameter. Only allocates blocks from a single allocation group, to avoid
+ * locking problems.
+ *
* The returned value in "firstblock" from the first call in a transaction
* must be remembered and presented to subsequent calls in "firstblock".
* An upper bound for the number of blocks to be allocated is supplied to
* the first call in "total"; if no allocation group has that many free
* blocks then the call will fail (return NULLFSBLOCK in "firstblock").
*/
-int /* error */
-xfs_bmapi(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_fileoff_t bno, /* starting file offs. mapped */
- xfs_filblks_t len, /* length to map in file */
- int flags, /* XFS_BMAPI_... */
- xfs_fsblock_t *firstblock, /* first allocated block
- controls a.g. for allocs */
- xfs_extlen_t total, /* total blocks needed */
- xfs_bmbt_irec_t *mval, /* output: map values */
- int *nmap, /* i/o: mval size/count */
- xfs_bmap_free_t *flist) /* i/o: list extents to free */
+int
+xfs_bmapi_write(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t bno, /* starting file offs. mapped */
+ xfs_filblks_t len, /* length to map in file */
+ int flags, /* XFS_BMAPI_... */
+ xfs_fsblock_t *firstblock, /* first allocated block
+ controls a.g. for allocs */
+ xfs_extlen_t total, /* total blocks needed */
+ struct xfs_bmbt_irec *mval, /* output: map values */
+ int *nmap, /* i/o: mval size/count */
+ struct xfs_bmap_free *flist) /* i/o: list extents to free */
{
- xfs_fsblock_t abno; /* allocated block number */
- xfs_extlen_t alen; /* allocated extent length */
- xfs_fileoff_t aoff; /* allocated file offset */
- xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */
- xfs_btree_cur_t *cur; /* bmap btree cursor */
- xfs_fileoff_t end; /* end of mapped file region */
- int eof; /* we've hit the end of extents */
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- int error; /* error return */
- xfs_bmbt_irec_t got; /* current file extent record */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extlen_t indlen; /* indirect blocks length */
- xfs_extnum_t lastx; /* last useful extent number */
- int logflags; /* flags for transaction logging */
- xfs_extlen_t minleft; /* min blocks left after allocation */
- xfs_extlen_t minlen; /* min allocation size */
- xfs_mount_t *mp; /* xfs mount structure */
- int n; /* current extent index */
- int nallocs; /* number of extents alloc'd */
- xfs_extnum_t nextents; /* number of extents in file */
- xfs_fileoff_t obno; /* old block number (offset) */
- xfs_bmbt_irec_t prev; /* previous file extent record */
- int tmp_logflags; /* temp flags holder */
- int whichfork; /* data or attr fork */
- char inhole; /* current location is hole in file */
- char wasdelay; /* old extent was delayed */
- char wr; /* this is a write request */
- char rt; /* this is a realtime file */
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp;
+ struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */
+ xfs_fileoff_t end; /* end of mapped file region */
+ int eof; /* after the end of extents */
+ int error; /* error return */
+ int n; /* current extent index */
+ xfs_fileoff_t obno; /* old block number (offset) */
+ int whichfork; /* data or attr fork */
+ char inhole; /* current location is hole in file */
+ char wasdelay; /* old extent was delayed */
+
#ifdef DEBUG
- xfs_fileoff_t orig_bno; /* original block number value */
- int orig_flags; /* original flags arg value */
- xfs_filblks_t orig_len; /* original value of len arg */
- xfs_bmbt_irec_t *orig_mval; /* original value of mval */
- int orig_nmap; /* original value of *nmap */
+ xfs_fileoff_t orig_bno; /* original block number value */
+ int orig_flags; /* original flags arg value */
+ xfs_filblks_t orig_len; /* original value of len arg */
+ struct xfs_bmbt_irec *orig_mval; /* original value of mval */
+ int orig_nmap; /* original value of *nmap */
orig_bno = bno;
orig_len = len;
@@ -4360,488 +4848,147 @@ xfs_bmapi(
orig_mval = mval;
orig_nmap = *nmap;
#endif
+
ASSERT(*nmap >= 1);
- ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE));
+ ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+ ASSERT(!(flags & XFS_BMAPI_IGSTATE));
+ ASSERT(tp != NULL);
+ ASSERT(len > 0);
+
whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
XFS_ATTR_FORK : XFS_DATA_FORK;
- mp = ip->i_mount;
+
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp);
+ XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
return XFS_ERROR(EFSCORRUPTED);
}
+
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
- rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(ifp->if_ext_max ==
XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
- if ((wr = (flags & XFS_BMAPI_WRITE)) != 0)
- XFS_STATS_INC(xs_blk_mapw);
- else
- XFS_STATS_INC(xs_blk_mapr);
- /*
- * IGSTATE flag is used to combine extents which
- * differ only due to the state of the extents.
- * This technique is used from xfs_getbmap()
- * when the caller does not wish to see the
- * separation (which is the default).
- *
- * This technique is also used when writing a
- * buffer which has been partially written,
- * (usually by being flushed during a chunkread),
- * to ensure one write takes place. This also
- * prevents a change in the xfs inode extents at
- * this time, intentionally. This change occurs
- * on completion of the write operation, in
- * xfs_strat_comp(), where the xfs_bmapi() call
- * is transactioned, and the extents combined.
- */
- if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */
- wr = 0; /* no allocations are allowed */
- ASSERT(wr || !(flags & XFS_BMAPI_DELAY));
- logflags = 0;
- nallocs = 0;
- cur = NULL;
+
+ XFS_STATS_INC(xs_blk_mapw);
+
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- ASSERT(wr && tp);
- if ((error = xfs_bmap_local_to_extents(tp, ip,
- firstblock, total, &logflags, whichfork)))
+ error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
+ &bma.logflags, whichfork);
+ if (error)
goto error0;
}
- if (wr && *firstblock == NULLFSBLOCK) {
+
+ if (*firstblock == NULLFSBLOCK) {
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
- minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+ bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
else
- minleft = 1;
- } else
- minleft = 0;
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- goto error0;
- ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
- &prev);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ bma.minleft = 1;
+ } else {
+ bma.minleft = 0;
+ }
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ goto error0;
+ }
+
+ xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
+ &bma.prev);
n = 0;
end = bno + len;
obno = bno;
- bma.ip = NULL;
+
+ bma.tp = tp;
+ bma.ip = ip;
+ bma.total = total;
+ bma.userdata = 0;
+ bma.flist = flist;
+ bma.firstblock = firstblock;
while (bno < end && n < *nmap) {
- /*
- * Reading past eof, act as though there's a hole
- * up to end.
- */
- if (eof && !wr)
- got.br_startoff = end;
- inhole = eof || got.br_startoff > bno;
- wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) &&
- isnullstartblock(got.br_startblock);
+ inhole = eof || bma.got.br_startoff > bno;
+ wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
+
/*
* First, deal with the hole before the allocated space
* that we found, if any.
*/
- if (wr && (inhole || wasdelay)) {
- /*
- * For the wasdelay case, we could also just
- * allocate the stuff asked for in this bmap call
- * but that wouldn't be as good.
- */
- if (wasdelay) {
- alen = (xfs_extlen_t)got.br_blockcount;
- aoff = got.br_startoff;
- if (lastx != NULLEXTNUM && lastx) {
- ep = xfs_iext_get_ext(ifp, lastx - 1);
- xfs_bmbt_get_all(ep, &prev);
- }
- } else {
- alen = (xfs_extlen_t)
- XFS_FILBLKS_MIN(len, MAXEXTLEN);
- if (!eof)
- alen = (xfs_extlen_t)
- XFS_FILBLKS_MIN(alen,
- got.br_startoff - bno);
- aoff = bno;
- }
- minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1;
- if (flags & XFS_BMAPI_DELAY) {
- xfs_extlen_t extsz;
-
- /* Figure out the extent size, adjust alen */
- extsz = xfs_get_extsz_hint(ip);
- if (extsz) {
- /*
- * make sure we don't exceed a single
- * extent length when we align the
- * extent by reducing length we are
- * going to allocate by the maximum
- * amount extent size aligment may
- * require.
- */
- alen = XFS_FILBLKS_MIN(len,
- MAXEXTLEN - (2 * extsz - 1));
- error = xfs_bmap_extsize_align(mp,
- &got, &prev, extsz,
- rt, eof,
- flags&XFS_BMAPI_DELAY,
- flags&XFS_BMAPI_CONVERT,
- &aoff, &alen);
- ASSERT(!error);
- }
-
- if (rt)
- extsz = alen / mp->m_sb.sb_rextsize;
-
- /*
- * Make a transaction-less quota reservation for
- * delayed allocation blocks. This number gets
- * adjusted later. We return if we haven't
- * allocated blocks already inside this loop.
- */
- error = xfs_trans_reserve_quota_nblks(
- NULL, ip, (long)alen, 0,
- rt ? XFS_QMOPT_RES_RTBLKS :
- XFS_QMOPT_RES_REGBLKS);
- if (error) {
- if (n == 0) {
- *nmap = 0;
- ASSERT(cur == NULL);
- return error;
- }
- break;
- }
-
- /*
- * Split changing sb for alen and indlen since
- * they could be coming from different places.
- */
- indlen = (xfs_extlen_t)
- xfs_bmap_worst_indlen(ip, alen);
- ASSERT(indlen > 0);
-
- if (rt) {
- error = xfs_mod_incore_sb(mp,
- XFS_SBS_FREXTENTS,
- -((int64_t)extsz), 0);
- } else {
- error = xfs_icsb_modify_counters(mp,
- XFS_SBS_FDBLOCKS,
- -((int64_t)alen), 0);
- }
- if (!error) {
- error = xfs_icsb_modify_counters(mp,
- XFS_SBS_FDBLOCKS,
- -((int64_t)indlen), 0);
- if (error && rt)
- xfs_mod_incore_sb(mp,
- XFS_SBS_FREXTENTS,
- (int64_t)extsz, 0);
- else if (error)
- xfs_icsb_modify_counters(mp,
- XFS_SBS_FDBLOCKS,
- (int64_t)alen, 0);
- }
+ if (inhole || wasdelay) {
+ bma.eof = eof;
+ bma.conv = !!(flags & XFS_BMAPI_CONVERT);
+ bma.wasdel = wasdelay;
+ bma.offset = bno;
- if (error) {
- if (XFS_IS_QUOTA_ON(mp))
- /* unreserve the blocks now */
- (void)
- xfs_trans_unreserve_quota_nblks(
- NULL, ip,
- (long)alen, 0, rt ?
- XFS_QMOPT_RES_RTBLKS :
- XFS_QMOPT_RES_REGBLKS);
- break;
- }
-
- ip->i_delayed_blks += alen;
- abno = nullstartblock(indlen);
- } else {
- /*
- * If first time, allocate and fill in
- * once-only bma fields.
- */
- if (bma.ip == NULL) {
- bma.tp = tp;
- bma.ip = ip;
- bma.prevp = &prev;
- bma.gotp = &got;
- bma.total = total;
- bma.userdata = 0;
- }
- /* Indicate if this is the first user data
- * in the file, or just any user data.
- */
- if (!(flags & XFS_BMAPI_METADATA)) {
- bma.userdata = (aoff == 0) ?
- XFS_ALLOC_INITIAL_USER_DATA :
- XFS_ALLOC_USERDATA;
- }
- /*
- * Fill in changeable bma fields.
- */
- bma.eof = eof;
- bma.firstblock = *firstblock;
- bma.alen = alen;
- bma.off = aoff;
- bma.conv = !!(flags & XFS_BMAPI_CONVERT);
- bma.wasdel = wasdelay;
- bma.minlen = minlen;
- bma.low = flist->xbf_low;
- bma.minleft = minleft;
- /*
- * Only want to do the alignment at the
- * eof if it is userdata and allocation length
- * is larger than a stripe unit.
- */
- if (mp->m_dalign && alen >= mp->m_dalign &&
- (!(flags & XFS_BMAPI_METADATA)) &&
- (whichfork == XFS_DATA_FORK)) {
- if ((error = xfs_bmap_isaeof(ip, aoff,
- whichfork, &bma.aeof)))
- goto error0;
- } else
- bma.aeof = 0;
- /*
- * Call allocator.
- */
- if ((error = xfs_bmap_alloc(&bma)))
- goto error0;
- /*
- * Copy out result fields.
- */
- abno = bma.rval;
- if ((flist->xbf_low = bma.low))
- minleft = 0;
- alen = bma.alen;
- aoff = bma.off;
- ASSERT(*firstblock == NULLFSBLOCK ||
- XFS_FSB_TO_AGNO(mp, *firstblock) ==
- XFS_FSB_TO_AGNO(mp, bma.firstblock) ||
- (flist->xbf_low &&
- XFS_FSB_TO_AGNO(mp, *firstblock) <
- XFS_FSB_TO_AGNO(mp, bma.firstblock)));
- *firstblock = bma.firstblock;
- if (cur)
- cur->bc_private.b.firstblock =
- *firstblock;
- if (abno == NULLFSBLOCK)
- break;
- if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
- cur = xfs_bmbt_init_cursor(mp, tp,
- ip, whichfork);
- cur->bc_private.b.firstblock =
- *firstblock;
- cur->bc_private.b.flist = flist;
- }
- /*
- * Bump the number of extents we've allocated
- * in this call.
- */
- nallocs++;
- }
- if (cur)
- cur->bc_private.b.flags =
- wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0;
- got.br_startoff = aoff;
- got.br_startblock = abno;
- got.br_blockcount = alen;
- got.br_state = XFS_EXT_NORM; /* assume normal */
- /*
- * Determine state of extent, and the filesystem.
- * A wasdelay extent has been initialized, so
- * shouldn't be flagged as unwritten.
- */
- if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) {
- if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
- got.br_state = XFS_EXT_UNWRITTEN;
- }
- error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &got,
- firstblock, flist, &tmp_logflags,
- whichfork);
- logflags |= tmp_logflags;
- if (error)
- goto error0;
- ep = xfs_iext_get_ext(ifp, lastx);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- xfs_bmbt_get_all(ep, &got);
- ASSERT(got.br_startoff <= aoff);
- ASSERT(got.br_startoff + got.br_blockcount >=
- aoff + alen);
-#ifdef DEBUG
- if (flags & XFS_BMAPI_DELAY) {
- ASSERT(isnullstartblock(got.br_startblock));
- ASSERT(startblockval(got.br_startblock) > 0);
- }
- ASSERT(got.br_state == XFS_EXT_NORM ||
- got.br_state == XFS_EXT_UNWRITTEN);
-#endif
- /*
- * Fall down into the found allocated space case.
- */
- } else if (inhole) {
- /*
- * Reading in a hole.
- */
- mval->br_startoff = bno;
- mval->br_startblock = HOLESTARTBLOCK;
- mval->br_blockcount =
- XFS_FILBLKS_MIN(len, got.br_startoff - bno);
- mval->br_state = XFS_EXT_NORM;
- bno += mval->br_blockcount;
- len -= mval->br_blockcount;
- mval++;
- n++;
- continue;
- }
- /*
- * Then deal with the allocated space we found.
- */
- ASSERT(ep != NULL);
- if (!(flags & XFS_BMAPI_ENTIRE) &&
- (got.br_startoff + got.br_blockcount > obno)) {
- if (obno > bno)
- bno = obno;
- ASSERT((bno >= obno) || (n == 0));
- ASSERT(bno < end);
- mval->br_startoff = bno;
- if (isnullstartblock(got.br_startblock)) {
- ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
- mval->br_startblock = DELAYSTARTBLOCK;
- } else
- mval->br_startblock =
- got.br_startblock +
- (bno - got.br_startoff);
/*
- * Return the minimum of what we got and what we
- * asked for for the length. We can use the len
- * variable here because it is modified below
- * and we could have been there before coming
- * here if the first part of the allocation
- * didn't overlap what was asked for.
+ * There's a 32/64 bit type mismatch between the
+ * allocation length request (which can be 64 bits in
+ * length) and the bma length request, which is
+ * xfs_extlen_t and therefore 32 bits. Hence we have to
+ * check for 32-bit overflows and handle them here.
*/
- mval->br_blockcount =
- XFS_FILBLKS_MIN(end - bno, got.br_blockcount -
- (bno - got.br_startoff));
- mval->br_state = got.br_state;
- ASSERT(mval->br_blockcount <= len);
- } else {
- *mval = got;
- if (isnullstartblock(mval->br_startblock)) {
- ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
- mval->br_startblock = DELAYSTARTBLOCK;
- }
- }
+ if (len > (xfs_filblks_t)MAXEXTLEN)
+ bma.length = MAXEXTLEN;
+ else
+ bma.length = len;
- /*
- * Check if writing previously allocated but
- * unwritten extents.
- */
- if (wr &&
- ((mval->br_state == XFS_EXT_UNWRITTEN &&
- ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) ||
- (mval->br_state == XFS_EXT_NORM &&
- ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) ==
- (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) {
- /*
- * Modify (by adding) the state flag, if writing.
- */
- ASSERT(mval->br_blockcount <= len);
- if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
- cur = xfs_bmbt_init_cursor(mp,
- tp, ip, whichfork);
- cur->bc_private.b.firstblock =
- *firstblock;
- cur->bc_private.b.flist = flist;
- }
- mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
- ? XFS_EXT_NORM
- : XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, mval,
- firstblock, flist, &tmp_logflags,
- whichfork);
- logflags |= tmp_logflags;
+ ASSERT(len > 0);
+ ASSERT(bma.length > 0);
+ error = xfs_bmapi_allocate(&bma, flags);
if (error)
goto error0;
- ep = xfs_iext_get_ext(ifp, lastx);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- xfs_bmbt_get_all(ep, &got);
- /*
- * We may have combined previously unwritten
- * space with written space, so generate
- * another request.
- */
- if (mval->br_blockcount < len)
- continue;
+ if (bma.blkno == NULLFSBLOCK)
+ break;
}
- ASSERT((flags & XFS_BMAPI_ENTIRE) ||
- ((mval->br_startoff + mval->br_blockcount) <= end));
- ASSERT((flags & XFS_BMAPI_ENTIRE) ||
- (mval->br_blockcount <= len) ||
- (mval->br_startoff < obno));
- bno = mval->br_startoff + mval->br_blockcount;
- len = end - bno;
- if (n > 0 && mval->br_startoff == mval[-1].br_startoff) {
- ASSERT(mval->br_startblock == mval[-1].br_startblock);
- ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
- ASSERT(mval->br_state == mval[-1].br_state);
- mval[-1].br_blockcount = mval->br_blockcount;
- mval[-1].br_state = mval->br_state;
- } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
- mval[-1].br_startblock != DELAYSTARTBLOCK &&
- mval[-1].br_startblock != HOLESTARTBLOCK &&
- mval->br_startblock ==
- mval[-1].br_startblock + mval[-1].br_blockcount &&
- ((flags & XFS_BMAPI_IGSTATE) ||
- mval[-1].br_state == mval->br_state)) {
- ASSERT(mval->br_startoff ==
- mval[-1].br_startoff + mval[-1].br_blockcount);
- mval[-1].br_blockcount += mval->br_blockcount;
- } else if (n > 0 &&
- mval->br_startblock == DELAYSTARTBLOCK &&
- mval[-1].br_startblock == DELAYSTARTBLOCK &&
- mval->br_startoff ==
- mval[-1].br_startoff + mval[-1].br_blockcount) {
- mval[-1].br_blockcount += mval->br_blockcount;
- mval[-1].br_state = mval->br_state;
- } else if (!((n == 0) &&
- ((mval->br_startoff + mval->br_blockcount) <=
- obno))) {
- mval++;
- n++;
- }
+ /* Deal with the allocated space we found. */
+ xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
+ end, n, flags);
+
+ /* Execute unwritten extent conversion if necessary */
+ error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
+ if (error == EAGAIN)
+ continue;
+ if (error)
+ goto error0;
+
+ /* update the extent map to return */
+ xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
/*
* If we're done, stop now. Stop when we've allocated
* XFS_BMAP_MAX_NMAP extents no matter what. Otherwise
* the transaction may get too big.
*/
- if (bno >= end || n >= *nmap || nallocs >= *nmap)
+ if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
break;
- /*
- * Else go on to the next record.
- */
- prev = got;
- if (++lastx < nextents) {
- ep = xfs_iext_get_ext(ifp, lastx);
- xfs_bmbt_get_all(ep, &got);
- } else {
+
+ /* Else go on to the next record. */
+ bma.prev = bma.got;
+ if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
+ &bma.got);
+ } else
eof = 1;
- }
}
*nmap = n;
+
/*
* Transform from btree to extents, give it cur.
*/
- if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
- ASSERT(wr && cur);
- error = xfs_bmap_btree_to_extents(tp, ip, cur,
+ int tmp_logflags = 0;
+
+ ASSERT(bma.cur);
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
&tmp_logflags, whichfork);
- logflags |= tmp_logflags;
+ bma.logflags |= tmp_logflags;
if (error)
goto error0;
}
@@ -4855,34 +5002,33 @@ error0:
* Log everything. Do this after conversion, there's no point in
* logging the extent records if we've converted to btree format.
*/
- if ((logflags & xfs_ilog_fext(whichfork)) &&
+ if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- logflags &= ~xfs_ilog_fext(whichfork);
- else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
+ bma.logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
- logflags &= ~xfs_ilog_fbroot(whichfork);
+ bma.logflags &= ~xfs_ilog_fbroot(whichfork);
/*
* Log whatever the flags say, even if error. Otherwise we might miss
* detecting a case where the data is changed, there's an error,
* and it's not logged so we don't shutdown when we should.
*/
- if (logflags) {
- ASSERT(tp && wr);
- xfs_trans_log_inode(tp, ip, logflags);
- }
- if (cur) {
+ if (bma.logflags)
+ xfs_trans_log_inode(tp, ip, bma.logflags);
+
+ if (bma.cur) {
if (!error) {
ASSERT(*firstblock == NULLFSBLOCK ||
XFS_FSB_TO_AGNO(mp, *firstblock) ==
XFS_FSB_TO_AGNO(mp,
- cur->bc_private.b.firstblock) ||
+ bma.cur->bc_private.b.firstblock) ||
(flist->xbf_low &&
XFS_FSB_TO_AGNO(mp, *firstblock) <
XFS_FSB_TO_AGNO(mp,
- cur->bc_private.b.firstblock)));
- *firstblock = cur->bc_private.b.firstblock;
+ bma.cur->bc_private.b.firstblock)));
+ *firstblock = bma.cur->bc_private.b.firstblock;
}
- xfs_btree_del_cursor(cur,
+ xfs_btree_del_cursor(bma.cur,
error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
}
if (!error)
@@ -4892,58 +5038,6 @@ error0:
}
/*
- * Map file blocks to filesystem blocks, simple version.
- * One block (extent) only, read-only.
- * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
- * For the other flag values, the effect is as if XFS_BMAPI_METADATA
- * was set and all the others were clear.
- */
-int /* error */
-xfs_bmapi_single(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- int whichfork, /* data or attr fork */
- xfs_fsblock_t *fsb, /* output: mapped block */
- xfs_fileoff_t bno) /* starting file offs. mapped */
-{
- int eof; /* we've hit the end of extents */
- int error; /* error return */
- xfs_bmbt_irec_t got; /* current file extent record */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t lastx; /* last useful extent number */
- xfs_bmbt_irec_t prev; /* previous file extent record */
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (unlikely(
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) {
- XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
- }
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return XFS_ERROR(EIO);
- XFS_STATS_INC(xs_blk_mapr);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- return error;
- (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
- &prev);
- /*
- * Reading past eof, act as though there's a hole
- * up to end.
- */
- if (eof || got.br_startoff > bno) {
- *fsb = NULLFSBLOCK;
- return 0;
- }
- ASSERT(!isnullstartblock(got.br_startblock));
- ASSERT(bno < got.br_startoff + got.br_blockcount);
- *fsb = got.br_startblock + (bno - got.br_startoff);
- return 0;
-}
-
-/*
* Unmap (remove) blocks from a file.
* If nexts is nonzero then the number of extents to remove is limited to
* that value. If not all extents in the block range can be removed then
@@ -5114,9 +5208,9 @@ xfs_bunmapi(
del.br_blockcount = mod;
}
del.br_state = XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &del,
- firstblock, flist, &logflags,
- XFS_DATA_FORK);
+ error = xfs_bmap_add_extent_unwritten_real(tp, ip,
+ &lastx, &cur, &del, firstblock, flist,
+ &logflags);
if (error)
goto error0;
goto nodelete;
@@ -5172,18 +5266,18 @@ xfs_bunmapi(
}
prev.br_state = XFS_EXT_UNWRITTEN;
lastx--;
- error = xfs_bmap_add_extent(tp, ip, &lastx,
- &cur, &prev, firstblock, flist,
- &logflags, XFS_DATA_FORK);
+ error = xfs_bmap_add_extent_unwritten_real(tp,
+ ip, &lastx, &cur, &prev,
+ firstblock, flist, &logflags);
if (error)
goto error0;
goto nodelete;
} else {
ASSERT(del.br_state == XFS_EXT_NORM);
del.br_state = XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(tp, ip, &lastx,
- &cur, &del, firstblock, flist,
- &logflags, XFS_DATA_FORK);
+ error = xfs_bmap_add_extent_unwritten_real(tp,
+ ip, &lastx, &cur, &del,
+ firstblock, flist, &logflags);
if (error)
goto error0;
goto nodelete;
@@ -5505,10 +5599,9 @@ xfs_getbmap(
do {
nmap = (nexleft > subnex) ? subnex : nexleft;
- error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
- XFS_BB_TO_FSB(mp, bmv->bmv_length),
- bmapi_flags, NULL, 0, map, &nmap,
- NULL);
+ error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
+ XFS_BB_TO_FSB(mp, bmv->bmv_length),
+ map, &nmap, bmapi_flags);
if (error)
goto out_free_map;
ASSERT(nmap <= subnex);
@@ -5582,89 +5675,6 @@ xfs_getbmap(
return error;
}
-/*
- * Check the last inode extent to determine whether this allocation will result
- * in blocks being allocated at the end of the file. When we allocate new data
- * blocks at the end of the file which do not start at the previous data block,
- * we will try to align the new blocks at stripe unit boundaries.
- */
-STATIC int /* error */
-xfs_bmap_isaeof(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t off, /* file offset in fsblocks */
- int whichfork, /* data or attribute fork */
- char *aeof) /* return value */
-{
- int error; /* error return value */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */
- xfs_extnum_t nextents; /* number of file extents */
- xfs_bmbt_irec_t s; /* expanded extent record */
-
- ASSERT(whichfork == XFS_DATA_FORK);
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(NULL, ip, whichfork)))
- return error;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- if (nextents == 0) {
- *aeof = 1;
- return 0;
- }
- /*
- * Go to the last extent
- */
- lastrec = xfs_iext_get_ext(ifp, nextents - 1);
- xfs_bmbt_get_all(lastrec, &s);
- /*
- * Check we are allocating in the last extent (for delayed allocations)
- * or past the last extent for non-delayed allocations.
- */
- *aeof = (off >= s.br_startoff &&
- off < s.br_startoff + s.br_blockcount &&
- isnullstartblock(s.br_startblock)) ||
- off >= s.br_startoff + s.br_blockcount;
- return 0;
-}
-
-/*
- * Check if the endoff is outside the last extent. If so the caller will grow
- * the allocation to a stripe unit boundary.
- */
-int /* error */
-xfs_bmap_eof(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t endoff, /* file offset in fsblocks */
- int whichfork, /* data or attribute fork */
- int *eof) /* result value */
-{
- xfs_fsblock_t blockcount; /* extent block count */
- int error; /* error return value */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */
- xfs_extnum_t nextents; /* number of file extents */
- xfs_fileoff_t startoff; /* extent starting file offset */
-
- ASSERT(whichfork == XFS_DATA_FORK);
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(NULL, ip, whichfork)))
- return error;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- if (nextents == 0) {
- *eof = 1;
- return 0;
- }
- /*
- * Go to the last extent
- */
- lastrec = xfs_iext_get_ext(ifp, nextents - 1);
- startoff = xfs_bmbt_get_startoff(lastrec);
- blockcount = xfs_bmbt_get_blockcount(lastrec);
- *eof = endoff >= startoff + blockcount;
- return 0;
-}
-
#ifdef DEBUG
STATIC struct xfs_buf *
xfs_bmap_get_bp(
@@ -6099,9 +6109,8 @@ xfs_bmap_punch_delalloc_range(
* trying to remove a real extent (which requires a
* transaction) or a hole, which is probably a bad idea...
*/
- error = xfs_bmapi(NULL, ip, start_fsb, 1,
- XFS_BMAPI_ENTIRE, NULL, 0, &imap,
- &nimaps, NULL);
+ error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
+ XFS_BMAPI_ENTIRE);
if (error) {
/* something screwed, just bail */
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index c62234bde05..89ee672d378 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -62,27 +62,23 @@ typedef struct xfs_bmap_free
#define XFS_BMAP_MAX_NMAP 4
/*
- * Flags for xfs_bmapi
+ * Flags for xfs_bmapi_*
*/
-#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */
-#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */
-#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
-#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
-#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */
-#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
-#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
+#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */
+#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */
+#define XFS_BMAPI_IGSTATE 0x010 /* Ignore state - */
/* combine contig. space */
-#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */
+#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */
/*
* unwritten extent conversion - this needs write cache flushing and no additional
* allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
* from written to unwritten, otherwise convert from unwritten to written.
*/
-#define XFS_BMAPI_CONVERT 0x200
+#define XFS_BMAPI_CONVERT 0x040
#define XFS_BMAPI_FLAGS \
- { XFS_BMAPI_WRITE, "WRITE" }, \
- { XFS_BMAPI_DELAY, "DELAY" }, \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
{ XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
@@ -113,21 +109,28 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
* Argument structure for xfs_bmap_alloc.
*/
typedef struct xfs_bmalloca {
- xfs_fsblock_t firstblock; /* i/o first block allocated */
- xfs_fsblock_t rval; /* starting block of new extent */
- xfs_fileoff_t off; /* offset in file filling in */
+ xfs_fsblock_t *firstblock; /* i/o first block allocated */
+ struct xfs_bmap_free *flist; /* bmap freelist */
struct xfs_trans *tp; /* transaction pointer */
struct xfs_inode *ip; /* incore inode pointer */
- struct xfs_bmbt_irec *prevp; /* extent before the new one */
- struct xfs_bmbt_irec *gotp; /* extent after, or delayed */
- xfs_extlen_t alen; /* i/o length asked/allocated */
+ struct xfs_bmbt_irec prev; /* extent before the new one */
+ struct xfs_bmbt_irec got; /* extent after, or delayed */
+
+ xfs_fileoff_t offset; /* offset in file filling in */
+ xfs_extlen_t length; /* i/o length asked/allocated */
+ xfs_fsblock_t blkno; /* starting block of new extent */
+
+ struct xfs_btree_cur *cur; /* btree cursor */
+ xfs_extnum_t idx; /* current extent index */
+ int nallocs;/* number of extents alloc'd */
+ int logflags;/* flags for transaction logging */
+
xfs_extlen_t total; /* total blocks needed for xaction */
xfs_extlen_t minlen; /* minimum allocation size (blocks) */
xfs_extlen_t minleft; /* amount must be left after alloc */
char eof; /* set if allocating past last extent */
char wasdel; /* replacing a delayed allocation */
char userdata;/* set if is user data */
- char low; /* low on space, using seq'l ags */
char aeof; /* allocated space at eof */
char conv; /* overwriting unwritten extents */
} xfs_bmalloca_t;
@@ -152,251 +155,62 @@ typedef struct xfs_bmalloca {
{ BMAP_RIGHT_FILLING, "RF" }, \
{ BMAP_ATTRFORK, "ATTR" }
-/*
- * Add bmap trace insert entries for all the contents of the extent list.
- *
- * Quite excessive tracing. Only do this for debug builds.
- */
#if defined(__KERNEL) && defined(DEBUG)
-void
-xfs_bmap_trace_exlist(
- struct xfs_inode *ip, /* incore inode pointer */
- xfs_extnum_t cnt, /* count of entries in list */
- int whichfork,
- unsigned long caller_ip); /* data or attr fork */
+void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
+ int whichfork, unsigned long caller_ip);
#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
#else
#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
#endif
-/*
- * Convert inode from non-attributed to attributed.
- * Must not be in a transaction, ip must not be locked.
- */
-int /* error code */
-xfs_bmap_add_attrfork(
- struct xfs_inode *ip, /* incore inode pointer */
- int size, /* space needed for new attribute */
- int rsvd); /* flag for reserved block allocation */
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-void
-xfs_bmap_add_free(
- xfs_fsblock_t bno, /* fs block number of extent */
- xfs_filblks_t len, /* length of extent */
- xfs_bmap_free_t *flist, /* list of extents */
- struct xfs_mount *mp); /* mount point structure */
-
-/*
- * Routine to clean up the free list data structure when
- * an error occurs during a transaction.
- */
-void
-xfs_bmap_cancel(
- xfs_bmap_free_t *flist); /* free list to clean up */
-
-/*
- * Compute and fill in the value of the maximum depth of a bmap btree
- * in this filesystem. Done once, during mount.
- */
-void
-xfs_bmap_compute_maxlevels(
- struct xfs_mount *mp, /* file system mount structure */
- int whichfork); /* data or attr fork */
-
-/*
- * Returns the file-relative block number of the first unused block in the file.
- * This is the lowest-address hole if the file has holes, else the first block
- * past the end of file.
- */
-int /* error */
-xfs_bmap_first_unused(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- xfs_extlen_t len, /* size of hole to find */
- xfs_fileoff_t *unused, /* unused block num */
- int whichfork); /* data or attr fork */
-
-/*
- * Returns the file-relative block number of the last block + 1 before
- * last_block (input value) in the file.
- * This is not based on i_size, it is based on the extent list.
- * Returns 0 for local files, as they do not have an extent list.
- */
-int /* error */
-xfs_bmap_last_before(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- xfs_fileoff_t *last_block, /* last block */
- int whichfork); /* data or attr fork */
-
-/*
- * Returns the file-relative block number of the first block past eof in
- * the file. This is not based on i_size, it is based on the extent list.
- * Returns 0 for local files, as they do not have an extent list.
- */
-int /* error */
-xfs_bmap_last_offset(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- xfs_fileoff_t *unused, /* last block num */
- int whichfork); /* data or attr fork */
-
-/*
- * Returns whether the selected fork of the inode has exactly one
- * block or not. For the data fork we check this matches di_size,
- * implying the file's range is 0..bsize-1.
- */
-int
-xfs_bmap_one_block(
- struct xfs_inode *ip, /* incore inode */
- int whichfork); /* data or attr fork */
-
-/*
- * Read in the extents to iu_extents.
- * All inode fields are set up by caller, we just traverse the btree
- * and copy the records in.
- */
-int /* error */
-xfs_bmap_read_extents(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- int whichfork); /* data or attr fork */
-
-/*
- * Map file blocks to filesystem blocks.
- * File range is given by the bno/len pair.
- * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
- * into a hole or past eof.
- * Only allocates blocks from a single allocation group,
- * to avoid locking problems.
- * The returned value in "firstblock" from the first call in a transaction
- * must be remembered and presented to subsequent calls in "firstblock".
- * An upper bound for the number of blocks to be allocated is supplied to
- * the first call in "total"; if no allocation group has that many free
- * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
- */
-int /* error */
-xfs_bmapi(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- xfs_fileoff_t bno, /* starting file offs. mapped */
- xfs_filblks_t len, /* length to map in file */
- int flags, /* XFS_BMAPI_... */
- xfs_fsblock_t *firstblock, /* first allocated block
- controls a.g. for allocs */
- xfs_extlen_t total, /* total blocks needed */
- struct xfs_bmbt_irec *mval, /* output: map values */
- int *nmap, /* i/o: mval size/count */
- xfs_bmap_free_t *flist); /* i/o: list extents to free */
-
-/*
- * Map file blocks to filesystem blocks, simple version.
- * One block only, read-only.
- * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
- * For the other flag values, the effect is as if XFS_BMAPI_METADATA
- * was set and all the others were clear.
- */
-int /* error */
-xfs_bmapi_single(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- int whichfork, /* data or attr fork */
- xfs_fsblock_t *fsb, /* output: mapped block */
- xfs_fileoff_t bno); /* starting file offs. mapped */
-
-/*
- * Unmap (remove) blocks from a file.
- * If nexts is nonzero then the number of extents to remove is limited to
- * that value. If not all extents in the block range can be removed then
- * *done is set.
- */
-int /* error */
-xfs_bunmapi(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- xfs_fileoff_t bno, /* starting offset to unmap */
- xfs_filblks_t len, /* length to unmap in file */
- int flags, /* XFS_BMAPI_... */
- xfs_extnum_t nexts, /* number of extents max */
- xfs_fsblock_t *firstblock, /* first allocated block
- controls a.g. for allocs */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
- int *done); /* set if not done yet */
-
-/*
- * Check an extent list, which has just been read, for
- * any bit in the extent flag field.
- */
-int
-xfs_check_nostate_extents(
- struct xfs_ifork *ifp,
- xfs_extnum_t idx,
- xfs_extnum_t num);
-
-uint
-xfs_default_attroffset(
- struct xfs_inode *ip);
+int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
+ struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void xfs_bmap_cancel(struct xfs_bmap_free *flist);
+void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
+int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
+int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t *last_block, int whichfork);
+int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t *unused, int whichfork);
+int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
+int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork);
+int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
+ xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+ int *nmap, int flags);
+int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
+ xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+ int *nmap, int flags);
+int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_fsblock_t *firstblock, xfs_extlen_t total,
+ struct xfs_bmbt_irec *mval, int *nmap,
+ struct xfs_bmap_free *flist);
+int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
+ struct xfs_bmap_free *flist, int *done);
+int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
+ xfs_extnum_t num);
+uint xfs_default_attroffset(struct xfs_inode *ip);
#ifdef __KERNEL__
-
-/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller. Frees all the extents that need freeing, which must be done
- * last due to locking considerations.
- *
- * Return 1 if the given transaction was committed and a new one allocated,
- * and 0 otherwise.
- */
-int /* error */
-xfs_bmap_finish(
- struct xfs_trans **tp, /* transaction pointer addr */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
- int *committed); /* xact committed or not */
-
/* bmap to userspace formatter - copy to user & advance pointer */
typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
-/*
- * Get inode's extents as described in bmv, and format for output.
- */
-int /* error code */
-xfs_getbmap(
- xfs_inode_t *ip,
- struct getbmapx *bmv, /* user bmap structure */
- xfs_bmap_format_t formatter, /* format to user */
- void *arg); /* formatter arg */
-
-/*
- * Check if the endoff is outside the last extent. If so the caller will grow
- * the allocation to a stripe unit boundary
- */
-int
-xfs_bmap_eof(
- struct xfs_inode *ip,
- xfs_fileoff_t endoff,
- int whichfork,
- int *eof);
-
-/*
- * Count fsblocks of the given fork.
- */
-int
-xfs_bmap_count_blocks(
- xfs_trans_t *tp,
- struct xfs_inode *ip,
- int whichfork,
- int *count);
-
-int
-xfs_bmap_punch_delalloc_range(
- struct xfs_inode *ip,
- xfs_fileoff_t start_fsb,
- xfs_fileoff_t length);
+int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
+ int *committed);
+int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
+ xfs_bmap_format_t formatter, void *arg);
+int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
+ int whichfork, int *eof);
+int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, int *count);
+int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+ xfs_fileoff_t start_fsb, xfs_fileoff_t length);
#endif /* __KERNEL__ */
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 2b9fd385e27..1f19f03af9d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -631,7 +631,7 @@ xfs_btree_read_bufl(
}
ASSERT(!xfs_buf_geterror(bp));
if (bp)
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
+ xfs_buf_set_ref(bp, refval);
*bpp = bp;
return 0;
}
@@ -939,13 +939,13 @@ xfs_btree_set_refs(
switch (cur->bc_btnum) {
case XFS_BTNUM_BNO:
case XFS_BTNUM_CNT:
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+ xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
break;
case XFS_BTNUM_INO:
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+ xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
break;
case XFS_BTNUM_BMAP:
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+ xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
break;
default:
ASSERT(0);
@@ -970,7 +970,8 @@ xfs_btree_get_buf_block(
*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
mp->m_bsize, flags);
- ASSERT(!xfs_buf_geterror(*bpp));
+ if (!*bpp)
+ return ENOMEM;
*block = XFS_BUF_TO_BLOCK(*bpp);
return 0;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c57836dc778..cf0ac056815 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -43,7 +43,6 @@
static kmem_zone_t *xfs_buf_zone;
STATIC int xfsbufd(void *);
-STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
static struct workqueue_struct *xfslogd_workqueue;
struct workqueue_struct *xfsdatad_workqueue;
@@ -66,10 +65,6 @@ struct workqueue_struct *xfsconvertd_workqueue;
#define xb_to_km(flags) \
(((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
-#define xfs_buf_allocate(flags) \
- kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
-#define xfs_buf_deallocate(bp) \
- kmem_zone_free(xfs_buf_zone, (bp));
static inline int
xfs_buf_is_vmapped(
@@ -152,6 +147,7 @@ xfs_buf_stale(
struct xfs_buf *bp)
{
bp->b_flags |= XBF_STALE;
+ xfs_buf_delwri_dequeue(bp);
atomic_set(&(bp)->b_lru_ref, 0);
if (!list_empty(&bp->b_lru)) {
struct xfs_buftarg *btp = bp->b_target;
@@ -167,14 +163,19 @@ xfs_buf_stale(
ASSERT(atomic_read(&bp->b_hold) >= 1);
}
-STATIC void
-_xfs_buf_initialize(
- xfs_buf_t *bp,
- xfs_buftarg_t *target,
+struct xfs_buf *
+xfs_buf_alloc(
+ struct xfs_buftarg *target,
xfs_off_t range_base,
size_t range_length,
xfs_buf_flags_t flags)
{
+ struct xfs_buf *bp;
+
+ bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags));
+ if (unlikely(!bp))
+ return NULL;
+
/*
* We don't want certain flags to appear in b_flags.
*/
@@ -203,8 +204,9 @@ _xfs_buf_initialize(
init_waitqueue_head(&bp->b_waiters);
XFS_STATS_INC(xb_create);
-
trace_xfs_buf_init(bp, _RET_IP_);
+
+ return bp;
}
/*
@@ -277,7 +279,7 @@ xfs_buf_free(
} else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
_xfs_buf_free_pages(bp);
- xfs_buf_deallocate(bp);
+ kmem_zone_free(xfs_buf_zone, bp);
}
/*
@@ -416,10 +418,7 @@ _xfs_buf_map_pages(
/*
* Look up, and creates if absent, a lockable buffer for
* a given range of an inode. The buffer is returned
- * locked. If other overlapping buffers exist, they are
- * released before the new buffer is created and locked,
- * which may imply that this call will block until those buffers
- * are unlocked. No I/O is implied by this call.
+ * locked. No I/O is implied by this call.
*/
xfs_buf_t *
_xfs_buf_find(
@@ -481,8 +480,6 @@ _xfs_buf_find(
/* No match found */
if (new_bp) {
- _xfs_buf_initialize(new_bp, btp, range_base,
- range_length, flags);
rb_link_node(&new_bp->b_rbnode, parent, rbp);
rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
/* the buffer keeps the perag reference until it is freed */
@@ -525,35 +522,51 @@ found:
}
/*
- * Assembles a buffer covering the specified range.
- * Storage in memory for all portions of the buffer will be allocated,
- * although backing storage may not be.
+ * Assembles a buffer covering the specified range. The code is optimised for
+ * cache hits, as metadata intensive workloads will see 3 orders of magnitude
+ * more hits than misses.
*/
-xfs_buf_t *
+struct xfs_buf *
xfs_buf_get(
xfs_buftarg_t *target,/* target for buffer */
xfs_off_t ioff, /* starting offset of range */
size_t isize, /* length of range */
xfs_buf_flags_t flags)
{
- xfs_buf_t *bp, *new_bp;
+ struct xfs_buf *bp;
+ struct xfs_buf *new_bp;
int error = 0;
- new_bp = xfs_buf_allocate(flags);
+ bp = _xfs_buf_find(target, ioff, isize, flags, NULL);
+ if (likely(bp))
+ goto found;
+
+ new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT,
+ flags);
if (unlikely(!new_bp))
return NULL;
bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
+ if (!bp) {
+ kmem_zone_free(xfs_buf_zone, new_bp);
+ return NULL;
+ }
+
if (bp == new_bp) {
error = xfs_buf_allocate_memory(bp, flags);
if (error)
goto no_buffer;
- } else {
- xfs_buf_deallocate(new_bp);
- if (unlikely(bp == NULL))
- return NULL;
- }
+ } else
+ kmem_zone_free(xfs_buf_zone, new_bp);
+ /*
+ * Now we have a workable buffer, fill in the block number so
+ * that we can do IO on it.
+ */
+ bp->b_bn = ioff;
+ bp->b_count_desired = bp->b_buffer_length;
+
+found:
if (!(bp->b_flags & XBF_MAPPED)) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
@@ -564,18 +577,10 @@ xfs_buf_get(
}
XFS_STATS_INC(xb_get);
-
- /*
- * Always fill in the block number now, the mapped cases can do
- * their own overlay of this later.
- */
- bp->b_bn = ioff;
- bp->b_count_desired = bp->b_buffer_length;
-
trace_xfs_buf_get(bp, flags, _RET_IP_);
return bp;
- no_buffer:
+no_buffer:
if (flags & (XBF_LOCK | XBF_TRYLOCK))
xfs_buf_unlock(bp);
xfs_buf_rele(bp);
@@ -689,19 +694,6 @@ xfs_buf_read_uncached(
return bp;
}
-xfs_buf_t *
-xfs_buf_get_empty(
- size_t len,
- xfs_buftarg_t *target)
-{
- xfs_buf_t *bp;
-
- bp = xfs_buf_allocate(0);
- if (bp)
- _xfs_buf_initialize(bp, target, 0, len, 0);
- return bp;
-}
-
/*
* Return a buffer allocated as an empty buffer and associated to external
* memory via xfs_buf_associate_memory() back to it's empty state.
@@ -787,10 +779,9 @@ xfs_buf_get_uncached(
int error, i;
xfs_buf_t *bp;
- bp = xfs_buf_allocate(0);
+ bp = xfs_buf_alloc(target, 0, len, 0);
if (unlikely(bp == NULL))
goto fail;
- _xfs_buf_initialize(bp, target, 0, len, 0);
error = _xfs_buf_get_pages(bp, page_count, 0);
if (error)
@@ -818,7 +809,7 @@ xfs_buf_get_uncached(
__free_page(bp->b_pages[i]);
_xfs_buf_free_pages(bp);
fail_free_buf:
- xfs_buf_deallocate(bp);
+ kmem_zone_free(xfs_buf_zone, bp);
fail:
return NULL;
}
@@ -937,12 +928,6 @@ void
xfs_buf_unlock(
struct xfs_buf *bp)
{
- if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
- atomic_inc(&bp->b_hold);
- bp->b_flags |= XBF_ASYNC;
- xfs_buf_delwri_queue(bp, 0);
- }
-
XB_CLEAR_OWNER(bp);
up(&bp->b_sema);
@@ -1019,9 +1004,19 @@ xfs_buf_ioerror(
trace_xfs_buf_ioerror(bp, error, _RET_IP_);
}
+void
+xfs_buf_ioerror_alert(
+ struct xfs_buf *bp,
+ const char *func)
+{
+ xfs_alert(bp->b_target->bt_mount,
+"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd",
+ (__uint64_t)XFS_BUF_ADDR(bp), func,
+ bp->b_error, XFS_BUF_COUNT(bp));
+}
+
int
xfs_bwrite(
- struct xfs_mount *mp,
struct xfs_buf *bp)
{
int error;
@@ -1033,25 +1028,13 @@ xfs_bwrite(
xfs_bdstrat_cb(bp);
error = xfs_buf_iowait(bp);
- if (error)
- xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
- xfs_buf_relse(bp);
+ if (error) {
+ xfs_force_shutdown(bp->b_target->bt_mount,
+ SHUTDOWN_META_IO_ERROR);
+ }
return error;
}
-void
-xfs_bdwrite(
- void *mp,
- struct xfs_buf *bp)
-{
- trace_xfs_buf_bdwrite(bp, _RET_IP_);
-
- bp->b_flags &= ~XBF_READ;
- bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
-
- xfs_buf_delwri_queue(bp, 1);
-}
-
/*
* Called when we want to stop a buffer from getting written or read.
* We attach the EIO error, muck with its flags, and call xfs_buf_ioend
@@ -1074,9 +1057,8 @@ xfs_bioerror(
* We're calling xfs_buf_ioend, so delete XBF_DONE flag.
*/
XFS_BUF_UNREAD(bp);
- XFS_BUF_UNDELAYWRITE(bp);
XFS_BUF_UNDONE(bp);
- XFS_BUF_STALE(bp);
+ xfs_buf_stale(bp);
xfs_buf_ioend(bp, 0);
@@ -1103,9 +1085,8 @@ xfs_bioerror_relse(
* change that interface.
*/
XFS_BUF_UNREAD(bp);
- XFS_BUF_UNDELAYWRITE(bp);
XFS_BUF_DONE(bp);
- XFS_BUF_STALE(bp);
+ xfs_buf_stale(bp);
bp->b_iodone = NULL;
if (!(fl & XBF_ASYNC)) {
/*
@@ -1115,7 +1096,7 @@ xfs_bioerror_relse(
* ASYNC buffers.
*/
xfs_buf_ioerror(bp, EIO);
- XFS_BUF_FINISH_IOWAIT(bp);
+ complete(&bp->b_iowait);
} else {
xfs_buf_relse(bp);
}
@@ -1275,15 +1256,10 @@ xfs_buf_iorequest(
{
trace_xfs_buf_iorequest(bp, _RET_IP_);
- if (bp->b_flags & XBF_DELWRI) {
- xfs_buf_delwri_queue(bp, 1);
- return 0;
- }
+ ASSERT(!(bp->b_flags & XBF_DELWRI));
- if (bp->b_flags & XBF_WRITE) {
+ if (bp->b_flags & XBF_WRITE)
xfs_buf_wait_unpin(bp);
- }
-
xfs_buf_hold(bp);
/* Set the count to 1 initially, this will stop an I/O
@@ -1481,9 +1457,13 @@ xfs_setsize_buftarg_flags(
btp->bt_smask = sectorsize - 1;
if (set_blocksize(btp->bt_bdev, sectorsize)) {
+ char name[BDEVNAME_SIZE];
+
+ bdevname(btp->bt_bdev, name);
+
xfs_warn(btp->bt_mount,
"Cannot set_blocksize to %u on device %s\n",
- sectorsize, xfs_buf_target_name(btp));
+ sectorsize, name);
return EINVAL;
}
@@ -1514,12 +1494,12 @@ xfs_setsize_buftarg(
}
STATIC int
-xfs_alloc_delwrite_queue(
+xfs_alloc_delwri_queue(
xfs_buftarg_t *btp,
const char *fsname)
{
- INIT_LIST_HEAD(&btp->bt_delwrite_queue);
- spin_lock_init(&btp->bt_delwrite_lock);
+ INIT_LIST_HEAD(&btp->bt_delwri_queue);
+ spin_lock_init(&btp->bt_delwri_lock);
btp->bt_flags = 0;
btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
if (IS_ERR(btp->bt_task))
@@ -1549,7 +1529,7 @@ xfs_alloc_buftarg(
spin_lock_init(&btp->bt_lru_lock);
if (xfs_setsize_buftarg_early(btp, bdev))
goto error;
- if (xfs_alloc_delwrite_queue(btp, fsname))
+ if (xfs_alloc_delwri_queue(btp, fsname))
goto error;
btp->bt_shrinker.shrink = xfs_buftarg_shrink;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@ -1565,56 +1545,48 @@ error:
/*
* Delayed write buffer handling
*/
-STATIC void
+void
xfs_buf_delwri_queue(
- xfs_buf_t *bp,
- int unlock)
+ xfs_buf_t *bp)
{
- struct list_head *dwq = &bp->b_target->bt_delwrite_queue;
- spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
+ struct xfs_buftarg *btp = bp->b_target;
trace_xfs_buf_delwri_queue(bp, _RET_IP_);
- ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
+ ASSERT(!(bp->b_flags & XBF_READ));
- spin_lock(dwlk);
- /* If already in the queue, dequeue and place at tail */
+ spin_lock(&btp->bt_delwri_lock);
if (!list_empty(&bp->b_list)) {
+ /* if already in the queue, move it to the tail */
ASSERT(bp->b_flags & _XBF_DELWRI_Q);
- if (unlock)
- atomic_dec(&bp->b_hold);
- list_del(&bp->b_list);
- }
-
- if (list_empty(dwq)) {
+ list_move_tail(&bp->b_list, &btp->bt_delwri_queue);
+ } else {
/* start xfsbufd as it is about to have something to do */
- wake_up_process(bp->b_target->bt_task);
- }
+ if (list_empty(&btp->bt_delwri_queue))
+ wake_up_process(bp->b_target->bt_task);
- bp->b_flags |= _XBF_DELWRI_Q;
- list_add_tail(&bp->b_list, dwq);
+ atomic_inc(&bp->b_hold);
+ bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
+ list_add_tail(&bp->b_list, &btp->bt_delwri_queue);
+ }
bp->b_queuetime = jiffies;
- spin_unlock(dwlk);
-
- if (unlock)
- xfs_buf_unlock(bp);
+ spin_unlock(&btp->bt_delwri_lock);
}
void
xfs_buf_delwri_dequeue(
xfs_buf_t *bp)
{
- spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
int dequeued = 0;
- spin_lock(dwlk);
+ spin_lock(&bp->b_target->bt_delwri_lock);
if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
ASSERT(bp->b_flags & _XBF_DELWRI_Q);
list_del_init(&bp->b_list);
dequeued = 1;
}
bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
- spin_unlock(dwlk);
+ spin_unlock(&bp->b_target->bt_delwri_lock);
if (dequeued)
xfs_buf_rele(bp);
@@ -1646,16 +1618,9 @@ xfs_buf_delwri_promote(
if (bp->b_queuetime < jiffies - age)
return;
bp->b_queuetime = jiffies - age;
- spin_lock(&btp->bt_delwrite_lock);
- list_move(&bp->b_list, &btp->bt_delwrite_queue);
- spin_unlock(&btp->bt_delwrite_lock);
-}
-
-STATIC void
-xfs_buf_runall_queues(
- struct workqueue_struct *queue)
-{
- flush_workqueue(queue);
+ spin_lock(&btp->bt_delwri_lock);
+ list_move(&bp->b_list, &btp->bt_delwri_queue);
+ spin_unlock(&btp->bt_delwri_lock);
}
/*
@@ -1669,15 +1634,13 @@ xfs_buf_delwri_split(
unsigned long age)
{
xfs_buf_t *bp, *n;
- struct list_head *dwq = &target->bt_delwrite_queue;
- spinlock_t *dwlk = &target->bt_delwrite_lock;
int skipped = 0;
int force;
force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
INIT_LIST_HEAD(list);
- spin_lock(dwlk);
- list_for_each_entry_safe(bp, n, dwq, b_list) {
+ spin_lock(&target->bt_delwri_lock);
+ list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) {
ASSERT(bp->b_flags & XBF_DELWRI);
if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
@@ -1694,10 +1657,9 @@ xfs_buf_delwri_split(
} else
skipped++;
}
- spin_unlock(dwlk);
+ spin_unlock(&target->bt_delwri_lock);
return skipped;
-
}
/*
@@ -1747,7 +1709,7 @@ xfsbufd(
}
/* sleep for a long time if there is nothing to do. */
- if (list_empty(&target->bt_delwrite_queue))
+ if (list_empty(&target->bt_delwri_queue))
tout = MAX_SCHEDULE_TIMEOUT;
schedule_timeout_interruptible(tout);
@@ -1783,9 +1745,7 @@ xfs_flush_buftarg(
LIST_HEAD(wait_list);
struct blk_plug plug;
- xfs_buf_runall_queues(xfsconvertd_workqueue);
- xfs_buf_runall_queues(xfsdatad_workqueue);
- xfs_buf_runall_queues(xfslogd_workqueue);
+ flush_workqueue(xfslogd_workqueue);
set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
@@ -1866,11 +1826,3 @@ xfs_buf_terminate(void)
destroy_workqueue(xfslogd_workqueue);
kmem_zone_destroy(xfs_buf_zone);
}
-
-#ifdef CONFIG_KDB_MODULES
-struct list_head *
-xfs_get_buftarg_list(void)
-{
- return &xfs_buftarg_list;
-}
-#endif
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 620972b8094..5bab046e859 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -105,8 +105,8 @@ typedef struct xfs_buftarg {
/* per device delwri queue */
struct task_struct *bt_task;
- struct list_head bt_delwrite_queue;
- spinlock_t bt_delwrite_lock;
+ struct list_head bt_delwri_queue;
+ spinlock_t bt_delwri_lock;
unsigned long bt_flags;
/* LRU control structures */
@@ -175,7 +175,8 @@ extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t,
extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
xfs_buf_flags_t);
-extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
+struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t,
+ xfs_buf_flags_t);
extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
@@ -197,14 +198,14 @@ extern void xfs_buf_unlock(xfs_buf_t *);
((bp)->b_sema.count <= 0)
/* Buffer Read and Write Routines */
-extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
-extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
+extern int xfs_bwrite(struct xfs_buf *bp);
extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
extern int xfs_bdstrat_cb(struct xfs_buf *);
extern void xfs_buf_ioend(xfs_buf_t *, int);
extern void xfs_buf_ioerror(xfs_buf_t *, int);
+extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
extern int xfs_buf_iorequest(xfs_buf_t *);
extern int xfs_buf_iowait(xfs_buf_t *);
extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
@@ -221,38 +222,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
/* Delayed Write Buffer Routines */
-extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
-extern void xfs_buf_delwri_promote(xfs_buf_t *);
+extern void xfs_buf_delwri_queue(struct xfs_buf *);
+extern void xfs_buf_delwri_dequeue(struct xfs_buf *);
+extern void xfs_buf_delwri_promote(struct xfs_buf *);
/* Buffer Daemon Setup Routines */
extern int xfs_buf_init(void);
extern void xfs_buf_terminate(void);
-static inline const char *
-xfs_buf_target_name(struct xfs_buftarg *target)
-{
- static char __b[BDEVNAME_SIZE];
-
- return bdevname(target->bt_bdev, __b);
-}
-
-
#define XFS_BUF_ZEROFLAGS(bp) \
((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \
XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
void xfs_buf_stale(struct xfs_buf *bp);
-#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
-#define XFS_BUF_SUPER_STALE(bp) do { \
- XFS_BUF_STALE(bp); \
- xfs_buf_delwri_dequeue(bp); \
- XFS_BUF_DONE(bp); \
- } while (0)
-
-#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
-#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
+
#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
@@ -280,23 +265,16 @@ void xfs_buf_stale(struct xfs_buf *bp);
#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
-static inline void
-xfs_buf_set_ref(
- struct xfs_buf *bp,
- int lru_ref)
+static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
atomic_set(&bp->b_lru_ref, lru_ref);
}
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
-#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
static inline int xfs_buf_ispinned(struct xfs_buf *bp)
{
return atomic_read(&bp->b_pin_count);
}
-#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait);
-
static inline void xfs_buf_relse(xfs_buf_t *bp)
{
xfs_buf_unlock(bp);
@@ -313,14 +291,7 @@ extern void xfs_wait_buftarg(xfs_buftarg_t *);
extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
-#ifdef CONFIG_KDB_MODULES
-extern struct list_head *xfs_get_buftarg_list(void);
-#endif
-
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
-#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1)
-#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1)
-
#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index ef43fce519a..eac97ef81e2 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -656,7 +656,7 @@ xfs_buf_item_committing(
/*
* This is the ops vector shared by all buf log items.
*/
-static struct xfs_item_ops xfs_buf_item_ops = {
+static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_size = xfs_buf_item_size,
.iop_format = xfs_buf_item_format,
.iop_pin = xfs_buf_item_pin,
@@ -967,7 +967,8 @@ xfs_buf_iodone_callbacks(
* I/O errors, there's no point in giving this a retry.
*/
if (XFS_FORCED_SHUTDOWN(mp)) {
- XFS_BUF_SUPER_STALE(bp);
+ xfs_buf_stale(bp);
+ XFS_BUF_DONE(bp);
trace_xfs_buf_item_iodone(bp, _RET_IP_);
goto do_callbacks;
}
@@ -975,9 +976,7 @@ xfs_buf_iodone_callbacks(
if (bp->b_target != lasttarg ||
time_after(jiffies, (lasttime + 5*HZ))) {
lasttime = jiffies;
- xfs_alert(mp, "Device %s: metadata write error block 0x%llx",
- xfs_buf_target_name(bp->b_target),
- (__uint64_t)XFS_BUF_ADDR(bp));
+ xfs_buf_ioerror_alert(bp, __func__);
}
lasttarg = bp->b_target;
@@ -993,7 +992,7 @@ xfs_buf_iodone_callbacks(
xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
if (!XFS_BUF_ISSTALE(bp)) {
- XFS_BUF_DELAYWRITE(bp);
+ xfs_buf_delwri_queue(bp);
XFS_BUF_DONE(bp);
}
ASSERT(bp->b_iodone != NULL);
@@ -1006,9 +1005,8 @@ xfs_buf_iodone_callbacks(
* If the write of the buffer was synchronous, we want to make
* sure to return the error to the caller of xfs_bwrite().
*/
- XFS_BUF_STALE(bp);
+ xfs_buf_stale(bp);
XFS_BUF_DONE(bp);
- XFS_BUF_UNDELAYWRITE(bp);
trace_xfs_buf_error_relse(bp, _RET_IP_);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index ee9d5427fcd..77c74257c2a 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1578,9 +1578,8 @@ xfs_da_grow_inode_int(
*/
nmap = 1;
ASSERT(args->firstblock != NULL);
- error = xfs_bmapi(tp, dp, *bno, count,
- xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
- XFS_BMAPI_CONTIG,
+ error = xfs_bmapi_write(tp, dp, *bno, count,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
args->firstblock, args->total, &map, &nmap,
args->flist);
if (error)
@@ -1602,9 +1601,8 @@ xfs_da_grow_inode_int(
for (b = *bno, mapi = 0; b < *bno + count; ) {
nmap = MIN(XFS_BMAP_MAX_NMAP, count);
c = (int)(*bno + count - b);
- error = xfs_bmapi(tp, dp, b, c,
- xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
- XFS_BMAPI_METADATA,
+ error = xfs_bmapi_write(tp, dp, b, c,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
args->firstblock, args->total,
&mapp[mapi], &nmap, args->flist);
if (error)
@@ -1975,33 +1973,16 @@ xfs_da_do_buf(
/*
* Optimize the one-block case.
*/
- if (nfsb == 1) {
- xfs_fsblock_t fsb;
-
- if ((error =
- xfs_bmapi_single(trans, dp, whichfork, &fsb,
- (xfs_fileoff_t)bno))) {
- return error;
- }
+ if (nfsb == 1)
mapp = &map;
- if (fsb == NULLFSBLOCK) {
- nmap = 0;
- } else {
- map.br_startblock = fsb;
- map.br_startoff = (xfs_fileoff_t)bno;
- map.br_blockcount = 1;
- nmap = 1;
- }
- } else {
+ else
mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP);
- nmap = nfsb;
- if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
- nfsb,
- XFS_BMAPI_METADATA |
- xfs_bmapi_aflag(whichfork),
- NULL, 0, mapp, &nmap, NULL)))
- goto exit0;
- }
+
+ nmap = nfsb;
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp,
+ &nmap, xfs_bmapi_aflag(whichfork));
+ if (error)
+ goto exit0;
} else {
map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
map.br_startoff = (xfs_fileoff_t)bno;
@@ -2072,13 +2053,10 @@ xfs_da_do_buf(
if (!bp)
continue;
if (caller == 1) {
- if (whichfork == XFS_ATTR_FORK) {
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE,
- XFS_ATTR_BTREE_REF);
- } else {
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE,
- XFS_DIR_BTREE_REF);
- }
+ if (whichfork == XFS_ATTR_FORK)
+ xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
+ else
+ xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
}
if (bplist) {
bplist[nbplist++] = bp;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 9a84a85c03b..654dc6f05ba 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -425,8 +425,8 @@ xfs_swap_extents(
}
- xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
xfs_trans_log_inode(tp, ip, ilf_fields);
xfs_trans_log_inode(tp, tip, tilf_fields);
@@ -438,7 +438,7 @@ xfs_swap_extents(
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
+ error = xfs_trans_commit(tp, 0);
trace_xfs_swap_extent_after(ip, 0);
trace_xfs_swap_extent_after(tip, 1);
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ca2386d82cd..66e108f561a 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -888,12 +888,10 @@ xfs_dir2_leaf_getdents(
* we already have in the table.
*/
nmap = map_size - map_valid;
- error = xfs_bmapi(NULL, dp,
- map_off,
+ error = xfs_bmapi_read(dp, map_off,
xfs_dir2_byte_to_da(mp,
XFS_DIR2_LEAF_OFFSET) - map_off,
- XFS_BMAPI_METADATA, NULL, 0,
- &map[map_valid], &nmap, NULL);
+ &map[map_valid], &nmap, 0);
/*
* Don't know if we should ignore this or
* try to return an error.
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 244e797dae3..8a24f0c6c86 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -38,7 +38,7 @@ xfs_trim_extents(
struct xfs_mount *mp,
xfs_agnumber_t agno,
xfs_fsblock_t start,
- xfs_fsblock_t len,
+ xfs_fsblock_t end,
xfs_fsblock_t minlen,
__uint64_t *blocks_trimmed)
{
@@ -100,7 +100,7 @@ xfs_trim_extents(
* down partially overlapping ranges for now.
*/
if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
- XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
+ XFS_AGB_TO_FSB(mp, agno, fbno) > end) {
trace_xfs_discard_exclude(mp, agno, fbno, flen);
goto next_extent;
}
@@ -145,7 +145,7 @@ xfs_ioc_trim(
struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
unsigned int granularity = q->limits.discard_granularity;
struct fstrim_range range;
- xfs_fsblock_t start, len, minlen;
+ xfs_fsblock_t start, end, minlen;
xfs_agnumber_t start_agno, end_agno, agno;
__uint64_t blocks_trimmed = 0;
int error, last_error = 0;
@@ -165,19 +165,19 @@ xfs_ioc_trim(
* matter as trimming blocks is an advisory interface.
*/
start = XFS_B_TO_FSBT(mp, range.start);
- len = XFS_B_TO_FSBT(mp, range.len);
+ end = start + XFS_B_TO_FSBT(mp, range.len) - 1;
minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
- start_agno = XFS_FSB_TO_AGNO(mp, start);
- if (start_agno >= mp->m_sb.sb_agcount)
+ if (start >= mp->m_sb.sb_dblocks)
return -XFS_ERROR(EINVAL);
+ if (end > mp->m_sb.sb_dblocks - 1)
+ end = mp->m_sb.sb_dblocks - 1;
- end_agno = XFS_FSB_TO_AGNO(mp, start + len);
- if (end_agno >= mp->m_sb.sb_agcount)
- end_agno = mp->m_sb.sb_agcount - 1;
+ start_agno = XFS_FSB_TO_AGNO(mp, start);
+ end_agno = XFS_FSB_TO_AGNO(mp, end);
for (agno = start_agno; agno <= end_agno; agno++) {
- error = -xfs_trim_extents(mp, agno, start, len, minlen,
+ error = -xfs_trim_extents(mp, agno, start, end, minlen,
&blocks_trimmed);
if (error)
last_error = error;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index db62959bed1..25d7280e9f6 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -377,16 +377,14 @@ xfs_qm_dqalloc(
return (ESRCH);
}
- xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
nmaps = 1;
- if ((error = xfs_bmapi(tp, quotip,
- offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
- XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
- &firstblock,
- XFS_QM_DQALLOC_SPACE_RES(mp),
- &map, &nmaps, &flist))) {
+ error = xfs_bmapi_write(tp, quotip, offset_fsb,
+ XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
+ &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
+ &map, &nmaps, &flist);
+ if (error)
goto error0;
- }
ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
ASSERT(nmaps == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -402,8 +400,11 @@ xfs_qm_dqalloc(
dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen,
0);
- if (!bp || (error = xfs_buf_geterror(bp)))
+
+ error = xfs_buf_geterror(bp);
+ if (error)
goto error1;
+
/*
* Make a chunk of dquots out of this buffer and log
* the entire thing.
@@ -485,9 +486,8 @@ xfs_qm_dqtobp(
/*
* Find the block map; no allocations yet
*/
- error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
- XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
- NULL, 0, &map, &nmaps, NULL);
+ error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
+ XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
xfs_iunlock(quotip, XFS_ILOCK_SHARED);
if (error)
@@ -605,7 +605,7 @@ xfs_qm_dqread(
dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
/* Mark the buf so that this will stay incore a little longer */
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF);
+ xfs_buf_set_ref(bp, XFS_DQUOT_REF);
/*
* We got the buffer with a xfs_trans_read_buf() (in dqtobp())
@@ -1242,9 +1242,11 @@ xfs_qm_dqflush(
}
if (flags & SYNC_WAIT)
- error = xfs_bwrite(mp, bp);
+ error = xfs_bwrite(bp);
else
- xfs_bdwrite(mp, bp);
+ xfs_buf_delwri_queue(bp);
+
+ xfs_buf_relse(bp);
trace_xfs_dqflush_done(dqp);
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index bb3f71d236d..0dee0b71029 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -295,7 +295,7 @@ xfs_qm_dquot_logitem_committing(
/*
* This is the ops vector for dquots
*/
-static struct xfs_item_ops xfs_dquot_item_ops = {
+static const struct xfs_item_ops xfs_dquot_item_ops = {
.iop_size = xfs_qm_dquot_logitem_size,
.iop_format = xfs_qm_dquot_logitem_format,
.iop_pin = xfs_qm_dquot_logitem_pin,
@@ -483,7 +483,7 @@ xfs_qm_qoff_logitem_committing(
{
}
-static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
+static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
.iop_pin = xfs_qm_qoff_logitem_pin,
@@ -498,7 +498,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
/*
* This is the ops vector shared by all quotaoff-start log items.
*/
-static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
+static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
.iop_pin = xfs_qm_qoff_logitem_pin,
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 75e5d322e48..558910f5e3c 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -98,22 +98,22 @@ xfs_fs_encode_fh(
switch (fileid_type) {
case FILEID_INO32_GEN_PARENT:
spin_lock(&dentry->d_lock);
- fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino;
+ fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
spin_unlock(&dentry->d_lock);
/*FALLTHRU*/
case FILEID_INO32_GEN:
- fid->i32.ino = inode->i_ino;
+ fid->i32.ino = XFS_I(inode)->i_ino;
fid->i32.gen = inode->i_generation;
break;
case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
spin_lock(&dentry->d_lock);
- fid64->parent_ino = dentry->d_parent->d_inode->i_ino;
+ fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
spin_unlock(&dentry->d_lock);
/*FALLTHRU*/
case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
- fid64->ino = inode->i_ino;
+ fid64->ino = XFS_I(inode)->i_ino;
fid64->gen = inode->i_generation;
break;
}
@@ -229,16 +229,16 @@ xfs_fs_nfs_commit_metadata(
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- int error = 0;
+ xfs_lsn_t lsn = 0;
xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip)) {
- error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn,
- XFS_LOG_SYNC, NULL);
- }
+ if (xfs_ipincount(ip))
+ lsn = ip->i_itemp->ili_last_lsn;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return error;
+ if (!lsn)
+ return 0;
+ return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}
const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d22e6262343..35c2aff38b2 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -217,7 +217,7 @@ xfs_efi_item_committing(
/*
* This is the ops vector shared by all efi log items.
*/
-static struct xfs_item_ops xfs_efi_item_ops = {
+static const struct xfs_item_ops xfs_efi_item_ops = {
.iop_size = xfs_efi_item_size,
.iop_format = xfs_efi_item_format,
.iop_pin = xfs_efi_item_pin,
@@ -477,7 +477,7 @@ xfs_efd_item_committing(
/*
* This is the ops vector shared by all efd log items.
*/
-static struct xfs_item_ops xfs_efd_item_ops = {
+static const struct xfs_item_ops xfs_efd_item_ops = {
.iop_size = xfs_efd_item_size,
.iop_format = xfs_efd_item_format,
.iop_pin = xfs_efd_item_pin,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7f7b42469ea..753ed9b5c70 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -124,6 +124,35 @@ xfs_iozero(
return (-status);
}
+/*
+ * Fsync operations on directories are much simpler than on regular files,
+ * as there is no file data to flush, and thus also no need for explicit
+ * cache flush operations, and there are no non-transaction metadata updates
+ * on directories either.
+ */
+STATIC int
+xfs_dir_fsync(
+ struct file *file,
+ loff_t start,
+ loff_t end,
+ int datasync)
+{
+ struct xfs_inode *ip = XFS_I(file->f_mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_lsn_t lsn = 0;
+
+ trace_xfs_dir_fsync(ip);
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ lsn = ip->i_itemp->ili_last_lsn;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!lsn)
+ return 0;
+ return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+}
+
STATIC int
xfs_file_fsync(
struct file *file,
@@ -137,6 +166,7 @@ xfs_file_fsync(
struct xfs_trans *tp;
int error = 0;
int log_flushed = 0;
+ xfs_lsn_t lsn = 0;
trace_xfs_file_fsync(ip);
@@ -149,10 +179,6 @@ xfs_file_fsync(
xfs_iflags_clear(ip, XFS_ITRUNCATED);
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- xfs_ioend_wait(ip);
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
if (mp->m_flags & XFS_MOUNT_BARRIER) {
/*
* If we have an RT and/or log subvolume we need to make sure
@@ -216,11 +242,11 @@ xfs_file_fsync(
* transaction. So we play it safe and fire off the
* transaction anyway.
*/
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_trans_set_sync(tp);
- error = _xfs_trans_commit(tp, 0, &log_flushed);
+ error = xfs_trans_commit(tp, 0);
+ lsn = ip->i_itemp->ili_last_lsn;
xfs_iunlock(ip, XFS_ILOCK_EXCL);
} else {
/*
@@ -231,14 +257,14 @@ xfs_file_fsync(
* disk yet, the inode will be still be pinned. If it is,
* force the log.
*/
- if (xfs_ipincount(ip)) {
- error = _xfs_log_force_lsn(mp,
- ip->i_itemp->ili_last_lsn,
- XFS_LOG_SYNC, &log_flushed);
- }
+ if (xfs_ipincount(ip))
+ lsn = ip->i_itemp->ili_last_lsn;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
}
+ if (!error && lsn)
+ error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+
/*
* If we only have a single device, and the log force about was
* a no-op we might have to flush the data device cache here.
@@ -317,7 +343,19 @@ xfs_file_aio_read(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if (unlikely(ioflags & IO_ISDIRECT)) {
+ /*
+ * Locking is a bit tricky here. If we take an exclusive lock
+ * for direct IO, we effectively serialise all new concurrent
+ * read IO to this file and block it behind IO that is currently in
+ * progress because IO in progress holds the IO lock shared. We only
+ * need to hold the lock exclusive to blow away the page cache, so
+ * only take lock exclusively if the page cache needs invalidation.
+ * This allows the normal direct IO case of no page cache pages to
+ * proceeed concurrently without serialisation.
+ */
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
if (inode->i_mapping->nrpages) {
@@ -330,8 +368,7 @@ xfs_file_aio_read(
}
}
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
- } else
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ }
trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
@@ -407,11 +444,13 @@ xfs_aio_write_isize_update(
*/
STATIC void
xfs_aio_write_newsize_update(
- struct xfs_inode *ip)
+ struct xfs_inode *ip,
+ xfs_fsize_t new_size)
{
- if (ip->i_new_size) {
+ if (new_size == ip->i_new_size) {
xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
- ip->i_new_size = 0;
+ if (new_size == ip->i_new_size)
+ ip->i_new_size = 0;
if (ip->i_d.di_size > ip->i_size)
ip->i_d.di_size = ip->i_size;
xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
@@ -462,7 +501,7 @@ xfs_file_splice_write(
ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
xfs_aio_write_isize_update(inode, ppos, ret);
- xfs_aio_write_newsize_update(ip);
+ xfs_aio_write_newsize_update(ip, new_size);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
@@ -500,11 +539,9 @@ xfs_zero_last_block(
last_fsb = XFS_B_TO_FSBT(mp, isize);
nimaps = 1;
- error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
- &nimaps, NULL);
- if (error) {
+ error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
+ if (error)
return error;
- }
ASSERT(nimaps > 0);
/*
* If the block underlying isize is just a hole, then there
@@ -595,8 +632,8 @@ xfs_zero_eof(
while (start_zero_fsb <= end_zero_fsb) {
nimaps = 1;
zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
- error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
- 0, NULL, 0, &imap, &nimaps, NULL);
+ error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
+ &imap, &nimaps, 0);
if (error) {
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
return error;
@@ -659,6 +696,7 @@ xfs_file_aio_write_checks(
struct file *file,
loff_t *pos,
size_t *count,
+ xfs_fsize_t *new_sizep,
int *iolock)
{
struct inode *inode = file->f_mapping->host;
@@ -666,6 +704,9 @@ xfs_file_aio_write_checks(
xfs_fsize_t new_size;
int error = 0;
+ xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+ *new_sizep = 0;
+restart:
error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
if (error) {
xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
@@ -673,20 +714,41 @@ xfs_file_aio_write_checks(
return error;
}
- new_size = *pos + *count;
- if (new_size > ip->i_size)
- ip->i_new_size = new_size;
-
if (likely(!(file->f_mode & FMODE_NOCMTIME)))
file_update_time(file);
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
- * write.
+ * write. There is no need to issue zeroing if another in-flght IO ends
+ * at or before this one If zeronig is needed and we are currently
+ * holding the iolock shared, we need to update it to exclusive which
+ * involves dropping all locks and relocking to maintain correct locking
+ * order. If we do this, restart the function to ensure all checks and
+ * values are still valid.
*/
- if (*pos > ip->i_size)
+ if ((ip->i_new_size && *pos > ip->i_new_size) ||
+ (!ip->i_new_size && *pos > ip->i_size)) {
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+ goto restart;
+ }
error = -xfs_zero_eof(ip, *pos, ip->i_size);
+ }
+
+ /*
+ * If this IO extends beyond EOF, we may need to update ip->i_new_size.
+ * We have already zeroed space beyond EOF (if necessary). Only update
+ * ip->i_new_size if this IO ends beyond any other in-flight writes.
+ */
+ new_size = *pos + *count;
+ if (new_size > ip->i_size) {
+ if (new_size > ip->i_new_size)
+ ip->i_new_size = new_size;
+ *new_sizep = new_size;
+ }
xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
@@ -721,7 +783,7 @@ xfs_file_aio_write_checks(
* the dio layer. To avoid the problem with aio, we also need to wait for
* outstanding IOs to complete so that unwritten extent conversion is completed
* before we try to map the overlapping block. This is currently implemented by
- * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ * hitting it with a big hammer (i.e. inode_dio_wait()).
*
* Returns with locks held indicated by @iolock and errors indicated by
* negative return values.
@@ -733,6 +795,7 @@ xfs_file_dio_aio_write(
unsigned long nr_segs,
loff_t pos,
size_t ocount,
+ xfs_fsize_t *new_size,
int *iolock)
{
struct file *file = iocb->ki_filp;
@@ -753,18 +816,35 @@ xfs_file_dio_aio_write(
if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
unaligned_io = 1;
- if (unaligned_io || mapping->nrpages || pos > ip->i_size)
+ /*
+ * We don't need to take an exclusive lock unless there page cache needs
+ * to be invalidated or unaligned IO is being executed. We don't need to
+ * consider the EOF extension case here because
+ * xfs_file_aio_write_checks() will relock the inode as necessary for
+ * EOF zeroing cases and fill out the new inode size as appropriate.
+ */
+ if (unaligned_io || mapping->nrpages)
*iolock = XFS_IOLOCK_EXCL;
else
*iolock = XFS_IOLOCK_SHARED;
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+ xfs_rw_ilock(ip, *iolock);
- ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+ /*
+ * Recheck if there are cached pages that need invalidate after we got
+ * the iolock to protect against other threads adding new pages while
+ * we were waiting for the iolock.
+ */
+ if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, *iolock);
+ }
+
+ ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
if (ret)
return ret;
if (mapping->nrpages) {
- WARN_ON(*iolock != XFS_IOLOCK_EXCL);
ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
FI_REMAPF_LOCKED);
if (ret)
@@ -776,7 +856,7 @@ xfs_file_dio_aio_write(
* otherwise demote the lock if we had to flush cached pages
*/
if (unaligned_io)
- xfs_ioend_wait(ip);
+ inode_dio_wait(inode);
else if (*iolock == XFS_IOLOCK_EXCL) {
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
*iolock = XFS_IOLOCK_SHARED;
@@ -798,6 +878,7 @@ xfs_file_buffered_aio_write(
unsigned long nr_segs,
loff_t pos,
size_t ocount,
+ xfs_fsize_t *new_size,
int *iolock)
{
struct file *file = iocb->ki_filp;
@@ -809,9 +890,9 @@ xfs_file_buffered_aio_write(
size_t count = ocount;
*iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+ xfs_rw_ilock(ip, *iolock);
- ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+ ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
if (ret)
return ret;
@@ -851,6 +932,7 @@ xfs_file_aio_write(
ssize_t ret;
int iolock;
size_t ocount = 0;
+ xfs_fsize_t new_size = 0;
XFS_STATS_INC(xs_write_calls);
@@ -870,10 +952,10 @@ xfs_file_aio_write(
if (unlikely(file->f_flags & O_DIRECT))
ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
- ocount, &iolock);
+ ocount, &new_size, &iolock);
else
ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
- ocount, &iolock);
+ ocount, &new_size, &iolock);
xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
@@ -894,7 +976,7 @@ xfs_file_aio_write(
}
out_unlock:
- xfs_aio_write_newsize_update(ip);
+ xfs_aio_write_newsize_update(ip, new_size);
xfs_rw_iunlock(ip, iolock);
return ret;
}
@@ -1087,7 +1169,7 @@ const struct file_operations xfs_dir_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
#endif
- .fsync = xfs_file_fsync,
+ .fsync = xfs_dir_fsync,
};
static const struct vm_operations_struct xfs_file_vm_ops = {
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 3ff3d9e23de..5170306a100 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -682,7 +682,7 @@ xfs_filestream_new_ag(
ip = ap->ip;
mp = ip->i_mount;
cache = mp->m_filestream;
- minlen = ap->alen;
+ minlen = ap->length;
*agp = NULLAGNUMBER;
/*
@@ -761,7 +761,7 @@ xfs_filestream_new_ag(
*/
ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
- (ap->low ? XFS_PICK_LOWSPACE : 0);
+ (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
if (err || *agp == NULLAGNUMBER)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 9153d2c77ca..1c6fdeb702f 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -194,6 +194,10 @@ xfs_growfs_data_private(
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
agf = XFS_BUF_TO_AGF(bp);
memset(agf, 0, mp->m_sb.sb_sectsize);
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -216,16 +220,21 @@ xfs_growfs_data_private(
tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
agf->agf_freeblks = cpu_to_be32(tmpsize);
agf->agf_longest = cpu_to_be32(tmpsize);
- error = xfs_bwrite(mp, bp);
- if (error) {
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
goto error0;
- }
+
/*
* AG inode header block
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
agi = XFS_BUF_TO_AGI(bp);
memset(agi, 0, mp->m_sb.sb_sectsize);
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -240,10 +249,11 @@ xfs_growfs_data_private(
agi->agi_dirino = cpu_to_be32(NULLAGINO);
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
- error = xfs_bwrite(mp, bp);
- if (error) {
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
goto error0;
- }
+
/*
* BNO btree root block
*/
@@ -251,6 +261,10 @@ xfs_growfs_data_private(
XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
BTOBB(mp->m_sb.sb_blocksize),
XBF_LOCK | XBF_MAPPED);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
block = XFS_BUF_TO_BLOCK(bp);
memset(block, 0, mp->m_sb.sb_blocksize);
block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
@@ -262,10 +276,11 @@ xfs_growfs_data_private(
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
- error = xfs_bwrite(mp, bp);
- if (error) {
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
goto error0;
- }
+
/*
* CNT btree root block
*/
@@ -273,6 +288,10 @@ xfs_growfs_data_private(
XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
BTOBB(mp->m_sb.sb_blocksize),
XBF_LOCK | XBF_MAPPED);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
block = XFS_BUF_TO_BLOCK(bp);
memset(block, 0, mp->m_sb.sb_blocksize);
block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
@@ -285,10 +304,11 @@ xfs_growfs_data_private(
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
nfree += be32_to_cpu(arec->ar_blockcount);
- error = xfs_bwrite(mp, bp);
- if (error) {
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
goto error0;
- }
+
/*
* INO btree root block
*/
@@ -296,6 +316,10 @@ xfs_growfs_data_private(
XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
BTOBB(mp->m_sb.sb_blocksize),
XBF_LOCK | XBF_MAPPED);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
block = XFS_BUF_TO_BLOCK(bp);
memset(block, 0, mp->m_sb.sb_blocksize);
block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
@@ -303,10 +327,10 @@ xfs_growfs_data_private(
block->bb_numrecs = 0;
block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- error = xfs_bwrite(mp, bp);
- if (error) {
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
goto error0;
- }
}
xfs_trans_agblocks_delta(tp, nfree);
/*
@@ -396,9 +420,9 @@ xfs_growfs_data_private(
* just issue a warning and continue. The real work is
* already done and committed.
*/
- if (!(error = xfs_bwrite(mp, bp))) {
- continue;
- } else {
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error) {
xfs_warn(mp,
"write error %d updating secondary superblock for ag %d",
error, agno);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 9f24ec28283..169380e6605 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -150,7 +150,7 @@ xfs_check_agi_freecount(
/*
* Initialise a new set of inodes.
*/
-STATIC void
+STATIC int
xfs_ialloc_inode_init(
struct xfs_mount *mp,
struct xfs_trans *tp,
@@ -202,8 +202,8 @@ xfs_ialloc_inode_init(
fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
mp->m_bsize * blks_per_cluster,
XBF_LOCK);
- ASSERT(!xfs_buf_geterror(fbuf));
-
+ if (!fbuf)
+ return ENOMEM;
/*
* Initialize all inodes in this buffer and then log them.
*
@@ -225,6 +225,7 @@ xfs_ialloc_inode_init(
}
xfs_trans_inode_alloc_buf(tp, fbuf);
}
+ return 0;
}
/*
@@ -369,9 +370,11 @@ xfs_ialloc_ag_alloc(
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len,
- random32());
+ error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno,
+ args.len, random32());
+ if (error)
+ return error;
/*
* Convert the results.
*/
@@ -1502,7 +1505,7 @@ xfs_read_agi(
return XFS_ERROR(EFSCORRUPTED);
}
- XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF);
+ xfs_buf_set_ref(*bpp, XFS_AGI_REF);
xfs_check_agi_unlinked(agi);
return 0;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 7759812c1bb..0fa98b1c70e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -75,7 +75,6 @@ xfs_inode_alloc(
return NULL;
}
- ASSERT(atomic_read(&ip->i_iocount) == 0);
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush));
@@ -150,7 +149,6 @@ xfs_inode_free(
}
/* asserts to verify all state is correct here */
- ASSERT(atomic_read(&ip->i_iocount) == 0);
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush));
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0239a7c7c88..755ee816488 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -190,12 +190,6 @@ xfs_imap_to_bp(
}
xfs_inobp_check(mp, bp);
-
- /*
- * Mark the buffer as an inode buffer now that it looks good
- */
- XFS_BUF_SET_VTYPE(bp, B_FS_INO);
-
*bpp = bp;
return 0;
}
@@ -1152,7 +1146,7 @@ xfs_ialloc(
/*
* Log the new values stuffed into the inode.
*/
- xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, flags);
/* now that we have an i_mode we can setup inode ops and unlock */
@@ -1187,6 +1181,7 @@ xfs_isize_check(
xfs_fileoff_t map_first;
int nimaps;
xfs_bmbt_irec_t imaps[2];
+ int error;
if (!S_ISREG(ip->i_d.di_mode))
return;
@@ -1203,13 +1198,12 @@ xfs_isize_check(
* The filesystem could be shutting down, so bmapi may return
* an error.
*/
- if (xfs_bmapi(NULL, ip, map_first,
+ error = xfs_bmapi_read(ip, map_first,
(XFS_B_TO_FSB(mp,
- (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
- map_first),
- XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
- NULL))
- return;
+ (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
+ imaps, &nimaps, XFS_BMAPI_ENTIRE);
+ if (error)
+ return;
ASSERT(nimaps == 1);
ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
}
@@ -1297,7 +1291,7 @@ xfs_itruncate_extents(
*/
error = xfs_bmap_finish(&tp, &free_list, &committed);
if (committed)
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
if (error)
goto out_bmap_cancel;
@@ -1313,7 +1307,7 @@ xfs_itruncate_extents(
error = xfs_trans_commit(tp, 0);
tp = ntp;
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
if (error)
goto out;
@@ -1644,7 +1638,7 @@ xfs_iunlink_remove(
* inodes that are in memory - they all must be marked stale and attached to
* the cluster buffer.
*/
-STATIC void
+STATIC int
xfs_ifree_cluster(
xfs_inode_t *free_ip,
xfs_trans_t *tp,
@@ -1690,6 +1684,8 @@ xfs_ifree_cluster(
mp->m_bsize * blks_per_cluster,
XBF_LOCK);
+ if (!bp)
+ return ENOMEM;
/*
* Walk the inodes already attached to the buffer and mark them
* stale. These will all have the flush locks held, so an
@@ -1799,6 +1795,7 @@ retry:
}
xfs_perag_put(pag);
+ return 0;
}
/*
@@ -1878,10 +1875,10 @@ xfs_ifree(
dip->di_mode = 0;
if (delete) {
- xfs_ifree_cluster(ip, tp, first_ino);
+ error = xfs_ifree_cluster(ip, tp, first_ino);
}
- return 0;
+ return error;
}
/*
@@ -2472,11 +2469,11 @@ cluster_corrupt_out:
*/
if (bp->b_iodone) {
XFS_BUF_UNDONE(bp);
- XFS_BUF_STALE(bp);
+ xfs_buf_stale(bp);
xfs_buf_ioerror(bp, EIO);
xfs_buf_ioend(bp, 0);
} else {
- XFS_BUF_STALE(bp);
+ xfs_buf_stale(bp);
xfs_buf_relse(bp);
}
}
@@ -2597,9 +2594,11 @@ xfs_iflush(
goto cluster_corrupt_out;
if (flags & SYNC_WAIT)
- error = xfs_bwrite(mp, bp);
+ error = xfs_bwrite(bp);
else
- xfs_bdwrite(mp, bp);
+ xfs_buf_delwri_queue(bp);
+
+ xfs_buf_relse(bp);
return error;
corrupt_out:
@@ -2836,6 +2835,27 @@ corrupt_out:
return XFS_ERROR(EFSCORRUPTED);
}
+void
+xfs_promote_inode(
+ struct xfs_inode *ip)
+{
+ struct xfs_buf *bp;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+
+ bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno,
+ ip->i_imap.im_len, XBF_TRYLOCK);
+ if (!bp)
+ return;
+
+ if (XFS_BUF_ISDELAYWRITE(bp)) {
+ xfs_buf_delwri_promote(bp);
+ wake_up_process(ip->i_mount->m_ddev_targp->bt_task);
+ }
+
+ xfs_buf_relse(bp);
+}
+
/*
* Return a pointer to the extent record at file index idx.
*/
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 2380a4bcbec..b4cd4739f98 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -257,7 +257,6 @@ typedef struct xfs_inode {
xfs_fsize_t i_size; /* in-memory size */
xfs_fsize_t i_new_size; /* size when write completes */
- atomic_t i_iocount; /* outstanding I/O count */
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
@@ -499,6 +498,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
void xfs_iext_realloc(xfs_inode_t *, int, int);
void xfs_iunpin_wait(xfs_inode_t *);
int xfs_iflush(xfs_inode_t *, uint);
+void xfs_promote_inode(struct xfs_inode *);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 836ad80d4f2..abaafdbb3e6 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -658,10 +658,8 @@ xfs_inode_item_unlock(
lock_flags = iip->ili_lock_flags;
iip->ili_lock_flags = 0;
- if (lock_flags) {
+ if (lock_flags)
xfs_iunlock(ip, lock_flags);
- IRELE(ip);
- }
}
/*
@@ -797,7 +795,7 @@ xfs_inode_item_committing(
/*
* This is the ops vector shared by all buf log items.
*/
-static struct xfs_item_ops xfs_inode_item_ops = {
+static const struct xfs_item_ops xfs_inode_item_ops = {
.iop_size = xfs_inode_item_size,
.iop_format = xfs_inode_item_format,
.iop_pin = xfs_inode_item_pin,
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f7ce7debe14..d99a9051890 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1069,7 +1069,7 @@ xfs_ioctl_setattr(
}
}
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
/*
* Change file ownership. Must be the owner or privileged.
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 091d82b94c4..9afa282aa93 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -208,22 +208,20 @@ xfs_iomap_write_direct(
if (error)
goto error1;
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
- bmapi_flag = XFS_BMAPI_WRITE;
+ bmapi_flag = 0;
if (offset < ip->i_size || extsz)
bmapi_flag |= XFS_BMAPI_PREALLOC;
/*
- * Issue the xfs_bmapi() call to allocate the blocks.
- *
* From this point onwards we overwrite the imap pointer that the
* caller gave to us.
*/
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
- error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
- &firstfsb, 0, imap, &nimaps, &free_list);
+ error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
+ &firstfsb, 0, imap, &nimaps, &free_list);
if (error)
goto error0;
@@ -300,8 +298,8 @@ xfs_iomap_eof_want_preallocate(
while (count_fsb > 0) {
imaps = nimaps;
firstblock = NULLFSBLOCK;
- error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0,
- &firstblock, 0, imap, &imaps, NULL);
+ error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
+ 0);
if (error)
return error;
for (n = 0; n < imaps; n++) {
@@ -381,7 +379,6 @@ xfs_iomap_write_delay(
xfs_fileoff_t last_fsb;
xfs_off_t aligned_offset;
xfs_fileoff_t ioalign;
- xfs_fsblock_t firstblock;
xfs_extlen_t extsz;
int nimaps;
xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
@@ -425,12 +422,8 @@ retry:
}
nimaps = XFS_WRITE_IMAPS;
- firstblock = NULLFSBLOCK;
- error = xfs_bmapi(NULL, ip, offset_fsb,
- (xfs_filblks_t)(last_fsb - offset_fsb),
- XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
- XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
- &nimaps, NULL);
+ error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
+ imap, &nimaps, XFS_BMAPI_ENTIRE);
switch (error) {
case 0:
case ENOSPC:
@@ -535,7 +528,7 @@ xfs_iomap_write_allocate(
return XFS_ERROR(error);
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
xfs_bmap_init(&free_list, &first_block);
@@ -587,14 +580,12 @@ xfs_iomap_write_allocate(
}
/*
- * Go get the actual blocks.
- *
* From this point onwards we overwrite the imap
* pointer that the caller gave to us.
*/
- error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
- XFS_BMAPI_WRITE, &first_block, 1,
- imap, &nimaps, &free_list);
+ error = xfs_bmapi_write(tp, ip, map_start_fsb,
+ count_fsb, 0, &first_block, 1,
+ imap, &nimaps, &free_list);
if (error)
goto trans_cancel;
@@ -701,15 +692,15 @@ xfs_iomap_write_unwritten(
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
/*
* Modify the unwritten extent state of the buffer.
*/
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
- error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
+ error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+ XFS_BMAPI_CONVERT, &firstfsb,
1, &imap, &nimaps, &free_list);
if (error)
goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 673704fab74..23ce927973a 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -102,37 +102,38 @@ xfs_mark_inode_dirty(
}
+
+int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
+{
+ const struct xattr *xattr;
+ struct xfs_inode *ip = XFS_I(inode);
+ int error = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ error = xfs_attr_set(ip, xattr->name, xattr->value,
+ xattr->value_len, ATTR_SECURE);
+ if (error < 0)
+ break;
+ }
+ return error;
+}
+
/*
* Hook in SELinux. This is not quite correct yet, what we really need
* here (as we do for default ACLs) is a mechanism by which creation of
* these attrs can be journalled at inode creation time (along with the
* inode, of course, such that log replay can't cause these to be lost).
*/
+
STATIC int
xfs_init_security(
struct inode *inode,
struct inode *dir,
const struct qstr *qstr)
{
- struct xfs_inode *ip = XFS_I(inode);
- size_t length;
- void *value;
- unsigned char *name;
- int error;
-
- error = security_inode_init_security(inode, dir, qstr, (char **)&name,
- &value, &length);
- if (error) {
- if (error == -EOPNOTSUPP)
- return 0;
- return -error;
- }
-
- error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
-
- kfree(name);
- kfree(value);
- return error;
+ return security_inode_init_security(inode, dir, qstr,
+ &xfs_initxattrs, NULL);
}
static void
@@ -465,7 +466,7 @@ xfs_vn_getattr(
trace_xfs_getattr(ip);
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -XFS_ERROR(EIO);
stat->size = XFS_ISIZE(ip);
stat->dev = inode->i_sb->s_dev;
@@ -611,7 +612,7 @@ xfs_setattr_nonsize(
}
}
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
/*
* Change file ownership. Must be the owner or privileged.
@@ -833,16 +834,16 @@ xfs_setattr_size(
* care about here.
*/
if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
- error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size,
- XBF_ASYNC, FI_NONE);
+ error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0,
+ FI_NONE);
if (error)
goto out_unlock;
}
/*
- * Wait for all I/O to complete.
+ * Wait for all direct I/O to complete.
*/
- xfs_ioend_wait(ip);
+ inode_dio_wait(inode);
error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
xfs_get_blocks);
@@ -863,7 +864,7 @@ xfs_setattr_size(
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
/*
* Only change the c/mtime if we are changing the size or we are
@@ -1152,7 +1153,7 @@ xfs_setup_inode(
hlist_add_fake(&inode->i_hash);
inode->i_mode = ip->i_d.di_mode;
- inode->i_nlink = ip->i_d.di_nlink;
+ set_nlink(inode, ip->i_d.di_nlink);
inode->i_uid = ip->i_d.di_uid;
inode->i_gid = ip->i_d.di_gid;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3a8d4f66d70..34817adf4b9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -150,6 +150,117 @@ xlog_grant_add_space(
} while (head_val != old);
}
+STATIC bool
+xlog_reserveq_wake(
+ struct log *log,
+ int *free_bytes)
+{
+ struct xlog_ticket *tic;
+ int need_bytes;
+
+ list_for_each_entry(tic, &log->l_reserveq, t_queue) {
+ if (tic->t_flags & XLOG_TIC_PERM_RESERV)
+ need_bytes = tic->t_unit_res * tic->t_cnt;
+ else
+ need_bytes = tic->t_unit_res;
+
+ if (*free_bytes < need_bytes)
+ return false;
+ *free_bytes -= need_bytes;
+
+ trace_xfs_log_grant_wake_up(log, tic);
+ wake_up(&tic->t_wait);
+ }
+
+ return true;
+}
+
+STATIC bool
+xlog_writeq_wake(
+ struct log *log,
+ int *free_bytes)
+{
+ struct xlog_ticket *tic;
+ int need_bytes;
+
+ list_for_each_entry(tic, &log->l_writeq, t_queue) {
+ ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+
+ need_bytes = tic->t_unit_res;
+
+ if (*free_bytes < need_bytes)
+ return false;
+ *free_bytes -= need_bytes;
+
+ trace_xfs_log_regrant_write_wake_up(log, tic);
+ wake_up(&tic->t_wait);
+ }
+
+ return true;
+}
+
+STATIC int
+xlog_reserveq_wait(
+ struct log *log,
+ struct xlog_ticket *tic,
+ int need_bytes)
+{
+ list_add_tail(&tic->t_queue, &log->l_reserveq);
+
+ do {
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto shutdown;
+ xlog_grant_push_ail(log, need_bytes);
+
+ XFS_STATS_INC(xs_sleep_logspace);
+ trace_xfs_log_grant_sleep(log, tic);
+
+ xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+ trace_xfs_log_grant_wake(log, tic);
+
+ spin_lock(&log->l_grant_reserve_lock);
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto shutdown;
+ } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes);
+
+ list_del_init(&tic->t_queue);
+ return 0;
+shutdown:
+ list_del_init(&tic->t_queue);
+ return XFS_ERROR(EIO);
+}
+
+STATIC int
+xlog_writeq_wait(
+ struct log *log,
+ struct xlog_ticket *tic,
+ int need_bytes)
+{
+ list_add_tail(&tic->t_queue, &log->l_writeq);
+
+ do {
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto shutdown;
+ xlog_grant_push_ail(log, need_bytes);
+
+ XFS_STATS_INC(xs_sleep_logspace);
+ trace_xfs_log_regrant_write_sleep(log, tic);
+
+ xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
+ trace_xfs_log_regrant_write_wake(log, tic);
+
+ spin_lock(&log->l_grant_write_lock);
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto shutdown;
+ } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes);
+
+ list_del_init(&tic->t_queue);
+ return 0;
+shutdown:
+ list_del_init(&tic->t_queue);
+ return XFS_ERROR(EIO);
+}
+
static void
xlog_tic_reset_res(xlog_ticket_t *tic)
{
@@ -350,8 +461,19 @@ xfs_log_reserve(
retval = xlog_grant_log_space(log, internal_ticket);
}
+ if (unlikely(retval)) {
+ /*
+ * If we are failing, make sure the ticket doesn't have any
+ * current reservations. We don't want to add this back
+ * when the ticket/ transaction gets cancelled.
+ */
+ internal_ticket->t_curr_res = 0;
+ /* ungrant will give back unit_res * t_cnt. */
+ internal_ticket->t_cnt = 0;
+ }
+
return retval;
-} /* xfs_log_reserve */
+}
/*
@@ -626,7 +748,7 @@ xfs_log_item_init(
struct xfs_mount *mp,
struct xfs_log_item *item,
int type,
- struct xfs_item_ops *ops)
+ const struct xfs_item_ops *ops)
{
item->li_mountp = mp;
item->li_ailp = mp->m_ail;
@@ -880,8 +1002,8 @@ xlog_iodone(xfs_buf_t *bp)
*/
if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp,
XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
- xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp));
- XFS_BUF_STALE(bp);
+ xfs_buf_ioerror_alert(bp, __func__);
+ xfs_buf_stale(bp);
xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
/*
* This flag will be propagated to the trans-committed
@@ -1047,7 +1169,7 @@ xlog_alloc_log(xfs_mount_t *mp,
xlog_get_iclog_buffer_size(mp, log);
error = ENOMEM;
- bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
+ bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0);
if (!bp)
goto out_free_log;
bp->b_iodone = xlog_iodone;
@@ -1247,7 +1369,7 @@ xlog_bdstrat(
if (iclog->ic_state & XLOG_STATE_IOERROR) {
xfs_buf_ioerror(bp, EIO);
- XFS_BUF_STALE(bp);
+ xfs_buf_stale(bp);
xfs_buf_ioend(bp, 0);
/*
* It would seem logical to return EIO here, but we rely on
@@ -1387,9 +1509,9 @@ xlog_sync(xlog_t *log,
*/
XFS_BUF_WRITE(bp);
- if ((error = xlog_bdstrat(bp))) {
- xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
- XFS_BUF_ADDR(bp));
+ error = xlog_bdstrat(bp);
+ if (error) {
+ xfs_buf_ioerror_alert(bp, "xlog_sync");
return error;
}
if (split) {
@@ -1423,9 +1545,9 @@ xlog_sync(xlog_t *log,
/* account for internal log which doesn't start at block #0 */
XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
XFS_BUF_WRITE(bp);
- if ((error = xlog_bdstrat(bp))) {
- xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
- bp, XFS_BUF_ADDR(bp));
+ error = xlog_bdstrat(bp);
+ if (error) {
+ xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
return error;
}
}
@@ -2481,8 +2603,8 @@ restart:
/*
* Atomically get the log space required for a log ticket.
*
- * Once a ticket gets put onto the reserveq, it will only return after
- * the needed reservation is satisfied.
+ * Once a ticket gets put onto the reserveq, it will only return after the
+ * needed reservation is satisfied.
*
* This function is structured so that it has a lock free fast path. This is
* necessary because every new transaction reservation will come through this
@@ -2490,113 +2612,53 @@ restart:
* every pass.
*
* As tickets are only ever moved on and off the reserveq under the
- * l_grant_reserve_lock, we only need to take that lock if we are going
- * to add the ticket to the queue and sleep. We can avoid taking the lock if the
- * ticket was never added to the reserveq because the t_queue list head will be
- * empty and we hold the only reference to it so it can safely be checked
- * unlocked.
+ * l_grant_reserve_lock, we only need to take that lock if we are going to add
+ * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
+ * was never added to the reserveq because the t_queue list head will be empty
+ * and we hold the only reference to it so it can safely be checked unlocked.
*/
STATIC int
-xlog_grant_log_space(xlog_t *log,
- xlog_ticket_t *tic)
+xlog_grant_log_space(
+ struct log *log,
+ struct xlog_ticket *tic)
{
- int free_bytes;
- int need_bytes;
+ int free_bytes, need_bytes;
+ int error = 0;
-#ifdef DEBUG
- if (log->l_flags & XLOG_ACTIVE_RECOVERY)
- panic("grant Recovery problem");
-#endif
+ ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
trace_xfs_log_grant_enter(log, tic);
+ /*
+ * If there are other waiters on the queue then give them a chance at
+ * logspace before us. Wake up the first waiters, if we do not wake
+ * up all the waiters then go to sleep waiting for more free space,
+ * otherwise try to get some space for this transaction.
+ */
need_bytes = tic->t_unit_res;
if (tic->t_flags & XFS_LOG_PERM_RESERV)
need_bytes *= tic->t_ocnt;
-
- /* something is already sleeping; insert new transaction at end */
- if (!list_empty_careful(&log->l_reserveq)) {
- spin_lock(&log->l_grant_reserve_lock);
- /* recheck the queue now we are locked */
- if (list_empty(&log->l_reserveq)) {
- spin_unlock(&log->l_grant_reserve_lock);
- goto redo;
- }
- list_add_tail(&tic->t_queue, &log->l_reserveq);
-
- trace_xfs_log_grant_sleep1(log, tic);
-
- /*
- * Gotta check this before going to sleep, while we're
- * holding the grant lock.
- */
- if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return;
-
- XFS_STATS_INC(xs_sleep_logspace);
- xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
-
- /*
- * If we got an error, and the filesystem is shutting down,
- * we'll catch it down below. So just continue...
- */
- trace_xfs_log_grant_wake1(log, tic);
- }
-
-redo:
- if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return_unlocked;
-
free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
- if (free_bytes < need_bytes) {
+ if (!list_empty_careful(&log->l_reserveq)) {
spin_lock(&log->l_grant_reserve_lock);
- if (list_empty(&tic->t_queue))
- list_add_tail(&tic->t_queue, &log->l_reserveq);
-
- trace_xfs_log_grant_sleep2(log, tic);
-
- if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return;
-
- xlog_grant_push_ail(log, need_bytes);
-
- XFS_STATS_INC(xs_sleep_logspace);
- xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
-
- trace_xfs_log_grant_wake2(log, tic);
- goto redo;
- }
-
- if (!list_empty(&tic->t_queue)) {
+ if (!xlog_reserveq_wake(log, &free_bytes) ||
+ free_bytes < need_bytes)
+ error = xlog_reserveq_wait(log, tic, need_bytes);
+ spin_unlock(&log->l_grant_reserve_lock);
+ } else if (free_bytes < need_bytes) {
spin_lock(&log->l_grant_reserve_lock);
- list_del_init(&tic->t_queue);
+ error = xlog_reserveq_wait(log, tic, need_bytes);
spin_unlock(&log->l_grant_reserve_lock);
}
+ if (error)
+ return error;
- /* we've got enough space */
xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
trace_xfs_log_grant_exit(log, tic);
xlog_verify_grant_tail(log);
return 0;
-
-error_return_unlocked:
- spin_lock(&log->l_grant_reserve_lock);
-error_return:
- list_del_init(&tic->t_queue);
- spin_unlock(&log->l_grant_reserve_lock);
- trace_xfs_log_grant_error(log, tic);
-
- /*
- * If we are failing, make sure the ticket doesn't have any
- * current reservations. We don't want to add this back when
- * the ticket/transaction gets cancelled.
- */
- tic->t_curr_res = 0;
- tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
- return XFS_ERROR(EIO);
-} /* xlog_grant_log_space */
-
+}
/*
* Replenish the byte reservation required by moving the grant write head.
@@ -2605,10 +2667,12 @@ error_return:
* free fast path.
*/
STATIC int
-xlog_regrant_write_log_space(xlog_t *log,
- xlog_ticket_t *tic)
+xlog_regrant_write_log_space(
+ struct log *log,
+ struct xlog_ticket *tic)
{
- int free_bytes, need_bytes;
+ int free_bytes, need_bytes;
+ int error = 0;
tic->t_curr_res = tic->t_unit_res;
xlog_tic_reset_res(tic);
@@ -2616,104 +2680,38 @@ xlog_regrant_write_log_space(xlog_t *log,
if (tic->t_cnt > 0)
return 0;
-#ifdef DEBUG
- if (log->l_flags & XLOG_ACTIVE_RECOVERY)
- panic("regrant Recovery problem");
-#endif
+ ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
trace_xfs_log_regrant_write_enter(log, tic);
- if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return_unlocked;
- /* If there are other waiters on the queue then give them a
- * chance at logspace before us. Wake up the first waiters,
- * if we do not wake up all the waiters then go to sleep waiting
- * for more free space, otherwise try to get some space for
- * this transaction.
+ /*
+ * If there are other waiters on the queue then give them a chance at
+ * logspace before us. Wake up the first waiters, if we do not wake
+ * up all the waiters then go to sleep waiting for more free space,
+ * otherwise try to get some space for this transaction.
*/
need_bytes = tic->t_unit_res;
- if (!list_empty_careful(&log->l_writeq)) {
- struct xlog_ticket *ntic;
-
- spin_lock(&log->l_grant_write_lock);
- free_bytes = xlog_space_left(log, &log->l_grant_write_head);
- list_for_each_entry(ntic, &log->l_writeq, t_queue) {
- ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
-
- if (free_bytes < ntic->t_unit_res)
- break;
- free_bytes -= ntic->t_unit_res;
- wake_up(&ntic->t_wait);
- }
-
- if (ntic != list_first_entry(&log->l_writeq,
- struct xlog_ticket, t_queue)) {
- if (list_empty(&tic->t_queue))
- list_add_tail(&tic->t_queue, &log->l_writeq);
- trace_xfs_log_regrant_write_sleep1(log, tic);
-
- xlog_grant_push_ail(log, need_bytes);
-
- XFS_STATS_INC(xs_sleep_logspace);
- xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
- trace_xfs_log_regrant_write_wake1(log, tic);
- } else
- spin_unlock(&log->l_grant_write_lock);
- }
-
-redo:
- if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return_unlocked;
-
free_bytes = xlog_space_left(log, &log->l_grant_write_head);
- if (free_bytes < need_bytes) {
+ if (!list_empty_careful(&log->l_writeq)) {
spin_lock(&log->l_grant_write_lock);
- if (list_empty(&tic->t_queue))
- list_add_tail(&tic->t_queue, &log->l_writeq);
-
- if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return;
-
- xlog_grant_push_ail(log, need_bytes);
-
- XFS_STATS_INC(xs_sleep_logspace);
- trace_xfs_log_regrant_write_sleep2(log, tic);
- xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
-
- trace_xfs_log_regrant_write_wake2(log, tic);
- goto redo;
- }
-
- if (!list_empty(&tic->t_queue)) {
+ if (!xlog_writeq_wake(log, &free_bytes) ||
+ free_bytes < need_bytes)
+ error = xlog_writeq_wait(log, tic, need_bytes);
+ spin_unlock(&log->l_grant_write_lock);
+ } else if (free_bytes < need_bytes) {
spin_lock(&log->l_grant_write_lock);
- list_del_init(&tic->t_queue);
+ error = xlog_writeq_wait(log, tic, need_bytes);
spin_unlock(&log->l_grant_write_lock);
}
- /* we've got enough space */
+ if (error)
+ return error;
+
xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
trace_xfs_log_regrant_write_exit(log, tic);
xlog_verify_grant_tail(log);
return 0;
-
-
- error_return_unlocked:
- spin_lock(&log->l_grant_write_lock);
- error_return:
- list_del_init(&tic->t_queue);
- spin_unlock(&log->l_grant_write_lock);
- trace_xfs_log_regrant_write_error(log, tic);
-
- /*
- * If we are failing, make sure the ticket doesn't have any
- * current reservations. We don't want to add this back when
- * the ticket/transaction gets cancelled.
- */
- tic->t_curr_res = 0;
- tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
- return XFS_ERROR(EIO);
-} /* xlog_regrant_write_log_space */
-
+}
/* The first cnt-1 times through here we don't need to
* move the grant write head because the permanent
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 78c9039994a..3f7bf451c03 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -137,7 +137,7 @@ struct xfs_trans;
void xfs_log_item_init(struct xfs_mount *mp,
struct xfs_log_item *item,
int type,
- struct xfs_item_ops *ops);
+ const struct xfs_item_ops *ops);
xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
struct xlog_ticket *ticket,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a199dbcee7d..541a508adea 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -183,8 +183,7 @@ xlog_bread_noalign(
xfsbdstrat(log->l_mp, bp);
error = xfs_buf_iowait(bp);
if (error)
- xfs_ioerror_alert("xlog_bread", log->l_mp,
- bp, XFS_BUF_ADDR(bp));
+ xfs_buf_ioerror_alert(bp, __func__);
return error;
}
@@ -268,9 +267,10 @@ xlog_bwrite(
xfs_buf_lock(bp);
XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
- if ((error = xfs_bwrite(log->l_mp, bp)))
- xfs_ioerror_alert("xlog_bwrite", log->l_mp,
- bp, XFS_BUF_ADDR(bp));
+ error = xfs_bwrite(bp);
+ if (error)
+ xfs_buf_ioerror_alert(bp, __func__);
+ xfs_buf_relse(bp);
return error;
}
@@ -361,9 +361,7 @@ xlog_recover_iodone(
* We're not going to bother about retrying
* this during recovery. One strike!
*/
- xfs_ioerror_alert("xlog_recover_iodone",
- bp->b_target->bt_mount, bp,
- XFS_BUF_ADDR(bp));
+ xfs_buf_ioerror_alert(bp, __func__);
xfs_force_shutdown(bp->b_target->bt_mount,
SHUTDOWN_META_IO_ERROR);
}
@@ -2135,8 +2133,7 @@ xlog_recover_buffer_pass2(
return XFS_ERROR(ENOMEM);
error = bp->b_error;
if (error) {
- xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
- bp, buf_f->blf_blkno);
+ xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
xfs_buf_relse(bp);
return error;
}
@@ -2171,15 +2168,16 @@ xlog_recover_buffer_pass2(
be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
(XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
- XFS_BUF_STALE(bp);
- error = xfs_bwrite(mp, bp);
+ xfs_buf_stale(bp);
+ error = xfs_bwrite(bp);
} else {
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
- xfs_bdwrite(mp, bp);
+ xfs_buf_delwri_queue(bp);
}
- return (error);
+ xfs_buf_relse(bp);
+ return error;
}
STATIC int
@@ -2230,8 +2228,7 @@ xlog_recover_inode_pass2(
}
error = bp->b_error;
if (error) {
- xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
- bp, in_f->ilf_blkno);
+ xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
xfs_buf_relse(bp);
goto error;
}
@@ -2439,7 +2436,8 @@ xlog_recover_inode_pass2(
write_inode_buffer:
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
- xfs_bdwrite(mp, bp);
+ xfs_buf_delwri_queue(bp);
+ xfs_buf_relse(bp);
error:
if (need_free)
kmem_free(in_f);
@@ -2537,8 +2535,7 @@ xlog_recover_dquot_pass2(
XFS_FSB_TO_BB(mp, dq_f->qlf_len),
0, &bp);
if (error) {
- xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
- bp, dq_f->qlf_blkno);
+ xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)");
return error;
}
ASSERT(bp);
@@ -2561,7 +2558,8 @@ xlog_recover_dquot_pass2(
ASSERT(dq_f->qlf_size == 2);
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
- xfs_bdwrite(mp, bp);
+ xfs_buf_delwri_queue(bp);
+ xfs_buf_relse(bp);
return (0);
}
@@ -3656,7 +3654,7 @@ xlog_do_recover(
return error;
}
- XFS_bflush(log->l_mp->m_ddev_targp);
+ xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1);
/*
* If IO errors happened during recovery, bail out.
@@ -3689,8 +3687,7 @@ xlog_do_recover(
xfsbdstrat(log->l_mp, bp);
error = xfs_buf_iowait(bp);
if (error) {
- xfs_ioerror_alert("xlog_do_recover",
- log->l_mp, bp, XFS_BUF_ADDR(bp));
+ xfs_buf_ioerror_alert(bp, __func__);
ASSERT(0);
xfs_buf_relse(bp);
return error;
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 7fb7ea00767..56dc0c17f16 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -3,31 +3,29 @@
struct xfs_mount;
-extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
-extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
-extern void xfs_alert_tag(const struct xfs_mount *mp, int tag,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
-extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
-extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
-extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
-extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
-extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
+extern __printf(2, 3)
+void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(3, 4)
+void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_err(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_info(const struct xfs_mount *mp, const char *fmt, ...);
#ifdef DEBUG
-extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
+extern __printf(2, 3)
+void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...);
#else
-static inline void
-__attribute__ ((format (printf, 2, 3)))
-xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+static inline __printf(2, 3)
+void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
{
}
#endif
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0081657ad98..d06afbc3540 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -44,9 +44,6 @@
#include "xfs_trace.h"
-STATIC void xfs_unmountfs_wait(xfs_mount_t *);
-
-
#ifdef HAVE_PERCPU_SB
STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
int);
@@ -1484,7 +1481,7 @@ xfs_unmountfs(
* state as much as possible.
*/
xfs_reclaim_inodes(mp, 0);
- XFS_bflush(mp->m_ddev_targp);
+ xfs_flush_buftarg(mp->m_ddev_targp, 1);
xfs_reclaim_inodes(mp, SYNC_WAIT);
xfs_qm_unmount(mp);
@@ -1496,11 +1493,6 @@ xfs_unmountfs(
*/
xfs_log_force(mp, XFS_LOG_SYNC);
- xfs_binval(mp->m_ddev_targp);
- if (mp->m_rtdev_targp) {
- xfs_binval(mp->m_rtdev_targp);
- }
-
/*
* Unreserve any blocks we have so that when we unmount we don't account
* the reserved free space as used. This is really only necessary for
@@ -1526,7 +1518,16 @@ xfs_unmountfs(
xfs_warn(mp, "Unable to update superblock counters. "
"Freespace may not be correct on next mount.");
xfs_unmountfs_writesb(mp);
- xfs_unmountfs_wait(mp); /* wait for async bufs */
+
+ /*
+ * Make sure all buffers have been flushed and completed before
+ * unmounting the log.
+ */
+ error = xfs_flush_buftarg(mp->m_ddev_targp, 1);
+ if (error)
+ xfs_warn(mp, "%d busy buffers during unmount.", error);
+ xfs_wait_buftarg(mp->m_ddev_targp);
+
xfs_log_unmount_write(mp);
xfs_log_unmount(mp);
xfs_uuid_unmount(mp);
@@ -1537,16 +1538,6 @@ xfs_unmountfs(
xfs_free_perag(mp);
}
-STATIC void
-xfs_unmountfs_wait(xfs_mount_t *mp)
-{
- if (mp->m_logdev_targp != mp->m_ddev_targp)
- xfs_wait_buftarg(mp->m_logdev_targp);
- if (mp->m_rtdev_targp)
- xfs_wait_buftarg(mp->m_rtdev_targp);
- xfs_wait_buftarg(mp->m_ddev_targp);
-}
-
int
xfs_fs_writable(xfs_mount_t *mp)
{
@@ -1612,15 +1603,14 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
XFS_BUF_UNDONE(sbp);
XFS_BUF_UNREAD(sbp);
- XFS_BUF_UNDELAYWRITE(sbp);
+ xfs_buf_delwri_dequeue(sbp);
XFS_BUF_WRITE(sbp);
XFS_BUF_UNASYNC(sbp);
ASSERT(sbp->b_target == mp->m_ddev_targp);
xfsbdstrat(mp, sbp);
error = xfs_buf_iowait(sbp);
if (error)
- xfs_ioerror_alert("xfs_unmountfs_writesb",
- mp, sbp, XFS_BUF_ADDR(sbp));
+ xfs_buf_ioerror_alert(sbp, __func__);
xfs_buf_relse(sbp);
}
return error;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 9a0aa76facd..0bbb1a41998 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -674,7 +674,8 @@ xfs_qm_dqattach_one(
* disk and we didn't ask it to allocate;
* ESRCH if quotas got turned off suddenly.
*/
- error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
+ error = xfs_qm_dqget(ip->i_mount, ip, id, type,
+ doalloc | XFS_QMOPT_DOWARN, &dqp);
if (error)
return error;
@@ -1296,7 +1297,8 @@ xfs_qm_dqiter_bufs(
break;
xfs_qm_reset_dqcounts(mp, bp, firstid, type);
- xfs_bdwrite(mp, bp);
+ xfs_buf_delwri_queue(bp);
+ xfs_buf_relse(bp);
/*
* goto the next block.
*/
@@ -1346,11 +1348,8 @@ xfs_qm_dqiterate(
* the inode is never added to the transaction.
*/
xfs_ilock(qip, XFS_ILOCK_SHARED);
- error = xfs_bmapi(NULL, qip, lblkno,
- maxlblkcnt - lblkno,
- XFS_BMAPI_METADATA,
- NULL,
- 0, map, &nmaps, NULL);
+ error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno,
+ map, &nmaps, 0);
xfs_iunlock(qip, XFS_ILOCK_SHARED);
if (error)
break;
@@ -1683,7 +1682,7 @@ xfs_qm_quotacheck(
* quotacheck'd stamp on the superblock. So, here we do a synchronous
* flush.
*/
- XFS_bflush(mp->m_ddev_targp);
+ xfs_flush_buftarg(mp->m_ddev_targp, 1);
/*
* If one type of quotas is off, then it will lose its
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 609246f42e6..5cc3dde1bc9 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -261,7 +261,7 @@ xfs_qm_scall_trunc_qfile(
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
error = xfs_itruncate_data(&tp, ip, 0);
if (error) {
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index df78c297d1a..866de277079 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -170,12 +170,12 @@ xfs_rename(
* we can rely on either trans_commit or trans_cancel to unlock
* them.
*/
- xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
if (new_parent)
- xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
if (target_ip)
- xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
/*
* If we are using project inheritance, we only allow renames
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 35561a511b5..87323f1ded6 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -112,7 +112,7 @@ xfs_growfs_rt_alloc(
* Lock the inode.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_bmap_init(&flist, &firstblock);
/*
@@ -120,9 +120,9 @@ xfs_growfs_rt_alloc(
*/
nmap = 1;
cancelflags |= XFS_TRANS_ABORT;
- error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
- XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
- resblks, &map, &nmap, &flist);
+ error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
+ XFS_BMAPI_METADATA, &firstblock,
+ resblks, &map, &nmap, &flist);
if (!error && nmap < 1)
error = XFS_ERROR(ENOSPC);
if (error)
@@ -155,7 +155,7 @@ xfs_growfs_rt_alloc(
* Lock the bitmap inode.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
/*
* Get a buffer for the block.
*/
@@ -856,33 +856,23 @@ xfs_rtbuf_get(
xfs_buf_t **bpp) /* output: buffer for the block */
{
xfs_buf_t *bp; /* block buffer, result */
- xfs_daddr_t d; /* disk addr of block */
- int error; /* error value */
- xfs_fsblock_t fsb; /* fs block number for block */
xfs_inode_t *ip; /* bitmap or summary inode */
+ xfs_bmbt_irec_t map;
+ int nmap;
+ int error; /* error value */
ip = issum ? mp->m_rsumip : mp->m_rbmip;
- /*
- * Map from the file offset (block) and inode number to the
- * file system block.
- */
- error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block);
- if (error) {
+
+ error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
+ if (error)
return error;
- }
- ASSERT(fsb != NULLFSBLOCK);
- /*
- * Convert to disk address for buffer cache.
- */
- d = XFS_FSB_TO_DADDR(mp, fsb);
- /*
- * Read the buffer.
- */
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+
+ ASSERT(map.br_startblock != NULLFSBLOCK);
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, map.br_startblock),
mp->m_bsize, 0, &bp);
- if (error) {
+ if (error)
return error;
- }
ASSERT(!xfs_buf_geterror(bp));
*bpp = bp;
return 0;
@@ -1970,7 +1960,7 @@ xfs_growfs_rt(
* Lock out other callers by grabbing the bitmap inode lock.
*/
xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
/*
* Update the bitmap inode's size.
*/
@@ -1982,7 +1972,7 @@ xfs_growfs_rt(
* Get the summary inode into the transaction.
*/
xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
/*
* Update the summary inode's size.
*/
@@ -2153,7 +2143,7 @@ xfs_rtfree_extent(
* Synchronize by locking the bitmap inode.
*/
xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
#if defined(__KERNEL__) && defined(DEBUG)
/*
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index c96a8a05ac0..597d044a09a 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -92,24 +92,6 @@ xfs_do_force_shutdown(
}
/*
- * Prints out an ALERT message about I/O error.
- */
-void
-xfs_ioerror_alert(
- char *func,
- struct xfs_mount *mp,
- xfs_buf_t *bp,
- xfs_daddr_t blkno)
-{
- xfs_alert(mp,
- "I/O error occurred: meta-data dev %s block 0x%llx"
- " (\"%s\") error %d buf count %zd",
- xfs_buf_target_name(bp->b_target),
- (__uint64_t)blkno, func,
- bp->b_error, XFS_BUF_COUNT(bp));
-}
-
-/*
* This isn't an absolute requirement, but it is
* just a good idea to call xfs_read_buf instead of
* directly doing a read_buf call. For one, we shouldn't
@@ -143,14 +125,13 @@ xfs_read_buf(
} else {
*bpp = NULL;
if (error) {
- xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp));
+ xfs_buf_ioerror_alert(bp, __func__);
} else {
error = XFS_ERROR(EIO);
}
if (bp) {
XFS_BUF_UNDONE(bp);
- XFS_BUF_UNDELAYWRITE(bp);
- XFS_BUF_STALE(bp);
+ xfs_buf_stale(bp);
/*
* brelse clears B_ERROR and b_error
*/
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index 11c41ec6ed7..bbdb9ad6a4b 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -42,8 +42,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
xfs_daddr_t blkno, int len, uint flags,
struct xfs_buf **bpp);
-extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
- xfs_buf_t *bp, xfs_daddr_t blkno);
extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 5cf06b85fd9..8a899496fd5 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -796,8 +796,6 @@ xfs_fs_destroy_inode(
if (is_bad_inode(inode))
goto out_reclaim;
- xfs_ioend_wait(ip);
-
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
/*
@@ -837,7 +835,6 @@ xfs_fs_inode_init_once(
inode_init_once(VFS_I(ip));
/* xfs inode */
- atomic_set(&ip->i_iocount, 0);
atomic_set(&ip->i_pincount, 0);
spin_lock_init(&ip->i_flags_lock);
init_waitqueue_head(&ip->i_ipin_wait);
@@ -872,27 +869,6 @@ xfs_fs_dirty_inode(
}
STATIC int
-xfs_log_inode(
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
- int error;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return error;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- return xfs_trans_commit(tp, 0);
-}
-
-STATIC int
xfs_fs_write_inode(
struct inode *inode,
struct writeback_control *wbc)
@@ -905,23 +881,23 @@ xfs_fs_write_inode(
if (XFS_FORCED_SHUTDOWN(mp))
return -XFS_ERROR(EIO);
- if (!ip->i_update_core)
- return 0;
- if (wbc->sync_mode == WB_SYNC_ALL) {
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) {
/*
* Make sure the inode has made it it into the log. Instead
* of forcing it all the way to stable storage using a
* synchronous transaction we let the log force inside the
* ->sync_fs call do that for thus, which reduces the number
- * of synchronous log foces dramatically.
+ * of synchronous log forces dramatically.
*/
- xfs_ioend_wait(ip);
- error = xfs_log_inode(ip);
+ error = xfs_log_dirty_inode(ip, NULL, 0);
if (error)
goto out;
return 0;
} else {
+ if (!ip->i_update_core)
+ return 0;
+
/*
* We make this non-blocking if the inode is contended, return
* EAGAIN to indicate to the caller that they did not succeed.
@@ -1019,7 +995,7 @@ xfs_fs_put_super(
*/
xfs_filestream_unmount(mp);
- XFS_bflush(mp->m_ddev_targp);
+ xfs_flush_buftarg(mp->m_ddev_targp, 1);
xfs_unmountfs(mp);
xfs_freesb(mp);
@@ -1443,7 +1419,7 @@ xfs_fs_fill_super(
*/
xfs_filestream_unmount(mp);
- XFS_bflush(mp->m_ddev_targp);
+ xfs_flush_buftarg(mp->m_ddev_targp, 1);
xfs_unmountfs(mp);
goto out_free_sb;
@@ -1670,7 +1646,6 @@ init_xfs_fs(void)
printk(KERN_INFO XFS_VERSION_STRING " with "
XFS_BUILD_OPTIONS " enabled\n");
- xfs_ioend_init();
xfs_dir_startup();
error = xfs_init_zones();
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 4604f90f86a..f0994aedcd1 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -227,21 +227,17 @@ xfs_sync_inode_data(
int error = 0;
if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- goto out_wait;
+ return 0;
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
if (flags & SYNC_TRYLOCK)
- goto out_wait;
+ return 0;
xfs_ilock(ip, XFS_IOLOCK_SHARED);
}
error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
0 : XBF_ASYNC, FI_NONE);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
- out_wait:
- if (flags & SYNC_WAIT)
- xfs_ioend_wait(ip);
return error;
}
@@ -322,6 +318,7 @@ xfs_sync_fsdata(
struct xfs_mount *mp)
{
struct xfs_buf *bp;
+ int error;
/*
* If the buffer is pinned then push on the log so we won't get stuck
@@ -334,8 +331,35 @@ xfs_sync_fsdata(
bp = xfs_getsb(mp, 0);
if (xfs_buf_ispinned(bp))
xfs_log_force(mp, 0);
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ return error;
+}
+
+int
+xfs_log_dirty_inode(
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ if (!ip->i_update_core)
+ return 0;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+ error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
- return xfs_bwrite(mp, bp);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return xfs_trans_commit(tp, 0);
}
/*
@@ -361,6 +385,16 @@ xfs_quiesce_data(
{
int error, error2 = 0;
+ /*
+ * Log all pending size and timestamp updates. The vfs writeback
+ * code is supposed to do this, but due to its overagressive
+ * livelock detection it will skip inodes where appending writes
+ * were written out in the first non-blocking sync phase if their
+ * completion took long enough that it happened after taking the
+ * timestamp for the cut-off in the blocking phase.
+ */
+ xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
+
xfs_qm_sync(mp, SYNC_TRYLOCK);
xfs_qm_sync(mp, SYNC_WAIT);
@@ -379,7 +413,7 @@ xfs_quiesce_data(
/* flush data-only devices */
if (mp->m_rtdev_targp)
- XFS_bflush(mp->m_rtdev_targp);
+ xfs_flush_buftarg(mp->m_rtdev_targp, 1);
return error ? error : error2;
}
@@ -772,6 +806,17 @@ restart:
if (!xfs_iflock_nowait(ip)) {
if (!(sync_mode & SYNC_WAIT))
goto out;
+
+ /*
+ * If we only have a single dirty inode in a cluster there is
+ * a fair chance that the AIL push may have pushed it into
+ * the buffer, but xfsbufd won't touch it until 30 seconds
+ * from now, and thus we will lock up here.
+ *
+ * Promote the inode buffer to the front of the delwri list
+ * and wake up xfsbufd now.
+ */
+ xfs_promote_inode(ip);
xfs_iflock(ip);
}
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 941202e7ac6..fa965479d78 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -34,6 +34,8 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
void xfs_flush_inodes(struct xfs_inode *ip);
+int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags);
+
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
int xfs_reclaim_inodes_count(struct xfs_mount *mp);
void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 690fc7a7bd7..49403579887 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -30,6 +30,7 @@ struct xfs_buf_log_item;
struct xfs_da_args;
struct xfs_da_node_entry;
struct xfs_dquot;
+struct xfs_log_item;
struct xlog_ticket;
struct log;
struct xlog_recover;
@@ -320,7 +321,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele);
DEFINE_BUF_EVENT(xfs_buf_iodone);
DEFINE_BUF_EVENT(xfs_buf_iorequest);
DEFINE_BUF_EVENT(xfs_buf_bawrite);
-DEFINE_BUF_EVENT(xfs_buf_bdwrite);
DEFINE_BUF_EVENT(xfs_buf_lock);
DEFINE_BUF_EVENT(xfs_buf_lock_done);
DEFINE_BUF_EVENT(xfs_buf_trylock);
@@ -577,6 +577,7 @@ DEFINE_INODE_EVENT(xfs_vm_bmap);
DEFINE_INODE_EVENT(xfs_file_ioctl);
DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
DEFINE_INODE_EVENT(xfs_ioctl_setattr);
+DEFINE_INODE_EVENT(xfs_dir_fsync);
DEFINE_INODE_EVENT(xfs_file_fsync);
DEFINE_INODE_EVENT(xfs_destroy_inode);
DEFINE_INODE_EVENT(xfs_write_inode);
@@ -833,18 +834,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
@@ -853,6 +850,42 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
+DECLARE_EVENT_CLASS(xfs_log_item_class,
+ TP_PROTO(struct xfs_log_item *lip),
+ TP_ARGS(lip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(void *, lip)
+ __field(uint, type)
+ __field(uint, flags)
+ __field(xfs_lsn_t, lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = lip->li_mountp->m_super->s_dev;
+ __entry->lip = lip;
+ __entry->type = lip->li_type;
+ __entry->flags = lip->li_flags;
+ __entry->lsn = lip->li_lsn;
+ ),
+ TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->lip,
+ CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn),
+ __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+ __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
+)
+
+#define DEFINE_LOG_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_log_item_class, name, \
+ TP_PROTO(struct xfs_log_item *lip), \
+ TP_ARGS(lip))
+DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
+
+
DECLARE_EVENT_CLASS(xfs_file_class,
TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
TP_ARGS(ip, count, offset, flags),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index efc147f0e9b..1f35b2feca9 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1790,9 +1790,7 @@ xfs_trans_commit_cil(
}
/*
- * xfs_trans_commit
- *
- * Commit the given transaction to the log a/synchronously.
+ * Commit the given transaction to the log.
*
* XFS disk error handling mechanism is not based on a typical
* transaction abort mechanism. Logically after the filesystem
@@ -1804,10 +1802,9 @@ xfs_trans_commit_cil(
* Do not reference the transaction structure after this call.
*/
int
-_xfs_trans_commit(
+xfs_trans_commit(
struct xfs_trans *tp,
- uint flags,
- int *log_flushed)
+ uint flags)
{
struct xfs_mount *mp = tp->t_mountp;
xfs_lsn_t commit_lsn = -1;
@@ -1866,7 +1863,7 @@ _xfs_trans_commit(
if (sync) {
if (!error) {
error = _xfs_log_force_lsn(mp, commit_lsn,
- XFS_LOG_SYNC, log_flushed);
+ XFS_LOG_SYNC, NULL);
}
XFS_STATS_INC(xs_trans_sync);
} else {
@@ -2021,6 +2018,6 @@ xfs_trans_roll(
if (error)
return error;
- xfs_trans_ijoin(trans, dp);
+ xfs_trans_ijoin(trans, dp, 0);
return 0;
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 53597f4db9b..3ae713c0abd 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -326,7 +326,7 @@ typedef struct xfs_log_item {
struct xfs_log_item *);
/* buffer item iodone */
/* callback func */
- struct xfs_item_ops *li_ops; /* function list */
+ const struct xfs_item_ops *li_ops; /* function list */
/* delayed logging */
struct list_head li_cil; /* CIL pointers */
@@ -341,7 +341,7 @@ typedef struct xfs_log_item {
{ XFS_LI_IN_AIL, "IN_AIL" }, \
{ XFS_LI_ABORTED, "ABORTED" }
-typedef struct xfs_item_ops {
+struct xfs_item_ops {
uint (*iop_size)(xfs_log_item_t *);
void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
void (*iop_pin)(xfs_log_item_t *);
@@ -352,7 +352,7 @@ typedef struct xfs_item_ops {
void (*iop_push)(xfs_log_item_t *);
bool (*iop_pushbuf)(xfs_log_item_t *);
void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
-} xfs_item_ops_t;
+};
#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
@@ -470,8 +470,7 @@ void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
-void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
-void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
+void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
@@ -487,10 +486,7 @@ void xfs_trans_log_efd_extent(xfs_trans_t *,
struct xfs_efd_log_item *,
xfs_fsblock_t,
xfs_extlen_t);
-int _xfs_trans_commit(xfs_trans_t *,
- uint flags,
- int *);
-#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL)
+int xfs_trans_commit(xfs_trans_t *, uint flags);
void xfs_trans_cancel(xfs_trans_t *, int);
int xfs_trans_ail_init(struct xfs_mount *);
void xfs_trans_ail_destroy(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 3a1e7ca54c2..ed9252bcdac 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -26,6 +26,7 @@
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trans_priv.h"
+#include "xfs_trace.h"
#include "xfs_error.h"
#ifdef DEBUG
@@ -364,12 +365,24 @@ xfsaild_push(
xfs_lsn_t lsn;
xfs_lsn_t target;
long tout = 10;
- int flush_log = 0;
int stuck = 0;
int count = 0;
int push_xfsbufd = 0;
+ /*
+ * If last time we ran we encountered pinned items, force the log first
+ * and wait for it before pushing again.
+ */
spin_lock(&ailp->xa_lock);
+ if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush &&
+ !list_empty(&ailp->xa_ail)) {
+ ailp->xa_log_flush = 0;
+ spin_unlock(&ailp->xa_lock);
+ XFS_STATS_INC(xs_push_ail_flush);
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ spin_lock(&ailp->xa_lock);
+ }
+
target = ailp->xa_target;
lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn);
if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
@@ -413,16 +426,20 @@ xfsaild_push(
switch (lock_result) {
case XFS_ITEM_SUCCESS:
XFS_STATS_INC(xs_push_ail_success);
+ trace_xfs_ail_push(lip);
+
IOP_PUSH(lip);
ailp->xa_last_pushed_lsn = lsn;
break;
case XFS_ITEM_PUSHBUF:
XFS_STATS_INC(xs_push_ail_pushbuf);
+ trace_xfs_ail_pushbuf(lip);
if (!IOP_PUSHBUF(lip)) {
+ trace_xfs_ail_pushbuf_pinned(lip);
stuck++;
- flush_log = 1;
+ ailp->xa_log_flush++;
} else {
ailp->xa_last_pushed_lsn = lsn;
}
@@ -431,12 +448,15 @@ xfsaild_push(
case XFS_ITEM_PINNED:
XFS_STATS_INC(xs_push_ail_pinned);
+ trace_xfs_ail_pinned(lip);
+
stuck++;
- flush_log = 1;
+ ailp->xa_log_flush++;
break;
case XFS_ITEM_LOCKED:
XFS_STATS_INC(xs_push_ail_locked);
+ trace_xfs_ail_locked(lip);
stuck++;
break;
@@ -476,16 +496,6 @@ xfsaild_push(
xfs_trans_ail_cursor_done(ailp, &cur);
spin_unlock(&ailp->xa_lock);
- if (flush_log) {
- /*
- * If something we need to push out was pinned, then
- * push out the log so it will become unpinned and
- * move forward in the AIL.
- */
- XFS_STATS_INC(xs_push_ail_flush);
- xfs_log_force(mp, 0);
- }
-
if (push_xfsbufd) {
/* we've got delayed write buffers to flush */
wake_up_process(mp->m_ddev_targp->bt_task);
@@ -496,6 +506,7 @@ out_done:
if (!count) {
/* We're past our target or empty, so idle */
ailp->xa_last_pushed_lsn = 0;
+ ailp->xa_log_flush = 0;
tout = 50;
} else if (XFS_LSN_CMP(lsn, target) >= 0) {
@@ -514,9 +525,13 @@ out_done:
* were stuck.
*
* Backoff a bit more to allow some I/O to complete before
- * continuing from where we were.
+ * restarting from the start of the AIL. This prevents us
+ * from spinning on the same items, and if they are pinned will
+ * all the restart to issue a log force to unpin the stuck
+ * items.
*/
tout = 20;
+ ailp->xa_last_pushed_lsn = 0;
}
return tout;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 137e2b9e294..475a4ded4f4 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -160,8 +160,10 @@ xfs_trans_get_buf(xfs_trans_t *tp,
bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
if (bp != NULL) {
ASSERT(xfs_buf_islocked(bp));
- if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
- XFS_BUF_SUPER_STALE(bp);
+ if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
+ xfs_buf_stale(bp);
+ XFS_BUF_DONE(bp);
+ }
/*
* If the buffer is stale then it was binval'ed
@@ -294,8 +296,7 @@ xfs_trans_read_buf(
if (bp->b_error) {
error = bp->b_error;
- xfs_ioerror_alert("xfs_trans_read_buf", mp,
- bp, blkno);
+ xfs_buf_ioerror_alert(bp, __func__);
xfs_buf_relse(bp);
return error;
}
@@ -337,8 +338,7 @@ xfs_trans_read_buf(
xfsbdstrat(tp->t_mountp, bp);
error = xfs_buf_iowait(bp);
if (error) {
- xfs_ioerror_alert("xfs_trans_read_buf", mp,
- bp, blkno);
+ xfs_buf_ioerror_alert(bp, __func__);
xfs_buf_relse(bp);
/*
* We can gracefully recover from most read
@@ -387,9 +387,9 @@ xfs_trans_read_buf(
}
if (bp->b_error) {
error = bp->b_error;
- XFS_BUF_SUPER_STALE(bp);
- xfs_ioerror_alert("xfs_trans_read_buf", mp,
- bp, blkno);
+ xfs_buf_stale(bp);
+ XFS_BUF_DONE(bp);
+ xfs_buf_ioerror_alert(bp, __func__);
if (tp->t_flags & XFS_TRANS_DIRTY)
xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
xfs_buf_relse(bp);
@@ -643,13 +643,14 @@ xfs_trans_log_buf(xfs_trans_t *tp,
* inside the b_bdstrat callback so that this won't get written to
* disk.
*/
- XFS_BUF_DELAYWRITE(bp);
XFS_BUF_DONE(bp);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bp->b_iodone = xfs_buf_iodone_callbacks;
bip->bli_item.li_cb = xfs_buf_iodone;
+ xfs_buf_delwri_queue(bp);
+
trace_xfs_trans_log_buf(bip);
/*
@@ -738,8 +739,7 @@ xfs_trans_binval(
* We set the stale bit in the buffer as well since we're getting
* rid of it.
*/
- XFS_BUF_UNDELAYWRITE(bp);
- XFS_BUF_STALE(bp);
+ xfs_buf_stale(bp);
bip->bli_flags |= XFS_BLI_STALE;
bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index c8dea2fd7e6..32f0288ae10 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -47,11 +47,13 @@ xfs_trans_inode_broot_debug(
* Add a locked inode to the transaction.
*
* The inode must be locked, and it cannot be associated with any transaction.
+ * If lock_flags is non-zero the inode will be unlocked on transaction commit.
*/
void
xfs_trans_ijoin(
struct xfs_trans *tp,
- struct xfs_inode *ip)
+ struct xfs_inode *ip,
+ uint lock_flags)
{
xfs_inode_log_item_t *iip;
@@ -59,7 +61,9 @@ xfs_trans_ijoin(
if (ip->i_itemp == NULL)
xfs_inode_item_init(ip, ip->i_mount);
iip = ip->i_itemp;
+
ASSERT(iip->ili_lock_flags == 0);
+ iip->ili_lock_flags = lock_flags;
/*
* Get a log_item_desc to point at the new item.
@@ -70,25 +74,6 @@ xfs_trans_ijoin(
}
/*
- * Add a locked inode to the transaction.
- *
- *
- * Grabs a reference to the inode which will be dropped when the transaction
- * is committed. The inode will also be unlocked at that point. The inode
- * must be locked, and it cannot be associated with any transaction.
- */
-void
-xfs_trans_ijoin_ref(
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- uint lock_flags)
-{
- xfs_trans_ijoin(tp, ip);
- IHOLD(ip);
- ip->i_itemp->ili_lock_flags = lock_flags;
-}
-
-/*
* Transactional inode timestamp update. Requires the inode to be locked and
* joined to the transaction supplied. Relies on the transaction subsystem to
* track dirty state and update/writeback the inode accordingly.
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 22750b5e4a8..44820b9fcb4 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -70,6 +70,7 @@ struct xfs_ail {
struct list_head xa_cursors;
spinlock_t xa_lock;
xfs_lsn_t xa_last_pushed_lsn;
+ int xa_log_flush;
};
/*
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 51fc429527b..ce9268a2f56 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -72,8 +72,8 @@ xfs_readlink_bmap(
xfs_buf_t *bp;
int error = 0;
- error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
- mval, &nmaps, NULL);
+ error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps,
+ 0);
if (error)
goto out;
@@ -87,8 +87,7 @@ xfs_readlink_bmap(
return XFS_ERROR(ENOMEM);
error = bp->b_error;
if (error) {
- xfs_ioerror_alert("xfs_readlink",
- ip->i_mount, bp, XFS_BUF_ADDR(bp));
+ xfs_buf_ioerror_alert(bp, __func__);
xfs_buf_relse(bp);
goto out;
}
@@ -113,7 +112,7 @@ xfs_readlink(
char *link)
{
xfs_mount_t *mp = ip->i_mount;
- int pathlen;
+ xfs_fsize_t pathlen;
int error = 0;
trace_xfs_readlink(ip);
@@ -123,13 +122,19 @@ xfs_readlink(
xfs_ilock(ip, XFS_ILOCK_SHARED);
- ASSERT(S_ISLNK(ip->i_d.di_mode));
- ASSERT(ip->i_d.di_size <= MAXPATHLEN);
-
pathlen = ip->i_d.di_size;
if (!pathlen)
goto out;
+ if (pathlen < 0 || pathlen > MAXPATHLEN) {
+ xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
+ __func__, (unsigned long long) ip->i_ino,
+ (long long) pathlen);
+ ASSERT(0);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+
if (ip->i_df.if_flags & XFS_IFINLINE) {
memcpy(link, ip->i_df.if_u1.if_data, pathlen);
link[pathlen] = '\0';
@@ -178,8 +183,7 @@ xfs_free_eofblocks(
nimaps = 1;
xfs_ilock(ip, XFS_ILOCK_SHARED);
- error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
- NULL, 0, &imap, &nimaps, NULL);
+ error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!error && (nimaps != 0) &&
@@ -220,7 +224,7 @@ xfs_free_eofblocks(
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
error = xfs_itruncate_data(&tp, ip, ip->i_size);
if (error) {
@@ -289,7 +293,7 @@ xfs_inactive_symlink_rmt(
xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
size = (int)ip->i_d.di_size;
ip->i_d.di_size = 0;
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
/*
* Find the block(s) so we can inval and unmap them.
@@ -297,9 +301,9 @@ xfs_inactive_symlink_rmt(
done = 0;
xfs_bmap_init(&free_list, &first_block);
nmaps = ARRAY_SIZE(mval);
- if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
- XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
- &free_list)))
+ error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size),
+ mval, &nmaps, 0);
+ if (error)
goto error0;
/*
* Invalidate the block(s).
@@ -308,6 +312,10 @@ xfs_inactive_symlink_rmt(
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
+ if (!bp) {
+ error = ENOMEM;
+ goto error1;
+ }
xfs_trans_binval(tp, bp);
}
/*
@@ -333,7 +341,7 @@ xfs_inactive_symlink_rmt(
* Mark it dirty so it will be logged and moved forward in the log as
* part of every commit.
*/
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
/*
* Get a new, empty transaction to return to our caller.
@@ -466,7 +474,7 @@ xfs_inactive_attrs(
goto error_cancel;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
xfs_idestroy_fork(ip, XFS_ATTR_FORK);
ASSERT(ip->i_d.di_anextents == 0);
@@ -647,8 +655,6 @@ xfs_inactive(
if (truncate) {
xfs_ilock(ip, XFS_IOLOCK_EXCL);
- xfs_ioend_wait(ip);
-
error = xfs_trans_reserve(tp, 0,
XFS_ITRUNCATE_LOG_RES(mp),
0, XFS_TRANS_PERM_LOG_RES,
@@ -662,7 +668,7 @@ xfs_inactive(
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
error = xfs_itruncate_data(&tp, ip, 0);
if (error) {
@@ -686,7 +692,7 @@ xfs_inactive(
return VN_INACTIVE_CACHE;
}
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
} else {
error = xfs_trans_reserve(tp, 0,
XFS_IFREE_LOG_RES(mp),
@@ -699,7 +705,7 @@ xfs_inactive(
}
xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
}
/*
@@ -939,7 +945,7 @@ xfs_create(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
unlock_dp_on_error = B_FALSE;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1260,8 +1266,8 @@ xfs_remove(
xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
/*
* If we're removing a directory perform some additional validation.
@@ -1406,8 +1412,8 @@ xfs_link(
xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
/*
* If the source has too many links, we can't make any more to it.
@@ -1601,7 +1607,7 @@ xfs_symlink(
* transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
unlock_dp_on_error = B_FALSE;
/*
@@ -1632,10 +1638,9 @@ xfs_symlink(
first_fsb = 0;
nmaps = SYMLINK_MAPS;
- error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
- XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
- &first_block, resblks, mval, &nmaps,
- &free_list);
+ error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
+ XFS_BMAPI_METADATA, &first_block, resblks,
+ mval, &nmaps, &free_list);
if (error)
goto error2;
@@ -1650,7 +1655,10 @@ xfs_symlink(
byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
BTOBB(byte_cnt), 0);
- ASSERT(!xfs_buf_geterror(bp));
+ if (!bp) {
+ error = ENOMEM;
+ goto error2;
+ }
if (pathlen < byte_cnt) {
byte_cnt = pathlen;
}
@@ -1732,7 +1740,7 @@ xfs_set_dmattrs(
return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
ip->i_d.di_dmevmask = evmask;
ip->i_d.di_dmstate = state;
@@ -1778,7 +1786,6 @@ xfs_alloc_file_space(
xfs_fileoff_t startoffset_fsb;
xfs_fsblock_t firstfsb;
int nimaps;
- int bmapi_flag;
int quota_flag;
int rt;
xfs_trans_t *tp;
@@ -1806,7 +1813,6 @@ xfs_alloc_file_space(
count = len;
imapp = &imaps[0];
nimaps = 1;
- bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
allocatesize_fsb = XFS_B_TO_FSB(mp, count);
@@ -1877,16 +1883,12 @@ xfs_alloc_file_space(
if (error)
goto error1;
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
- /*
- * Issue the xfs_bmapi() call to allocate the blocks
- */
xfs_bmap_init(&free_list, &firstfsb);
- error = xfs_bmapi(tp, ip, startoffset_fsb,
- allocatesize_fsb, bmapi_flag,
- &firstfsb, 0, imapp, &nimaps,
- &free_list);
+ error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+ allocatesize_fsb, alloc_type, &firstfsb,
+ 0, imapp, &nimaps, &free_list);
if (error) {
goto error0;
}
@@ -1976,8 +1978,7 @@ xfs_zero_remaining_bytes(
for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
offset_fsb = XFS_B_TO_FSBT(mp, offset);
nimap = 1;
- error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
- NULL, 0, &imap, &nimap, NULL);
+ error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
if (error || nimap < 1)
break;
ASSERT(imap.br_blockcount >= 1);
@@ -1997,8 +1998,8 @@ xfs_zero_remaining_bytes(
xfsbdstrat(mp, bp);
error = xfs_buf_iowait(bp);
if (error) {
- xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
- mp, bp, XFS_BUF_ADDR(bp));
+ xfs_buf_ioerror_alert(bp,
+ "xfs_zero_remaining_bytes(read)");
break;
}
memset(bp->b_addr +
@@ -2010,8 +2011,8 @@ xfs_zero_remaining_bytes(
xfsbdstrat(mp, bp);
error = xfs_buf_iowait(bp);
if (error) {
- xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
- mp, bp, XFS_BUF_ADDR(bp));
+ xfs_buf_ioerror_alert(bp,
+ "xfs_zero_remaining_bytes(write)");
break;
}
}
@@ -2076,7 +2077,7 @@ xfs_free_file_space(
if (need_iolock) {
xfs_ilock(ip, XFS_IOLOCK_EXCL);
/* wait for the completion of any pending DIOs */
- xfs_ioend_wait(ip);
+ inode_dio_wait(VFS_I(ip));
}
rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
@@ -2096,8 +2097,8 @@ xfs_free_file_space(
*/
if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
nimap = 1;
- error = xfs_bmapi(NULL, ip, startoffset_fsb,
- 1, 0, NULL, 0, &imap, &nimap, NULL);
+ error = xfs_bmapi_read(ip, startoffset_fsb, 1,
+ &imap, &nimap, 0);
if (error)
goto out_unlock_iolock;
ASSERT(nimap == 0 || nimap == 1);
@@ -2111,8 +2112,8 @@ xfs_free_file_space(
startoffset_fsb += mp->m_sb.sb_rextsize - mod;
}
nimap = 1;
- error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
- 1, 0, NULL, 0, &imap, &nimap, NULL);
+ error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
+ &imap, &nimap, 0);
if (error)
goto out_unlock_iolock;
ASSERT(nimap == 0 || nimap == 1);
@@ -2180,7 +2181,7 @@ xfs_free_file_space(
if (error)
goto error1;
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, 0);
/*
* issue the bunmapi() call to free the blocks
@@ -2353,8 +2354,7 @@ xfs_change_file_space(
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, ip);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
if ((attr_flags & XFS_ATTR_DMI) == 0) {
ip->i_d.di_mode &= ~S_ISUID;
@@ -2379,10 +2379,5 @@ xfs_change_file_space(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (attr_flags & XFS_ATTR_SYNC)
xfs_trans_set_sync(tp);
-
- error = xfs_trans_commit(tp, 0);
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- return error;
+ return xfs_trans_commit(tp, 0);
}